In [None]:

import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

DATASET


In [None]:
# Location of the labelled spam/ham e-mail CSV (Colab sample_data folder).
DATA_PATH = "/content/sample_data/spam_ham_dataset.csv"

# Read the raw data and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


PROCESSING DATASET

In [None]:
# Keep only the message text and its class label, exposing the text
# column under the name 'messages'; the 'label' column keeps its name.
df = df[['text', 'label']].rename(columns={'text': 'messages'})
df.head()


Unnamed: 0,messages,label
0,Subject: enron methanol ; meter # : 988291\r\n...,ham
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham
3,"Subject: photoshop , windows , office . cheap ...",spam
4,Subject: re : indian springs\r\nthis deal is t...,ham


In [None]:
# (rows, columns) of the two-column frame.
df.shape

(5171, 2)

In [None]:
# Column labels of the frame; expected to be 'messages' and 'label' after the rename.
df.columns

Index(['messages', 'label'], dtype='object')

In [None]:
# Drop rows that repeat an identical (messages, label) pair; the original
# index labels of the surviving rows are kept.
df = df.drop_duplicates()

In [None]:
# Count missing values per column. The call parentheses matter: without them
# the cell only displays the bound method object instead of the counts.
df.isnull().sum()

<bound method NDFrame._add_numeric_operations.<locals>.sum of       messages  label
0        False  False
1        False  False
2        False  False
3        False  False
4        False  False
...        ...    ...
5165     False  False
5166     False  False
5167     False  False
5169     False  False
5170     False  False

[4993 rows x 2 columns]>

In [None]:
# Download the NLTK 'stopwords' package that stopwords.words('english') reads from.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# English stopwords held in a set, so the `word not in STOPWORDS` test in clean_text is a hash lookup.
STOPWORDS = set(stopwords.words('english'))
def clean_text(text, stop_words=None):
    """Normalise a raw message into a space-separated string of kept tokens.

    The text is lowercased, every character that is not an ASCII letter or
    digit is turned into a space, the result is split on whitespace, and
    tokens found in the stopword collection are dropped.

    Parameters
    ----------
    text : str
        Raw message. Any non-string value (for example ``None`` or the float
        NaN pandas uses for a missing cell) is treated as an empty message
        instead of raising ``AttributeError``.
    stop_words : collection of str, optional
        Tokens to remove; entries are expected in lowercase because the text
        is lowercased before matching. Defaults to the module-level
        ``STOPWORDS``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``""`` if none remain.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = STOPWORDS

    # convert to lowercase
    text = text.lower()
    # replace special characters (anything but ASCII letters/digits) with spaces
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text)
    # split() with no argument already collapses runs of whitespace, so no
    # separate "remove extra spaces" pass is needed before filtering stopwords
    return " ".join(word for word in text.split() if word not in stop_words)

In [None]:
# Run every raw message through clean_text and store the result next to it.
cleaned_messages = df['messages'].apply(clean_text)
df['clean_text'] = cleaned_messages
df.head()

Unnamed: 0,messages,label,clean_text
0,Subject: enron methanol ; meter # : 988291\r\n...,ham,subject enron methanol meter 988291 follow not...
1,"Subject: hpl nom for january 9 , 2001\r\n( see...",ham,subject hpl nom january 9 2001 see attached fi...
2,"Subject: neon retreat\r\nho ho ho , we ' re ar...",ham,subject neon retreat ho ho ho around wonderful...
3,"Subject: photoshop , windows , office . cheap ...",spam,subject photoshop windows office cheap main tr...
4,Subject: re : indian springs\r\nthis deal is t...,ham,subject indian springs deal book teco pvr reve...


TRAINING MODEL

In [None]:
# Model input is the cleaned text; the target is the 'label' column.
X, y = df['clean_text'], df['label']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Shapes of the full feature Series, then the train and test partitions.
for partition in (X, X_train, X_test):
    print(partition.shape)

(4993,)
(3994,)
(999,)


In [None]:
# TF-IDF vectoriser; it is fitted on the training texts only, and the fitted
# vectoriser is then reused to transform the held-out texts.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# Y_train / Y_test are left untouched here and passed to the classifier as-is.

In [None]:
# Inspect the cleaned training messages.
print(X_train)

2493                      subject learn make fortune ebay
2141    subject rate tenaska deal daren thanks certain...
2143    subject italian rolex throw away prices derm s...
2246    subject see picture hi lucy saw profile online...
1275    subject 12 th nom going back 70 midcon forward...
                              ...                        
796     subject impress hard erection midband good mor...
978     subject central power light record time may do...
1696    subject like computers incredible offers windo...
3415    subject new product cialis soft tabs hi new pr...
1718    subject midtex daren need service list contrac...
Name: clean_text, Length: 3994, dtype: object


In [None]:
# Inspect the vectorised training data produced by the TF-IDF step.
print(X_train_features)

  (0, 15770)	0.6125495870748028
  (0, 18578)	0.557645693321033
  (0, 26263)	0.2982705338173961
  (0, 25088)	0.4662578469095977
  (0, 38047)	0.08632840173723072
  (1, 43387)	0.10141428929538886
  (1, 27782)	0.07925608466360945
  (1, 41226)	0.08085483714173256
  (1, 10409)	0.08328509414403364
  (1, 37620)	0.05568341490694429
  (1, 42763)	0.04824964708670679
  (1, 34275)	0.09389074838642082
  (1, 22459)	0.12700509314480926
  (1, 39395)	0.08749672782163612
  (1, 27032)	0.06331981293615545
  (1, 20335)	0.06465878635575921
  (1, 5105)	0.05174219948255761
  (1, 26326)	0.050773548155027985
  (1, 33805)	0.05428601384065556
  (1, 27119)	0.07255236234542738
  (1, 31531)	0.03246039995550646
  (1, 1978)	0.06445850957469548
  (1, 86)	0.040742036432724726
  (1, 5100)	0.05618617452754515
  (1, 32658)	0.08421270598635276
  :	:
  (3993, 27273)	0.25436094139167703
  (3993, 5166)	0.15178607374365374
  (3993, 12317)	0.22027564214373796
  (3993, 11284)	0.15603458017574018
  (3993, 36049)	0.12633020767117176

In [None]:
# Logistic-regression classifier constructed with no arguments (library defaults).
model = LogisticRegression()

In [None]:
# Fit the classifier on the TF-IDF training features and the training labels.
model.fit(X_train_features, Y_train)

In [None]:
# Predict on the same rows the model was fitted on and measure how many match.
prediction_on_training_data = model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the accuracy computed on the training split.
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9962443665498247


In [None]:
# Predict on the held-out rows and measure how many match the true labels.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the accuracy computed on the held-out test split.
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.987987987987988


PREDICTIVE SYSTEM

In [None]:
input_mail = ["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times"]

# Apply the same clean_text step that was applied to the training messages,
# then convert the cleaned text to feature vectors with the fitted vectoriser.
cleaned_input_mail = [clean_text(mail) for mail in input_mail]
input_data_features = feature_extraction.transform(cleaned_input_mail)

# making prediction
prediction = model.predict(input_data_features)
print(prediction)

# The model was fitted on the string labels 'ham' / 'spam', so predictions are
# strings. Comparing against the integer 1 is never true and made this cell
# print 'Spam mail' for every input, whatever the model predicted.
if prediction[0] == 'ham':
    print('Ham mail')
else:
    print('Spam mail')

['spam']
Spam mail
