In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df=pd.read_csv('SMSSpamCollection.txt',sep='\t')
df.columns=['out','text']
df.head()

Unnamed: 0,out,text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [3]:
df.shape

(5571, 2)

In [4]:
df['out'].value_counts()

ham     4824
spam     747
Name: out, dtype: int64

In [5]:
df['text'][4]

"FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"

#### Stopwords

In [6]:
from nltk.corpus import stopwords
stops=set(stopwords.words('english'))

In [7]:
len(stops)

179

#### Splitting into train &  test

In [8]:
x=df['text']
y=pd.get_dummies(df['out'])
print(x.shape)
print(y.shape)

(5571,)
(5571, 2)


In [9]:
y=y['spam']
print(y.shape)

(5571,)


In [10]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,stratify=y,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4456,)
(1115,)
(4456,)
(1115,)


#### Lemmatization

In [11]:
import re
lemme=WordNetLemmatizer()
def data_clean(df1):
    for val in range(df1.shape[0]):
        sen=re.sub('[^a-zA-Z]',' ',df1.iloc[val])
        sen=sen.lower()
        sen=sen.split(' ')
        sen=[lemme.lemmatize(word) for word in sen if word not in stops]
        sen= ' '.join(sen)
        df1.iloc[val]=sen

In [12]:
print(x_train.iloc[4])
print(x_test.iloc[4])

Sad story of a Man - Last week was my b'day. My Wife did'nt wish me. My Parents forgot n so did my Kids . I went to work. Even my Colleagues did not wish.
She ran off with a younger man. we will make pretty babies together :)


In [13]:
data_clean(x_train)
data_clean(x_test)

In [14]:
print(x_train.iloc[4])
print(x_test.iloc[4])

sad story man   last week b day  wife nt wish  parent forgot n kid   went work  even colleague wish 
ran younger man  make pretty baby together   


#### TF-IDF Vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tcv=TfidfVectorizer()
tcv.fit(x_train)
train_cv=tcv.transform(x_train)
test_cv=tcv.transform(x_test)
print(train_cv.shape)
print(test_cv.shape)

(4456, 6280)
(1115, 6280)


#### MultinomialNB

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(train_cv,y_train)
y_pred=mnb.predict(test_cv)

In [17]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print('accuracy score is :',accuracy_score(y_test,y_pred))

accuracy score is : 0.967713004484305


In [18]:
print('confusion matix is :')
print(confusion_matrix(y_test,y_pred))

confusion matix is :
[[965   0]
 [ 36 114]]


In [19]:
print('classification report is :')
print(classification_report(y_test,y_pred))

classification report is :
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



#### Word2Vec

In [27]:
from gensim.models import Word2Vec
df=pd.read_csv('SMSSpamCollection.txt',sep='\t')
df.columns=['out','text']
df.head()

Unnamed: 0,out,text
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [28]:
df.shape[0]
df.iloc[0]['text']

'Ok lar... Joking wif u oni...'

In [29]:
def data_clean(df1):
    for val in range(df1.shape[0]):
        #print(val)
        sen=re.sub('[^a-zA-Z]',' ',df1.iloc[val]['text'])
        sen=sen.lower()
        sen=sen.split(' ')
        sen=[lemme.lemmatize(word) for word in sen if word not in stops]
        sen= ' '.join(sen)
        df1.iloc[val]['text']=sen
data_clean(df)

In [32]:
df.iloc[4]['text']

'freemsg hey darling   week word back  like fun still  tb ok  xxx std chgs send        rcv'

In [33]:
word_list=[nltk.word_tokenize(sentance) for sentance in df['text']]
print(len(word_list))

5571


In [35]:
print(word_list[4])

['freemsg', 'hey', 'darling', 'week', 'word', 'back', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chgs', 'send', 'rcv']


In [36]:
model=Word2Vec(word_list,min_count=3)
words=model.wv.vocab
print(len(words))

2470


In [40]:
vec_sample=model.wv['week']
print(vec_sample)

[ 0.33519036 -0.45839778  0.8655996  -0.28990802  0.20685752  0.19269685
  0.57512045 -0.29716092  0.14535595 -0.18863939  0.22242785  0.5404329
  0.06255581 -0.03602088  0.05116342 -0.07249787  0.6256583   0.15871368
 -0.2862919  -0.6465394  -0.2533076   0.02585667  0.05994944  0.11455377
 -0.9113442  -1.1813171   0.00983375 -0.00242898 -0.2855233  -0.3686769
  0.43154114  0.0290256  -0.01203125 -0.5936302   0.21572141  0.02374408
  0.23793873 -0.4816312   0.18219727 -0.2182783   0.35126257 -0.07576518
 -1.0390087   0.38444653  0.1843243   0.1430826  -0.07108847 -0.05216493
  0.1820082   0.4533136   0.27389333 -0.531763    0.4119889  -0.05037279
  0.38287288  0.82040817  0.33378074 -0.4207115  -0.23192482  0.17601155
 -0.328246    0.22757782 -0.1322149   0.40884602  0.47947586  0.29094478
 -0.16668072 -0.3914928   0.10295797 -0.03924046 -0.6202428   0.19793683
 -0.02752241  0.07076204  0.46843195 -0.9118295   0.09748876 -0.08291233
  0.0336386   0.45683193  0.74879575  0.5498763   0.0

In [41]:
similar= model.wv.most_similar('week')
print(similar)

[('every', 0.9999087452888489), ('reply', 0.9999009966850281), ('win', 0.9998984336853027), ('txt', 0.9998963475227356), ('ur', 0.9998956322669983), ('text', 0.9998952746391296), ('great', 0.9998908638954163), ('new', 0.9998884797096252), ('b', 0.9998827576637268), ('msg', 0.9998789429664612)]
