In [1]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

In [2]:
import pandas as pd
messages=pd.read_csv('SMSSpamCollection',
                    sep='\t',names=["label","message"])

In [3]:
messages.shape

(5572, 2)

In [4]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [6]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hhaha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

In [7]:
#this code to know which string has 0 length or we can say that NAN value in corpus
[[i,j,k] for i,j,k in zip(list(map(len,corpus)),corpus, messages['message']) if i<1]

[[0, '', '645'], [0, '', ':) '], [0, '', ':-) :-)']]

In [10]:
corpus

['go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat',
 'ok lar joking wif u oni',
 'free entry in a wkly comp to win fa cup final tkts st may text fa to to receive entry question std txt rate t c s apply over s',
 'u dun say so early hor u c already then say',
 'nah i don t think he go to usf he life around here though',
 'freemsg hey there darling it s been week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send to rcv',
 'even my brother is not like to speak with me they treat me like aid patent',
 'a per your request melle melle oru minnaminunginte nurungu vettam ha been set a your callertune for all caller press to copy your friend callertune',
 'winner a a valued network customer you have been selected to receivea prize reward to claim call claim code kl valid hour only',
 'had your mobile month or more u r entitled to update to the latest colour mobile with camera for free call the mobile up

In [12]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [15]:
#simple_preprocess:- Convert a document into a list of lowercase tokens, 
#                    ignoring tokens that are too short or too long.
words=[]
for sent in corpus:
    sent_token=sent_tokenize(sent)# here we get all sentence in a list such that they are seperated by "."
    for sent in sent_token:# all sentences in that list.
        words.append(simple_preprocess(sent))


In [49]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [17]:
import gensim

In [18]:
## Lets train Word2vec from scratch
model=gensim.models.Word2Vec(words,window=5,min_count=2,workers=4)

In [19]:
## To Get All the Vocabulary in Word2 vec model
model.wv.index_to_key

['to',
 'you',
 'the',
 'and',
 'it',
 'in',
 'is',
 'me',
 'my',
 'for',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'can',
 'so',
 'but',
 'not',
 'or',
 'we',
 'do',
 'get',
 'at',
 'ur',
 'will',
 'if',
 'be',
 'with',
 'no',
 'just',
 'this',
 'gt',
 'lt',
 'go',
 'how',
 'up',
 'when',
 'ok',
 'day',
 'what',
 'free',
 'from',
 'all',
 'out',
 'know',
 'll',
 'come',
 'like',
 'good',
 'time',
 'am',
 'then',
 'got',
 'wa',
 'there',
 'he',
 'love',
 'text',
 'only',
 'want',
 'send',
 'one',
 'need',
 'txt',
 'today',
 'by',
 'going',
 'don',
 'stop',
 'she',
 'home',
 'about',
 'lor',
 'sorry',
 'see',
 'still',
 'mobile',
 'take',
 'back',
 'da',
 'reply',
 'dont',
 'our',
 'think',
 'tell',
 'week',
 'phone',
 'hi',
 'they',
 'new',
 'later',
 'please',
 'pls',
 'any',
 'ha',
 'her',
 'did',
 'co',
 'been',
 'msg',
 'min',
 'an',
 'some',
 'dear',
 'night',
 'make',
 'who',
 'here',
 'message',
 'say',
 'well',
 'where',
 're',
 'thing',
 'much',
 'clai

In [20]:
model.corpus_count

5569

In [21]:
model.epochs

5

In [22]:
model.wv.similar_by_word('good')

[('my', 0.9994509816169739),
 ('day', 0.9993947148323059),
 ('all', 0.9992988705635071),
 ('morning', 0.999220609664917),
 ('hope', 0.9992102980613708),
 ('wa', 0.9991809725761414),
 ('night', 0.9991713762283325),
 ('about', 0.9991674423217773),
 ('and', 0.9991400837898254),
 ('amp', 0.9991378784179688)]

In [23]:
model.wv['good'].shape

(100,)

In [24]:
words[0]

['go',
 'until',
 'jurong',
 'point',
 'crazy',
 'available',
 'only',
 'in',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'there',
 'got',
 'amore',
 'wat']

In [56]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [57]:
from tqdm import tqdm
#tqdm :- Instantly make your loops show a smart progress meter - 
#        just wrap any iterable with tqdm(iterable), and you’re done!

In [58]:
#apply for the entire sentences

import numpy as np
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|████████████████████████████████████████████████████████████████████████████| 5569/5569 [00:01<00:00, 4806.13it/s]


In [59]:
len(X)

5569

In [60]:
X[1]

array([-0.12829514,  0.19092904,  0.07852744,  0.04536315,  0.08487733,
       -0.35635203,  0.14398429,  0.40741682, -0.20512421, -0.20818219,
       -0.09023391, -0.32872987,  0.01782176,  0.13869062,  0.09791772,
       -0.18502487,  0.07013766, -0.28567752, -0.10202445, -0.43084645,
        0.12394093,  0.15871993,  0.14658862, -0.2232677 , -0.03273162,
        0.02614798, -0.17595282, -0.1864786 , -0.22298296,  0.01621116,
        0.22731502, -0.03059172,  0.11119562, -0.19874506, -0.05726056,
        0.28099075,  0.06578591, -0.05786747, -0.16621853, -0.40156168,
        0.07045905, -0.1755878 , -0.17366788,  0.0701832 ,  0.2117463 ,
       -0.09234871, -0.1717765 , -0.06978481,  0.09220715,  0.1024362 ,
        0.12375444, -0.27128023, -0.1046806 ,  0.03904859, -0.18398544,
        0.12246195,  0.08789412,  0.01233926, -0.24213317,  0.10757673,
        0.10519037,  0.10341638, -0.00762066, -0.0254934 , -0.20858245,
        0.24779482,  0.10799816,  0.21375409, -0.29794487,  0.22

In [61]:
##independent Features
X_new=np.array(X)

  X_new=np.array(X)


In [62]:
messages.shape

(5572, 2)

In [63]:
X_new.shape

(5569,)

In [137]:
X_new[0]

array([-0.16850011,  0.2706835 ,  0.1127236 ,  0.06389467,  0.11423732,
       -0.50906414,  0.22208868,  0.5916308 , -0.29796046, -0.3006236 ,
       -0.13681762, -0.47927415,  0.01852698,  0.19881897,  0.13434033,
       -0.2639121 ,  0.08711408, -0.39859787, -0.14188477, -0.614002  ,
        0.1886918 ,  0.22607137,  0.21864079, -0.32865483, -0.06127972,
        0.04215064, -0.25364536, -0.2649125 , -0.3215748 ,  0.02667103,
        0.33336347, -0.05130834,  0.16649929, -0.2866144 , -0.08437752,
        0.4102761 ,  0.10711406, -0.07887978, -0.24320619, -0.57726467,
        0.10160967, -0.25044665, -0.25110492,  0.10183042,  0.3165902 ,
       -0.13436203, -0.24585097, -0.0978977 ,  0.13220094,  0.15685257,
        0.18789622, -0.38930428, -0.15380895,  0.04943855, -0.27097934,
        0.17662224,  0.1323218 ,  0.02510331, -0.34585157,  0.14500946,
        0.13546531,  0.14433077, -0.01126382, -0.03833937, -0.30217192,
        0.3560714 ,  0.15486415,  0.31000686, -0.42880335,  0.31

In [65]:
X_new.shape

(5569,)

In [66]:
## Dependent Features
## Output Features
# here we also removing all three dependent variable for which independent variable have Null value
y = messages[list(map(lambda x: len(x)>0 ,corpus))]
y=pd.get_dummies(y['label'])
y=y.iloc[:,0].values

In [67]:
y.shape

(5569,)

In [68]:
# we need to flatten the 
X_new[0].reshape(-1,1)

array([[-0.16850011],
       [ 0.2706835 ],
       [ 0.1127236 ],
       [ 0.06389467],
       [ 0.11423732],
       [-0.50906414],
       [ 0.22208868],
       [ 0.5916308 ],
       [-0.29796046],
       [-0.3006236 ],
       [-0.13681762],
       [-0.47927415],
       [ 0.01852698],
       [ 0.19881897],
       [ 0.13434033],
       [-0.2639121 ],
       [ 0.08711408],
       [-0.39859787],
       [-0.14188477],
       [-0.614002  ],
       [ 0.1886918 ],
       [ 0.22607137],
       [ 0.21864079],
       [-0.32865483],
       [-0.06127972],
       [ 0.04215064],
       [-0.25364536],
       [-0.2649125 ],
       [-0.3215748 ],
       [ 0.02667103],
       [ 0.33336347],
       [-0.05130834],
       [ 0.16649929],
       [-0.2866144 ],
       [-0.08437752],
       [ 0.4102761 ],
       [ 0.10711406],
       [-0.07887978],
       [-0.24320619],
       [-0.57726467],
       [ 0.10160967],
       [-0.25044665],
       [-0.25110492],
       [ 0.10183042],
       [ 0.3165902 ],
       [-0

In [69]:
## this is the final independent features
df=pd.DataFrame()
for i in range(0,len(X)):
    df=df.append(pd.DataFrame(X[i].reshape(1,-1)),ignore_index=True)
    

  df=df.append(pd.DataFrame(X[i].reshape(1,-1)),ignore_index=True)


In [70]:
df.shape

(5569, 100)

In [71]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.1685,0.270683,0.112724,0.063895,0.114237,-0.509064,0.222089,0.591631,-0.29796,-0.300624,...,0.480569,0.137898,0.061389,0.153819,0.507263,0.242,0.143792,-0.358741,0.2703,0.040679
1,-0.128295,0.190929,0.078527,0.045363,0.084877,-0.356352,0.143984,0.407417,-0.205124,-0.208182,...,0.341986,0.088203,0.035493,0.108129,0.340669,0.16419,0.09989,-0.261039,0.184781,0.029516
2,-0.168786,0.264869,0.136456,0.09311,0.125868,-0.539656,0.225251,0.595742,-0.311613,-0.301772,...,0.474239,0.145106,0.067457,0.152407,0.501217,0.228789,0.112711,-0.372355,0.283051,0.049656
3,-0.218331,0.343764,0.1368,0.085236,0.141355,-0.640032,0.266081,0.738259,-0.369571,-0.373513,...,0.600216,0.167328,0.071148,0.200202,0.622903,0.308717,0.186361,-0.453743,0.33421,0.048481
4,-0.221917,0.332751,0.140593,0.080464,0.146616,-0.636888,0.268705,0.736671,-0.37456,-0.375563,...,0.603641,0.161477,0.077036,0.193744,0.618934,0.310355,0.174929,-0.469221,0.337919,0.048241


In [72]:
df['Output']=y

In [73]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Output
0,-0.1685,0.270683,0.112724,0.063895,0.114237,-0.509064,0.222089,0.591631,-0.29796,-0.300624,...,0.137898,0.061389,0.153819,0.507263,0.242,0.143792,-0.358741,0.2703,0.040679,1
1,-0.128295,0.190929,0.078527,0.045363,0.084877,-0.356352,0.143984,0.407417,-0.205124,-0.208182,...,0.088203,0.035493,0.108129,0.340669,0.16419,0.09989,-0.261039,0.184781,0.029516,1
2,-0.168786,0.264869,0.136456,0.09311,0.125868,-0.539656,0.225251,0.595742,-0.311613,-0.301772,...,0.145106,0.067457,0.152407,0.501217,0.228789,0.112711,-0.372355,0.283051,0.049656,0
3,-0.218331,0.343764,0.1368,0.085236,0.141355,-0.640032,0.266081,0.738259,-0.369571,-0.373513,...,0.167328,0.071148,0.200202,0.622903,0.308717,0.186361,-0.453743,0.33421,0.048481,1
4,-0.221917,0.332751,0.140593,0.080464,0.146616,-0.636888,0.268705,0.736671,-0.37456,-0.375563,...,0.161477,0.077036,0.193744,0.618934,0.310355,0.174929,-0.469221,0.337919,0.048241,1


In [74]:
df.dropna(inplace=True)

In [75]:
df.isnull().sum()

0         0
1         0
2         0
3         0
4         0
         ..
96        0
97        0
98        0
99        0
Output    0
Length: 101, dtype: int64

In [115]:
## Independent Feature
X=df.drop("Output",axis=1)

In [116]:
X.isnull().sum()

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Length: 100, dtype: int64

In [117]:
y=df['Output']


In [118]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

In [119]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
4607,-0.112216,0.504572,0.130053,0.043225,0.01881,-0.768172,0.506098,0.968983,-0.415897,-0.550948,...,0.674788,0.264553,0.217445,0.295709,0.933618,0.378292,0.371456,-0.33361,0.363045,-0.072836
2415,-0.184342,0.295875,0.116839,0.062754,0.119897,-0.546069,0.236836,0.640165,-0.316623,-0.327331,...,0.519539,0.149484,0.069038,0.169011,0.548026,0.270849,0.16571,-0.388408,0.288893,0.036979
3542,-0.171883,0.250359,0.112885,0.066136,0.115358,-0.485969,0.202038,0.56201,-0.291887,-0.278417,...,0.456896,0.131301,0.052593,0.142165,0.476017,0.230074,0.129859,-0.354571,0.271466,0.043042
4833,-0.222232,0.343356,0.154713,0.089755,0.153574,-0.654192,0.280663,0.74898,-0.381266,-0.37551,...,0.608881,0.17476,0.078432,0.195654,0.637426,0.308681,0.171212,-0.462237,0.35063,0.054152
551,-0.218601,0.326675,0.142106,0.07728,0.150989,-0.622761,0.263996,0.723574,-0.367437,-0.372962,...,0.589683,0.155825,0.072314,0.181416,0.604587,0.30105,0.169644,-0.457236,0.335766,0.052971


In [120]:
y_train

4607    1
2415    1
3542    1
4833    1
551     1
       ..
4298    1
3760    1
2683    1
929     1
865     1
Name: Output, Length: 4450, dtype: uint8

In [121]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()

In [122]:
classifier.fit(X_train,y_train)

RandomForestClassifier()

In [123]:
y_pred=classifier.predict(X_test)

In [124]:
X_test[0]

3549   -0.200091
3953   -0.228570
3829   -0.200500
3413   -0.228143
1135   -0.182004
          ...   
5439   -0.258486
1308   -0.169339
4511   -0.234462
1637   -0.224203
154    -0.154536
Name: 0, Length: 1113, dtype: float32

In [125]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,y_pred))

0.9649595687331537


In [126]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.84      0.88       175
           1       0.97      0.99      0.98       938

    accuracy                           0.96      1113
   macro avg       0.95      0.91      0.93      1113
weighted avg       0.96      0.96      0.96      1113



In [135]:
def converting_corpus(text):
    corpus = []
    for i in range(0, len(messages)):
        review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
        review = review.lower()
        review = review.split()
    
        review = [lemmatizer.lemmatize(word) for word in review]
        review = ' '.join(review)
        corpus.append(review)
    return corpus

In [133]:
def out_put(text):
    wor=[]
    sent_token=sent_tokenize(text)# here we get all sentence in a list such that they are seperated by "."
    for sent in sent_token:# all sentences in that list.
        wor.append(simple_preprocess(sent))
    pre_vec=np.mean([model.wv[word] for word in wor[0] if word in model.wv.index_to_key],axis=0)
    y_pr=classifier.predict(pre_vec.reshape(1,-1))
    if y_pr==0:
        print("Spam")
    else:
        print("Not spam")

In [136]:
input_=input("Enter the text: ")
print()
out_put(input_)

Enter the text: thanks for your subscription to ringtone uk your mobile will be charged month please confirm by replying yes or no if you reply no you will not be charged

Spam
