In [68]:
# Loading the data
import numpy as np
import pandas as pd

# The file is tab-separated; names= supplies the two column labels.
data = pd.read_csv(
    "dataset/SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
)

In [69]:
# Inspect the loaded frame: one row per SMS with its label (ham/spam) and raw text
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [70]:
# Count the missing values in each column
data.isna().sum()

label      0
message    0
dtype: int64

## For text cleaning / preprocessing
#### We can either clean the text manually, as we did for BOW and TF-IDF, or use the `simple_preprocess()` function provided by `gensim.utils`
 - `simple_preprocess(sentence)` ---> lowercases the text, removes words that are too short or too long, strips special characters and punctuation, and tokenizes the text (splits it into words) so the data is ready for training Word2Vec
 
#### Cons of simple_preprocess() :
 - it doesn't remove stop words
 - it doesn't lemmatize or stem

In [71]:
# Tokenise every message with gensim's simple_preprocess
from gensim.utils import simple_preprocess

# The function is passed directly; each message string becomes a list of tokens.
data["cleaned_message"] = data["message"].apply(simple_preprocess)

data

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, nd, time, we, have, tried, con..."
5568,ham,Will ü b going to esplanade fr home?,"[will, going, to, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, was, in, mood, for, that, so, any, othe..."
5570,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitching, but, acted, li..."


In [72]:
# Show the messages whose token list is empty after cleaning
for label, message, tokens in zip(data["label"], data["message"], data["cleaned_message"]):
    if not tokens:
        print(label, message, tokens)

ham 645 []
ham :)  []
ham G.W.R []
ham :-) :-) []
ham U 2. []


In [73]:
# Keep only the rows that still have at least one token after cleaning
data = data.loc[data["cleaned_message"].map(len) != 0]

In [74]:
# Renumber the rows after the drop so the index is contiguous again;
# drop=True discards the old index instead of keeping it as a column.
data = data.reset_index(drop=True)
data

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, don, think, he, goes, to, usf, he, lives..."
...,...,...,...
5562,spam,This is the 2nd time we have tried 2 contact u...,"[this, is, the, nd, time, we, have, tried, con..."
5563,ham,Will ü b going to esplanade fr home?,"[will, going, to, esplanade, fr, home]"
5564,ham,"Pity, * was in mood for that. So...any other s...","[pity, was, in, mood, for, that, so, any, othe..."
5565,ham,The guy did some bitching but I acted like i'd...,"[the, guy, did, some, bitching, but, acted, li..."


In [77]:
# Split the data into training and test sets

In [76]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data["cleaned_message"],
    data["label"],
    test_size=0.2,
    random_state=0,
)

In [78]:
# Tokenised training messages; the index keeps the row labels from `data`, hence the shuffled order
X_train

253                     [what, you, doing, how, are, you]
1114    [no, good, for, the, movie, is, it, ok, if, le...
4257                      [am, late, will, be, there, at]
3979                                       [ringtoneking]
1012      [just, got, home, babe, are, you, still, awake]
                              ...                        
4931              [hey, do, you, want, anything, to, buy]
3264    [will, send, them, to, your, email, do, you, m...
1653    [was, at, bugis, juz, now, wat, but, now, walk...
2607    [yeah, lol, luckily, didn, have, starring, rol...
2732    [how, dare, you, stupid, wont, tell, anything,...
Name: cleaned_message, Length: 4453, dtype: object

In [79]:
# Training labels, carrying the same index as X_train
Y_train

253      ham
1114     ham
4257     ham
3979    spam
1012     ham
        ... 
4931     ham
3264     ham
1653     ham
2607     ham
2732     ham
Name: label, Length: 4453, dtype: object

In [80]:
# Train Word2Vec on the training split only, so the embeddings never see test-set text

from gensim.models import Word2Vec

# Reproducibility: gensim trains with several worker threads by default, and the
# thread scheduling makes the learned vectors differ from run to run. A fixed
# seed together with workers=1 gives repeatable embeddings (across interpreter
# launches PYTHONHASHSEED must be fixed as well, per the gensim documentation).
model = Word2Vec(sentences=X_train, vector_size=100, seed=1, workers=1)

In [82]:
# Vocabulary size first, then the full vocabulary list (long output)
print(len(model.wv.index_to_key))
model.wv.index_to_key   # Vocabulary from high frequency to low frequency

1529


['you',
 'to',
 'the',
 'and',
 'is',
 'in',
 'me',
 'my',
 'it',
 'for',
 'your',
 'call',
 'of',
 'have',
 'that',
 'on',
 'are',
 'now',
 'can',
 'so',
 'not',
 'or',
 'but',
 'we',
 'at',
 'do',
 'get',
 'if',
 'with',
 'be',
 'ur',
 'no',
 'will',
 'just',
 'this',
 'gt',
 'lt',
 'how',
 'up',
 'free',
 'what',
 'ok',
 'when',
 'go',
 'all',
 'from',
 'out',
 'll',
 'like',
 'know',
 'good',
 'day',
 'then',
 'am',
 'its',
 'he',
 'was',
 'come',
 'love',
 'time',
 'there',
 'got',
 'send',
 'only',
 'txt',
 'want',
 'by',
 'text',
 'about',
 'need',
 'as',
 'stop',
 'home',
 'lor',
 'don',
 'going',
 'she',
 'see',
 'one',
 'sorry',
 'dont',
 'reply',
 'today',
 'still',
 'mobile',
 'back',
 'our',
 'tell',
 'take',
 'new',
 'da',
 'hi',
 'week',
 'later',
 'been',
 'any',
 'who',
 'please',
 'did',
 'think',
 'phone',
 'they',
 'oh',
 'dear',
 'where',
 'some',
 'her',
 'has',
 'pls',
 'hope',
 'great',
 're',
 'claim',
 'too',
 'here',
 'happy',
 'well',
 'hey',
 'night',
 'muc

In [83]:
# Number of sentences ("documents") the model was trained on; matches the length of X_train
model.corpus_count

4453

In [85]:
# Number of training epochs; no `epochs` argument was passed to Word2Vec, so this is the library default
model.epochs

5

In [86]:
# Function to get the average Word2Vec for a sentence

def avg_word2vec(tokens, model, vector_size):
    """Return the mean Word2Vec embedding of a tokenised sentence.

    Parameters
    ----------
    tokens : iterable of str
        The words of one sentence.
    model : object exposing ``wv``
        Trained model; ``model.wv[word]`` must return the vector of a known
        word and ``model.wv.key_to_index`` must contain every known word.
    vector_size : int
        Length of the zero vector returned when no token is in the vocabulary.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the in-vocabulary tokens, or
        ``np.zeros(vector_size)`` when none of the tokens is known.
    """
    # key_to_index is a dict, so each membership test is a hash lookup instead
    # of a linear scan through the index_to_key list for every token.
    known_words = model.wv.key_to_index
    vectors = [model.wv[word] for word in tokens if word in known_words]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

In [87]:
# Convert every tokenised sentence into one fixed-length vector

import numpy as np

# Read the dimensionality from the trained model rather than repeating the
# literal 100, so the zero-vector fallback cannot drift out of sync with it.
embedding_dim = model.wv.vector_size
X_train_WORD2VEC = np.array([avg_word2vec(sentence, model, embedding_dim) for sentence in X_train])
X_test_WORD2VEC = np.array([avg_word2vec(sentence, model, embedding_dim) for sentence in X_test])

In [88]:
# Last step: train a classifier on the averaged sentence vectors

from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and feature subsets at random; without
# a seed every re-run fits a different forest and reports different metrics.
# random_state=0 mirrors the value already used for train_test_split.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_WORD2VEC,Y_train)

In [90]:
# Predict labels for the held-out test vectors
Y_pred = rfc.predict(X_test_WORD2VEC)

In [89]:
# Now we will check the model's performance on the test set

In [91]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [92]:
# Fraction of test messages classified correctly; the test set is mostly ham,
# so read this together with the per-class report below.
accuracy_score(Y_test,Y_pred)

0.9685816876122083

In [93]:
# Per-class precision, recall and F1 on the test set
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.97      0.99      0.98       975
        spam       0.93      0.81      0.87       139

    accuracy                           0.97      1114
   macro avg       0.95      0.90      0.92      1114
weighted avg       0.97      0.97      0.97      1114



In [94]:
# Rows are the true classes and columns the predicted ones, both ordered ham, spam
# (each row sums to that class's support in the classification report).
print(confusion_matrix(Y_test,Y_pred))

[[966   9]
 [ 26 113]]
