In [51]:
# Loading the data
import numpy as np
import pandas as pd

data = pd.read_csv("dataset/SMSSpamCollection",delimiter="\t",names=["label","message"])

In [52]:
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [53]:
#  checking null values
data.isnull().sum()

label      0
message    0
dtype: int64

## For text cleaning / preprocessing
#### either we can do manually that we did in BOW,TFIDF or we can use the function simple_preprocess() provides by gensim.utils
 - simple_preprocess(sentence) ---> lowers the case , removes too short or too long words ,Removing special characters and punctuation , and makes data ready for training Word2Vec (Tokenizing the text -splitting into words)
 
#### Cons of simple_preprocess() :
 - it doesn't removes stopwords
 - it doesn't lemmatize or stem

In [54]:
# # Data cleaning/Preprocessing with gensim 
# from gensim.utils import simple_preprocess

# data["cleaned_message"] = data["message"].apply(lambda x : simple_preprocess(x))


# text cleaning (removing special character(including digits),stopwords and doing lemmatization )
import re
import nltk
# nltk.download('stopwords') #already downloaded

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer # Using Lemmataization 

wnl=WordNetLemmatizer()

def clean(sentence):
    words = sentence.lower()
    words = re.sub('[^a-z]'," ",words)
    words = words.split()
    words = [wnl.lemmatize(word) for word in words if word not in stopwords.words("english") ] # removing stopwords
#     words = [wnl.lemmatize(word) for word in words ] # not removing stopwords
    return words

data["cleaned_message"] = data["message"].apply(lambda x : clean(x))
    


In [55]:
data

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, go, usf, life, around, though]"
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,ham,Will ü b going to esplanade fr home?,"[b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, mood, suggestion]"
5570,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, interested, buyin..."


In [56]:
# After Cleaning these data is turning into empty message 
for i in range(len(data)):
    if len(data["cleaned_message"][i])<1:
        print(data["label"][i],data["message"][i],data["cleaned_message"][i])

ham What you doing?how are you? []
ham Where @ []
ham 645 []
ham Can a not? []
ham :)  []
ham What you doing?how are you? []
ham :( but your not here.... []
ham :-) :-) []


In [57]:
# dropping empty message
data = data[data["cleaned_message"].apply(len) !=0]

In [58]:
data = data.reset_index(drop=True)
data

Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, go, usf, life, around, though]"
...,...,...,...
5559,spam,This is the 2nd time we have tried 2 contact u...,"[nd, time, tried, contact, u, u, pound, prize,..."
5560,ham,Will ü b going to esplanade fr home?,"[b, going, esplanade, fr, home]"
5561,ham,"Pity, * was in mood for that. So...any other s...","[pity, mood, suggestion]"
5562,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, interested, buyin..."


In [59]:
# splitting data into train test split

In [60]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(data["cleaned_message"],data["label"],test_size=0.2,random_state=0)

In [61]:
X_train

5284    [urgent, please, call, landline, cash, luxury,...
3090    [staff, science, nu, edu, sg, phyhcmk, teachin...
1012     [buy, pizza, meat, lover, supreme, u, get, pick]
2986    [polyphonic, tone, ur, mob, every, week, txt, ...
4126    [ur, luck, love, someone, ur, fortune, love, o...
                              ...                        
4931    [cancelled, yeah, baby, well, sound, important...
3264                        [ok, come, n, pick, u, engin]
1653                                         [go, ok, na]
2607                           [awesome, text, restocked]
2732                           [mag, meeting, avo, point]
Name: cleaned_message, Length: 4451, dtype: object

In [62]:
Y_train

5284    spam
3090     ham
1012     ham
2986    spam
4126     ham
        ... 
4931     ham
3264     ham
1653     ham
2607     ham
2732     ham
Name: label, Length: 4451, dtype: object

In [63]:
# training word2vec on our training dataset

from gensim.models import Word2Vec
model = Word2Vec(sentences=X_train,vector_size=100)

In [64]:
print(len(model.wv.index_to_key))
model.wv.index_to_key   # Vocabulary from high frequency to low frequency

1401


['u',
 'call',
 'get',
 'ur',
 'gt',
 'lt',
 'go',
 'ok',
 'free',
 'day',
 'like',
 'know',
 'time',
 'good',
 'come',
 'got',
 'text',
 'love',
 'want',
 'send',
 'p',
 'need',
 'txt',
 'r',
 'n',
 'one',
 'home',
 'stop',
 'going',
 'c',
 'today',
 'k',
 'see',
 'take',
 'sorry',
 'back',
 'lor',
 'still',
 'mobile',
 'reply',
 'new',
 'dont',
 'hi',
 'phone',
 'week',
 'da',
 'think',
 'msg',
 'tell',
 'later',
 'min',
 'dear',
 'please',
 'co',
 'oh',
 'message',
 'well',
 'make',
 'say',
 'thing',
 'night',
 'pls',
 'claim',
 'give',
 'hope',
 'wat',
 'great',
 'much',
 'b',
 'friend',
 'hey',
 'www',
 'work',
 'prize',
 'way',
 'let',
 'happy',
 'e',
 'right',
 'yes',
 'number',
 'said',
 'already',
 'meet',
 'tone',
 'ask',
 'life',
 'win',
 'tomorrow',
 'im',
 'amp',
 'really',
 'cash',
 'com',
 'miss',
 'yeah',
 'babe',
 'would',
 'year',
 'last',
 'care',
 'morning',
 'uk',
 'every',
 'nokia',
 'lol',
 'service',
 'find',
 'feel',
 'thanks',
 'x',
 'anything',
 'keep',
 'pic

In [65]:
model.corpus_count #gives the number of sentences (or "documents") that the model has been trained on

4451

In [66]:
model.epochs

5

In [67]:
# Function to get the average Word2Vec for a sentence

def avg_word2vec(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv.index_to_key]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

In [68]:
# Converting sentence into Vector

import numpy as np
X_train_WORD2VEC = np.array([avg_word2vec(sentence, model, 100) for sentence in X_train])
X_test_WORD2VEC = np.array([avg_word2vec(sentence, model, 100) for sentence in X_test])

In [69]:
# Now last step , we will model our dataset

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_WORD2VEC,Y_train)

In [70]:
Y_pred = rfc.predict(X_test_WORD2VEC)

In [71]:
# now we will check our model performance

In [72]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [73]:
accuracy_score(Y_test,Y_pred)

0.963162623539982

In [74]:
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       962
        spam       0.97      0.75      0.85       151

    accuracy                           0.96      1113
   macro avg       0.97      0.87      0.91      1113
weighted avg       0.96      0.96      0.96      1113



In [75]:
print(confusion_matrix(Y_test,Y_pred))

[[959   3]
 [ 38 113]]
