In [1]:
import pandas as pd

messages = pd.read_csv('SMSSpamCollection.txt', 
                       sep='\t', names=["label", "message"])

# tab is seprate feature in data that divides input and output data

In [2]:
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [118]:
messages.shape

(5572, 2)

In [3]:
messages['message'].loc[451]

'hanks lotsly!'

In [8]:
messages['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

**Data cleaning and preprocessing**

In [9]:
import re
import nltk

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()            # Perform Stemming 

In [29]:
corpus = []

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    # Remove all the special character other than a-z
    
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) 
              for word in review 
              if not word in stopwords.words('english')]
    
    review = ' '.join(review)
    corpus.append(review)

In [74]:
# corpus

**Creating the Bag of Words model**

In [120]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2500, binary=True)
X = cv.fit_transform(corpus).toarray()

In [121]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [122]:
X.shape

(5572, 2500)

In [35]:
y = pd.get_dummies(messages['label'])
y = y.iloc[:,1].values

In [105]:
y.shape

(5572,)

**Applying Models**

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    random_state = 0)

In [52]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

In [53]:
y_pred = spam_detect_model.predict(X_test)

In [54]:
from sklearn.metrics import accuracy_score

score = accuracy_score(y_test, y_pred)
print(score)

0.9847533632286996


In [55]:
from sklearn.metrics import classification_report

report = classification_report(y_pred, y_test)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       972
           1       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



**Creating the TFIDF model**

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [57]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, 
                                                    test_size = 0.20, 
                                                    random_state = 0)

In [58]:
spam_detect_model2 = MultinomialNB()
spam_detect_model2.fit(X_train2, y_train2)

In [59]:
y_pred2 = spam_detect_model2.predict(X_test2)

In [60]:
score2 = accuracy_score(y_test2, y_pred2)
print(score2)

0.9847533632286996


In [61]:
report2 = classification_report(y_pred2, y_test2)
print(report2)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       972
           1       0.89      1.00      0.94       143

    accuracy                           0.98      1115
   macro avg       0.95      0.99      0.97      1115
weighted avg       0.99      0.98      0.99      1115



In [62]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(X_train2, y_train2)

In [63]:
y_pred3 = clf.predict(X_test2)

In [64]:
score3 = accuracy_score(y_test2, y_pred3)
print(score3)

0.9838565022421525


In [65]:
report3 = classification_report(y_pred3, y_test2)
print(report3)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       973
           1       0.89      1.00      0.94       142

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.97      1115
weighted avg       0.99      0.98      0.98      1115



**By applying Lemmtization**

In [66]:
from nltk.stem import WordNetLemmatizer

lemmatizer=WordNetLemmatizer()

In [139]:
len(messages)

5572

In [141]:
corpus2 = []

for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z0-9]', ' ', messages['message'][i])
    
    review = review.lower()
    review = review.split()
    
    review = [lemmatizer.lemmatize(word)
              for word in review 
              if not word in stopwords.words('english')]
    
    review = ' '.join(review)
    corpus2.append(review)

In [142]:
len(corpus2)

5572

**Applying Word2Vec**

In [157]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
# simple process fn convert a doc into a list of lowercase tokens

words=[]

for sent in corpus2:
    sent_token=sent_tokenize(sent)
    
    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [158]:
len(words)

5565

In [73]:
# words

In [107]:
import gensim
from gensim.models import Word2Vec

model = Word2Vec(words, window=5, min_count=0)
# Vector size = Dimension is 100 by default
# min count ignores all frequency lower than this

In [106]:
# model.wv.index_to_key

In [108]:
model.corpus_count
# total vocabalary size

5565

In [80]:
model.epochs

5

In [81]:
model.wv.similar_by_word('kid')

[('game', 0.9977527260780334),
 ('went', 0.9977244734764099),
 ('pobox', 0.9976734519004822),
 ('gud', 0.9976728558540344),
 ('last', 0.9976262450218201),
 ('keep', 0.9976058602333069),
 ('thing', 0.9975892901420593),
 ('getting', 0.9975748658180237),
 ('hope', 0.9975689649581909),
 ('name', 0.9975687861442566)]

In [82]:
model.wv['kid'].shape

(100,)

In [92]:
import numpy as np

def avg_word2vec(doc):
    # remove out-of-vocabulary words
    # sent = [word for word in doc if word in model.wv.index_to_key]
    
    return np.mean([model.wv[word] 
                    for word in doc 
                    if word in model.wv.index_to_key],
                   axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [87]:
# !pip install tqdm

In [88]:
from tqdm import tqdm

In [89]:
type(model.wv.index_to_key)

list

In [93]:
#apply for the entire sentences
X=[]

for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|████████████████████████████████████████████████████████████████████████████| 5565/5565 [00:01<00:00, 3122.82it/s]


In [98]:
X[0].shape

(100,)

In [99]:
words[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'great',
 'world',
 'la',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

In [100]:
words[1]

['ok', 'lar', 'joking', 'wif', 'oni']