In [1]:
import pandas as pd
dataset = pd.read_csv(
    'sms+spam+collection\SMSSpamCollection', 
    sep='\t',
    names=["label", "message"]
    )
dataset

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
dataset['message'].loc[100]

"Please don't text me anymore. I have nothing else to say."

In [2]:
messages = dataset['message']

In [3]:
import string
import nltk

In [4]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [5]:
ss = SnowballStemmer('english')
stop_words_EN = set(stopwords.words('english'))

In [6]:
corpus = []
for i in range(len(messages)):
    messages[i] = messages[i].lower().translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(messages[i])
    words = [ss.stem(word) for word in words if word not in stop_words_EN]
    messages[i] = ' '.join(words)
    corpus.append(messages[i])

In [20]:
messages[100]

'pleas dont text anymor noth els say'

In [26]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkts 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say ear hor u c alreadi say',
 'nah dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chgs send £150 rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner valu network custom select receivea £900 prize reward claim call 09061701461 claim code kl341 valid 12 hour',
 'mobil 11 month u r entitl updat latest colour mobil camera free call mobil updat co free 08002986030',
 'im gon na home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc appli repli hl 4 info',
 'urg

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [8]:
y=pd.get_dummies(dataset['label'])
y

Unnamed: 0,ham,spam
0,True,False
1,True,False
2,False,True
3,True,False
4,True,False
...,...,...
5567,False,True
5568,True,False
5569,True,False
5570,True,False


In [9]:
#Getting Spam values
y=y.iloc[:,1].values
y

array([False, False,  True, ..., False, False, False])

# BOW Model Implementation

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [31]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [32]:
#prediction
y_pred=spam_detect_model.predict(X_test)

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

       False       0.99      0.99      0.99       953
        True       0.95      0.94      0.94       162

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.98      0.98      0.98      1115



# TF-IDF Model implementation

In [34]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [39]:
spam_detect_model_tf_idf = MultinomialNB().fit(X_train, y_train)

In [40]:
#TF-IDF prediction
y_pred_tf_idf = spam_detect_model_tf_idf.predict(X_test)

In [41]:
print(classification_report(y_pred_tf_idf,y_test))

              precision    recall  f1-score   support

       False       1.00      0.98      0.99       973
        True       0.88      0.99      0.93       142

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115



# Word2vec Implementation

In [10]:
!pip install gensim



In [11]:
import gensim   

In [12]:
from gensim.models import Word2Vec, KeyedVectors

In [13]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

In [14]:
messages = dataset['message']

In [15]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [16]:
corpus_w2v = []
for i in range(len(messages)):
    messages[i] = messages[i].lower().translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(messages[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words_EN]
    messages[i] = ' '.join(words)
    corpus_w2v.append(messages[i])

In [17]:
corpus_w2v[100]

'plea dont text anymor noth el say'

In [21]:
from gensim.utils import simple_preprocess
from nltk import sent_tokenize

In [19]:
help(simple_preprocess)

Help on function simple_preprocess in module gensim.utils:

simple_preprocess(doc, deacc=False, min_len=2, max_len=15)
    Convert a document into a list of lowercase tokens, ignoring tokens that are too short or too long.
    
    Uses :func:`~gensim.utils.tokenize` internally.
    
    Parameters
    ----------
    doc : str
        Input document.
    deacc : bool, optional
        Remove accent marks from tokens using :func:`~gensim.utils.deaccent`?
    min_len : int, optional
        Minimum length of token (inclusive). Shorter tokens are discarded.
    max_len : int, optional
        Maximum length of token in result (inclusive). Longer tokens are discarded.
    
    Returns
    -------
    list of str
        Tokens extracted from `doc`.



In [22]:
sentences=[]
for sent in corpus_w2v:
    sent_token=sent_tokenize(sent)
    for sent in sent_token:
        sentences.append(simple_preprocess(sent))

In [23]:
sentences

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'great',
  'world',
  'la',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'oni'],
 ['free',
  'entri',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkts',
  'st',
  'may',
  'text',
  'fa',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  'over'],
 ['dun', 'say', 'earli', 'hor', 'alreadi', 'say'],
 ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chgs',
  'send',
  'rcv'],
 ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'],
 ['per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'caller',
  'press',
  'copi',
  'friend',
  'callertun'],
 ['winner',
  'valu',
  'network',
  'custom',
  'select',
  'receivea',
 

In [25]:
### Lets train Word2vec from scratch
model=gensim.models.Word2Vec(sentences,window=5,min_count=2)
model.wv.index_to_key

['call',
 'im',
 'go',
 'get',
 'ur',
 'come',
 'dont',
 'free',
 'ok',
 'ltgt',
 'know',
 'day',
 'like',
 'got',
 'love',
 'want',
 'ill',
 'time',
 'good',
 'text',
 'send',
 'need',
 'one',
 'see',
 'today',
 'think',
 'txt',
 'stop',
 'home',
 'week',
 'take',
 'lor',
 'repli',
 'sorri',
 'tell',
 'still',
 'mobil',
 'back',
 'make',
 'phone',
 'say',
 'new',
 'work',
 'hi',
 'plea',
 'well',
 'later',
 'da',
 'ask',
 'miss',
 'hope',
 'cant',
 'meet',
 'happi',
 'night',
 'give',
 'tri',
 'dear',
 'claim',
 'thing',
 'wait',
 'great',
 'much',
 'oh',
 'hey',
 'pls',
 'wat',
 'number',
 'messag',
 'na',
 'friend',
 'thank',
 'way',
 'msg',
 'min',
 'right',
 'prize',
 'wan',
 'feel',
 'even',
 'yes',
 'let',
 'pick',
 'tomorrow',
 'alreadi',
 'win',
 'said',
 'yeah',
 'realli',
 'amp',
 'leav',
 'care',
 'co',
 'babe',
 'morn',
 'didnt',
 'life',
 'last',
 'year',
 'sure',
 'servic',
 'find',
 'keep',
 'would',
 'anyth',
 'nokia',
 'contact',
 'cash',
 'ive',
 'buy',
 'sleep',
 't

In [26]:
model.corpus_count

5567

In [27]:
model.epochs

5

In [30]:
model.wv.similar_by_word('kid')

[('lol', 0.9974920749664307),
 ('realli', 0.9974381327629089),
 ('th', 0.997393012046814),
 ('thank', 0.9973331093788147),
 ('half', 0.9973282217979431),
 ('friend', 0.9973078966140747),
 ('much', 0.9972817301750183),
 ('went', 0.9972798228263855),
 ('well', 0.9972671270370483),
 ('man', 0.9972610473632812)]