# Install and import nltk

In [1]:
import nltk
#nltk.download()

In [None]:
#to see what attributes and functions we have with nltk
dir(nltk)

In [2]:
#stopwords are words that does not have a meaning seperate from the sentence 
from nltk.corpus import stopwords

stopwords.words('english')[0:5]

['i', 'me', 'my', 'myself', 'we']

In [3]:
# to see the first 500 stopwords with the interval of 25 (1. then 26. then 51...)
stopwords.words('english')[0:500:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

# Reading text data

In [4]:
import pandas as pd

messages = pd.read_csv('spam.csv', encoding ='latin-1')
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
messages = messages.drop(labels = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
messages.columns = ["label","text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
messages.shape

(5572, 2)

In [7]:
#to see the amount of data we have for each category
messages['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [8]:
#to see if we have any missing data
print('number of nulls in labels : {}'.format(messages['label'].isnull().sum()))
print('number of nulls in text : {}'.format(messages['text'].isnull().sum()))

number of nulls in labels : 0
number of nulls in text : 0


# Cleaning text data
there are 3 steps to clean text data:
1. removing punctuation
2. Tokenization
3. removing stopwords

In [9]:
import string

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
#defining a function that will remove punctuation
def remove_punct(text): 
    text= ''.join([char for char in text if char not in string.punctuation])
    return text

In [11]:
#now we will apply it to our data with lambda function
messages['text_clean'] = messages['text'].apply(lambda x : remove_punct(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


In [12]:
#defining a function that will tokenize our cleaned text
import re

def tokenize(text):
    tokens =  re.split('\W+',text)
    return tokens

messages['text_tokenized'] = messages['text_clean'].apply(lambda x: tokenize(x.lower()))
messages.head()

Unnamed: 0,label,text,text_clean,text_tokenized
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [13]:
#to avoid confusion you need to add this code since stopwords is the variable that's pointing to the CorpusReader object in nltk
#The actual stopwords (i.e. a list of stopwords) you're looking for can be adressed as :
stop_words = set(stopwords.words("english"))

In [14]:
#now we will define a function to eliminate stopwords so our model will have a better understanding from prepared text

def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word not in stop_words]
    return text

messages['text_nostop'] = messages['text_tokenized'].apply(lambda x: remove_stopwords(x))
messages.head()

Unnamed: 0,label,text,text_clean,text_tokenized,text_nostop
0,ham,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, i, dont, think, he, goes, to, usf, he, l...","[nah, dont, think, goes, usf, lives, around, t..."


In [15]:
#so we learned how to make this steps part by part. Now we will make it one function to move on with TF-IDF process
def clean_text(text):
    text= ''.join([word.lower() for word in text if word not in string.punctuation])
    tokens =  re.split('\W+',text)
    text = [word for word in tokens if word not in stop_words]
    return text

In [16]:
# now we will use TF-IDF to make our text data numerical.The way that it is stored with this method is: sparse matrix
#use analyzer as preprocessing function and then fit transform the data. The first element (number of text) is same (5572) while
#the second part is not 2 since it represents all the unique word our TF-IDF detects
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(messages['text'])
print(X_tfidf.shape)
#print(tfidf_vect.get_feature_names())

(5572, 9395)


In [17]:
# now to train a model we need to make our sparse matrix a pandas dataframe 

X_features = pd.DataFrame(X_tfidf.toarray())
X_features.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from sklearn.ensemble import RandomForestClassifier
print(RandomForestClassifier)

<class 'sklearn.ensemble._forest.RandomForestClassifier'>


In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_features, messages['label'],test_size=0.2)

In [20]:
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [21]:
y_pred = rf_model.predict(X_test)
from sklearn.metrics import precision_score, recall_score

precision= precision_score(y_test, y_pred, pos_label = 'spam')
recall = recall_score(y_test, y_pred, pos_label = 'spam')
print('Precision : {} / Recall : {}'.format(round(precision,3),round(recall,3)))

Precision : 1.0 / Recall : 0.797


# word2vec

In [22]:
#to get better results we can use pretrained embaddings from known websites, they will give better results
!pip install -U gensim



In [23]:
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [24]:
#let's see a vector embedding for a spesific word
wiki_embeddings['king']

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [25]:
#to find to words that are most similar to king based on trained vectors
wiki_embeddings.most_similar('king')

[('prince', 0.7682328820228577),
 ('queen', 0.7507690787315369),
 ('son', 0.7020888328552246),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175),
 ('throne', 0.6919989585876465),
 ('kingdom', 0.6811409592628479),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712858080863953),
 ('ii', 0.6676074266433716)]

In [26]:
#now let's see the same example with word2vec
import pandas as pd
messages = pd.read_csv('spam.csv', encoding ='latin-1')
messages = messages.drop(labels = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis = 1)
messages.columns = ["label","text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
import gensim

In [28]:
messages['text_clean'] = messages['text'].apply(lambda x : gensim.utils.simple_preprocess(x))
messages.head(3)

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, wkly, comp, to, win, fa, cup..."


In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'], messages['label'],test_size=0.2)

In [30]:
#now we will train the word2vec model which comes with gensim package
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [31]:
w2v_model.wv['king']

array([-0.06534448,  0.07740548,  0.0098363 ,  0.01370418, -0.00305823,
       -0.11036395,  0.03916888,  0.14430027, -0.06123946, -0.01940518,
       -0.05578702, -0.08848359, -0.01178622,  0.02408841,  0.01230709,
       -0.02967128,  0.00807217, -0.08906545, -0.01969034, -0.15604548,
        0.03554842,  0.08074233,  0.07598514, -0.03046089, -0.01329776,
        0.01608412, -0.08783561, -0.05327848, -0.0487599 ,  0.05187511,
        0.08814919,  0.01783894,  0.01663311, -0.08259889, -0.02438338,
        0.09956935, -0.00438019, -0.09820671, -0.01771092, -0.1400168 ,
       -0.00707782, -0.06539749, -0.02387323, -0.00306203,  0.06027883,
       -0.00285255, -0.09187299, -0.01751372,  0.01852628,  0.04020339,
        0.0514887 , -0.08447365, -0.01119655, -0.00038854, -0.01396212,
        0.02000972,  0.03415037, -0.02795216, -0.07785056,  0.04497785,
        0.01201104, -0.01521989,  0.00862918,  0.02303184, -0.05566642,
        0.0913811 ,  0.04736568,  0.08335682, -0.11283415,  0.09

In [32]:
w2v_model.wv.most_similar('king')

[('after', 0.9964773654937744),
 ('msg', 0.9964723587036133),
 ('is', 0.996422529220581),
 ('said', 0.9964016675949097),
 ('with', 0.9963865280151367),
 ('im', 0.9963786005973816),
 ('last', 0.9963719844818115),
 ('games', 0.9963648915290833),
 ('of', 0.996343731880188),
 ('called', 0.9963247776031494)]

In [33]:
#we can observe that when we used wikipedia embedding similar words made more sense

In [35]:
#this represents aLL od the words our model created a vector for. ( all words that appears training data more than 2)
#w2v_model.wv.index_to_key

In [36]:
#now we take the every element in the x_test and turn to an array that will include learned words only
import numpy as np
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key]) for ls in X_test], dtype=object)

In [38]:
#let's see examples of how many words are learned in a sentence.
#for i, v in enumerate(w2v_vect):
    #print(len(X_test.iloc[i]), len(v))

In [39]:
#the vector length must be equal and equal to size of 100 as we gave in model so we will apply: 
w2v_vect_avg = []
for vect in w2v_vect:
    if len(vect)!= 0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [41]:
#for i, v in enumerate(w2v_vect_avg):
    #print(len(X_test.iloc[i]), len(v))

# doc2vec

  doc2vec is a shallow, two layer neural network that accepts a text corpus as an input, and it returns a set of vectors 
(also known as embeddings); each vector is a numeric representation of a given sentence, paragraph or document
instead of creating a vector for each word doc2vec creates a vector for a document(group of words)

In [42]:
#for doc2vec we need to create tagged document objects to train the model
tagged_docs = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

In [43]:
#let's look what a tagged document looks like
tagged_docs[0]

TaggedDocument(words=['just', 'got', 'up', 'have', 'to', 'be', 'out', 'of', 'the', 'room', 'very', 'soon', 'û_', 'hadn', 'put', 'the', 'clocks', 'back', 'til', 'at', 'shouted', 'at', 'everyone', 'to', 'get', 'up', 'and', 'then', 'realised', 'it', 'was', 'wahay', 'another', 'hour', 'in', 'bed'], tags=[0])

In [44]:
#train a basic doc2vec model
d2v_model = gensim.models.Doc2Vec(tagged_docs,
                                 vector_size=100,
                                 window=5,
                                 min_count=2)

In [45]:
#if you try to pass a single word from this model you will get an error so you need to pass a list of words
d2v_model.infer_vector(['i','am','learning','nlp'])

array([-8.54809675e-03,  8.92898627e-03,  7.45030865e-03, -5.89309447e-03,
        4.14397102e-03, -2.65573990e-02,  9.43924207e-03,  2.97321435e-02,
       -2.11078599e-02, -1.13650328e-02, -1.58824474e-02, -2.81047560e-02,
        2.47373641e-03,  3.97854578e-03,  4.55919374e-03, -2.14168318e-02,
        5.85795473e-03, -1.79522689e-02, -7.29219429e-03, -4.21248078e-02,
        1.64590981e-02,  5.59809525e-03,  1.31115252e-02, -7.84364901e-03,
       -1.34117831e-03, -1.39555475e-03, -2.38405820e-02, -1.44826472e-02,
       -1.53281605e-02,  6.73000654e-03,  2.39295829e-02,  3.33640887e-03,
        1.18160993e-02, -1.84785239e-02, -6.39353553e-03,  3.57452743e-02,
       -1.30600098e-03, -1.42853726e-02, -1.18792607e-02, -3.10618244e-02,
       -1.16173867e-02, -1.13137616e-02, -8.32940079e-03, -3.57804284e-03,
        2.29755584e-02, -3.59083060e-03, -2.16139574e-02, -8.07914510e-03,
        1.21471705e-02,  1.71006303e-02,  4.90416260e-03, -1.74141396e-02,
       -5.70918480e-03, -

In [46]:
#how do we prepare these vectors to be used in a machine learning model?
vectors = [[d2v_model.infer_vector(words)] for words in X_test]

In [47]:
vectors[0]

[array([-0.01055653, -0.00972038,  0.0114952 ,  0.00807805,  0.00609327,
        -0.01216493, -0.00361454,  0.00671203, -0.00659027, -0.00457923,
        -0.0219323 ,  0.00110492,  0.01618778,  0.02695931,  0.00175064,
         0.00071895,  0.00553958, -0.01323192,  0.00737582, -0.0130015 ,
         0.00493365, -0.00190548,  0.02939511,  0.0020456 ,  0.01326407,
         0.00415145, -0.01564152, -0.00351014,  0.00399554, -0.00722946,
         0.02043383, -0.0001234 ,  0.00109905,  0.00391867, -0.01017171,
        -0.00391985,  0.00840653, -0.01409786,  0.0173386 , -0.02481797,
         0.00153598,  0.00240076, -0.00347901, -0.00440429, -0.01170508,
        -0.00153421, -0.01564072,  0.00871679, -0.01284049,  0.00984484,
         0.00213477,  0.00264153,  0.00161318,  0.00861747, -0.00274245,
         0.02325118,  0.00849877, -0.00141126, -0.02058314,  0.00012791,
         0.0082948 ,  0.00329112,  0.00896829,  0.00990194, -0.00571436,
         0.00109737,  0.00704232,  0.00038155,  0.0