# Natural Language Processing (NLP)

In [21]:
# Libraries
import nltk
from nltk.corpus import stopwords

In [22]:
# Examples of some stop words in English
stopwords.words('english')[0:1000:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

### First Example

In [23]:
import pandas as pd

#Pulling in the data and looking at the top rows
messages = pd.read_csv('/Users/jared/Downloads/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv', encoding = 'latin-1')
messages.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives around here though",,,


In [24]:
#Clean up the data
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [25]:
# Summary stats on our data
messages.shape


(5572, 2)

In [26]:
# We have a lot more ham values than ham values
messages['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [27]:
# Missing Data
print('Number of nulls in label: {}'.format(messages['label'].isnull().sum()))
print('Number of nulls in text: {}'.format(messages['text'].isnull().sum()))

Number of nulls in label: 0
Number of nulls in text: 0


### Pre-Processing Text Data

In [28]:
import pandas as pd

#Changing the way the tables will be displayed so that we can read more of the data this time around
pd.set_option('display.max_colwidth', 100)

messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [29]:
#Remove Punctuation

#To do this, we need to show python what punctuation looks like
#This library has a package called "puncatuation" that we can use for this step
import string

#Showing the punctuation
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [30]:
#The reason we do this is to remove noise from the data
"This message is spam" == "This message is spam."

False

In [31]:
#Building a function to remove punctuation
def remove_punct(text):
    text = "".join([char for char in text if char not in string.punctuation])
    return text

messages['text_clean'] = messages['text'].apply(lambda x: remove_punct(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


In [32]:
# Tokenization - splitting our sentences into a list of words
import re

# \W+ will split a text wherever it sees one or more non-word characters (white space, special characters, etc.)
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

# We apply our function and then lower case all our words because python is case sensitive
messages['text_tokenized'] = messages['text_clean'].apply(lambda x: tokenize(x.lower()))

messages.head()

Unnamed: 0,label,text,text_clean,text_tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


In [33]:
# Remove Stop Words
import nltk

stopwords = nltk.corpus.stopwords.words('english')

In [34]:
#Define a function to remove the stop words
def remove_stopwords(tokenized_text):
    text = [word for word in tokenized_text if word not in stopwords]
    return text

messages['text_nostop'] = messages['text_tokenized'].apply(lambda x: remove_stopwords(x))

messages.head()

Unnamed: 0,label,text,text_clean,text_tokenized,text_nostop
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


### Term Frequency - Inverse Document Frequency (TF-IDF)
- This creates a document-term matrix; one row per document, one column per word in the corpus
- Generates a weighting for each word/document pair intended to reflect how important a given word is to the document within the context of its frequency within a larger corpus

In [35]:
#Libraries
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')

#Reformat Data
mess = pd.read_csv('/Users/jared/Downloads/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv', encoding = 'latin-1')
mess = mess.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
mess.columns = ["label", "text"]
mess.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [36]:
#One function to pre-process the data
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

In [51]:
#Fitting a basic TFIDF Vectorizer and view the results
from sklearn.feature_extraction.text import TfidfVectorizer

#This will clean the data, fit it in a vectorizer, then create our document form matrix
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(mess['text'])

#Output
print(X_tfidf.shape)

#Feature Names
print(tfidf_vect.get_feature_names_out())

#Converting the sparse matrix to dataframe
X_features = pd.DataFrame(X_tfidf.toarray())
X_features.head()

(5572, 9395)
['' '0' '008704050406' ... 'ûïharry' 'ûò' 'ûówell']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9385,9386,9387,9388,9389,9390,9391,9392,9393,9394
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
#Applying a machine learning 
from sklearn.ensemble import RandomForestClassifier

In [54]:
# Import the methods that will be needed to evaluate a basic model
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

In [55]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features,
                                                    messages['label'],
                                                    test_size = 0.2)

In [59]:
# Fit a basic Random Forest Model
rf = RandomForestClassifier()
rf_model = rf.fit(X_train, y_train)

In [60]:
# Make a prediction
y_pred = rf_model.predict(X_test)

In [61]:
# Evaluate model predictions using precision and recall
precision = precision_score(y_test, y_pred, pos_label = 'spam')
recall = recall_score(y_test, y_pred, pos_label = 'spam')
print('Precision: {} / Recall: {}'.format(round(precision, 3), round(recall,3)))

Precision: 1.0 / Recall: 0.796


### Word2Vec
- Word2Vec is a shallow, two-layer neural network that accepts a text corpus as an input, and it returns a set of vectors (also known as embeddings); each vector is a numberic representation of a given word.
- "You shall know a word by the company it keeps."
- Python learns the context of a word by looking at a window of words before and after it in the corpus. (Skip gramm method)
- When we convert all the words in our corpus into a vector representation, we can then graph each vector. This gives us the ability to see how similar two words are (in meaning) by finding their "cosine similarity" (cosine of the two angles of the two vectors).
- Theorectically, you can construct analogies with these vectors. For example, if you subtracts man from the vector "King" and then added "woman" you would get the vector for "Queen". Thus, the model could create the analogy "A man is to king as a woman is to queen".

### Some other pre-trained embeddings to explore
- glove-twitter-{25/50/100/200}
- glove-wiki-gigaword-{50/200/300}
- word2vec-google-news-300
- word2vec-ruscorpora-news-300

In [2]:
#Installing Gensim
!pip install -U gensim



In [4]:
# Load pretrained word vectors using gensim
import gensim.downloader as api

wiki_embeddings = api.load('glove-wiki-gigaword-100')



In [5]:
#Explore the word vector for "king"
wiki_embeddings['king']

# This is a numeric representation the word "king". Using the cosine similarity technique, we can find vectors that are the most similar to the vector for "king".

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [10]:
# Finding similar vectors
wiki_embeddings.most_similar('virtue')

[('ideals', 0.6479806303977966),
 ('devotion', 0.6334191560745239),
 ('belief', 0.6317397952079773),
 ('morality', 0.6111955046653748),
 ('discipline', 0.6109753251075745),
 ('virtues', 0.6106486916542053),
 ('true', 0.6067261099815369),
 ('patriotism', 0.6050897836685181),
 ('moral', 0.5999709963798523),
 ('necessity', 0.5992565751075745)]

## Training our own Word2Vec Model

In [13]:
# Read in data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth',100)

messages = pd.read_csv('/Users/jared/Downloads/Ex_Files_Adv_NLP_Python_ML/Exercise Files/data/spam.csv', encoding = 'latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]
messages.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [14]:
# Using gensim's built-in data pre-processor
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,label,text,text_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [15]:
# Splitting the data for training purposes
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size = 0.3)

In [20]:
# Training our word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   window=5,
                                   min_count=2)


In [22]:
# Explore the word vecotr for "king" base on our trained model
w2v_model.wv['king']

array([-0.01501366,  0.02767842,  0.0081264 ,  0.00025378,  0.01012371,
       -0.06689952,  0.02671312,  0.08102237, -0.0200811 , -0.05020381,
       -0.01313363, -0.06205527, -0.00835234,  0.01877814,  0.00205935,
       -0.01102802,  0.02242907, -0.03060235, -0.00116967, -0.09444411,
        0.01765111,  0.00886815,  0.01604872, -0.03018015, -0.01928473,
        0.00773258, -0.04124585, -0.0270099 , -0.0254945 ,  0.00595166,
        0.0337431 ,  0.0068646 ,  0.01580341, -0.04091191, -0.01823169,
        0.05437686,  0.00846641, -0.03355348, -0.00264585, -0.07072156,
        0.0035527 , -0.03916473, -0.03678926,  0.00590911,  0.05124301,
       -0.00368593, -0.03757008, -0.0146032 ,  0.01419565,  0.02798258,
        0.02450917, -0.02944633, -0.00941287,  0.01597572, -0.00533353,
        0.01565976,  0.01492448,  0.01272453, -0.01411441,  0.01342273,
        0.02347603,  0.01636804, -0.00064177, -0.00951722, -0.05886045,
        0.04269448,  0.00506825,  0.03498377, -0.04760833,  0.04

In [23]:
# See most similar words to "king" based on word vectors from our trained model
w2v_model.wv.most_similar('king')

[('kind', 0.9858094453811646),
 ('voucher', 0.9847893118858337),
 ('kinda', 0.9847197532653809),
 ('probably', 0.9846299290657043),
 ('hope', 0.9845755696296692),
 ('are', 0.9845169186592102),
 ('good', 0.9845061302185059),
 ('says', 0.9844889640808105),
 ('looking', 0.9844638109207153),
 ('too', 0.9844145178794861)]

## Prep Word Vectors

In [25]:
# Generate a list of words the word2vec model learned word vectors for
w2v_model.wv.index_to_key

['you',
 'to',
 'the',
 'and',
 'is',
 'in',
 'me',
 'my',
 'for',
 'it',
 'your',
 'call',
 'of',
 'that',
 'have',
 'on',
 'now',
 'are',
 'so',
 'can',
 'not',
 'but',
 'or',
 'at',
 'we',
 'get',
 'be',
 'do',
 'with',
 'no',
 'just',
 'if',
 'ur',
 'will',
 'this',
 'up',
 'free',
 'how',
 'gt',
 'lt',
 'when',
 'from',
 'what',
 'go',
 'all',
 'll',
 'out',
 'ok',
 'know',
 'am',
 'like',
 'day',
 'was',
 'then',
 'got',
 'he',
 'good',
 'its',
 'come',
 'only',
 'there',
 'time',
 'want',
 'love',
 'send',
 'text',
 'she',
 'today',
 'txt',
 'as',
 'stop',
 'by',
 'one',
 'going',
 'sorry',
 'mobile',
 'don',
 'home',
 'our',
 'about',
 'still',
 'lor',
 'see',
 'hi',
 'need',
 'reply',
 'back',
 'tell',
 'her',
 'take',
 'they',
 'later',
 'new',
 'please',
 'pls',
 'any',
 'been',
 'da',
 'some',
 'week',
 'did',
 'dont',
 'ì_',
 'think',
 'has',
 'here',
 'hope',
 'great',
 'too',
 'where',
 'phone',
 'dear',
 'well',
 'night',
 'msg',
 're',
 'him',
 'who',
 'much',
 'won',


In [34]:
# We are getting a nested set of arrays inside an array of all the word vectors for each word in a text
w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key]) for ls in X_test])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1672,) + inhomogeneous part.

In [28]:
# Length help
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

NameError: name 'w2v_vect' is not defined