# Preprocessing Data

To preprocess the dataset for our movie dialogue pairs, we could run the Count Vectorizer and TF-IDF from sklearn.

In [1]:
# Importing Dependencies for Vectorizers
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
# Generating path: the processed files live in <parent of the current working
# directory>/processed_files
work_dir = os.getcwd()
main_chatbot_dir = os.path.dirname(work_dir)
# os.path.join inserts the separator that is correct for the host OS instead of
# a hard-coded '/'
processed_files_dir = os.path.join(main_chatbot_dir, 'processed_files')

In [3]:
# Grabbing file from path
# NOTE: unpickling can execute arbitrary code stored in the file, so only load
# pickles that come from a trusted source.
# The pickle is read with xz decompression.
file = processed_files_dir + "/processed_movie_dialogue_pairs.pkl"
df = pd.read_pickle(file, compression='xz')
df

Unnamed: 0,Comment,Reply,Expanded Comment,Expanded Reply,Comment Length,Reply Length,Tokenized Comment,Tokenized Reply,Tokenized Comment_no_sw,Tokenized Reply_no_sw,...,Sentiment Comment,Sentiment Reply,Sentiment Comment_no_sw,Sentiment Reply_no_sw,Sentiment Comment Compound Score,Sentiment Reply Compound Score,Sentiment Comment_no_sw Compound Score,Sentiment Reply_no_sw Compound Score,Sentiment Score Overall,Sentiment Score_no_sw Overall
0,Can we make this quick? Roxanne Korrine and An...,"Well, I thought we'd start with pronunciation,...",can we make this quick roxanne korrine and and...,well i thought we would start with pronunciati...,21,14,"[can, we, make, this, quick, roxanne, korrine,...","[well, i, thought, we, would, start, with, pro...","[make, quick, roxanne, korrine, andrew, barret...","[well, thought, would, start, pronunciation, o...",...,"{'neg': 0.163, 'neu': 0.837, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...","{'neg': 0.29, 'neu': 0.71, 'pos': 0.0, 'compou...","{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound...",-0.6240,0.4588,-0.6240,0.4588,-0.1652,-0.1652
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part ...,well i thought we would start with pronunciati...,not the hacking and gagging and spitting part ...,14,9,"[well, i, thought, we, would, start, with, pro...","[not, the, hacking, and, gagging, and, spittin...","[well, thought, would, start, pronunciation, o...","[hacking, gagging, spitting, part, please]",...,"{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...","{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...","{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound...","{'neg': 0.312, 'neu': 0.39, 'pos': 0.299, 'com...",0.4588,0.3182,0.4588,-0.0258,0.7770,0.4330
2,Not the hacking and gagging and spitting part ...,Okay then how 'bout we try out some French cui...,not the hacking and gagging and spitting part ...,okay then how about we try out some french cui...,9,12,"[not, the, hacking, and, gagging, and, spittin...","[okay, then, how, about, we, try, out, some, f...","[hacking, gagging, spitting, part, please]","[okay, try, french, cuisine, saturday, night]",...,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...","{'neg': 0.0, 'neu': 0.853, 'pos': 0.147, 'comp...","{'neg': 0.312, 'neu': 0.39, 'pos': 0.299, 'com...","{'neg': 0.0, 'neu': 0.725, 'pos': 0.275, 'comp...",0.3182,0.2263,-0.0258,0.2263,0.5445,0.2005
3,You're asking me out That's so cute What's you...,Forget it,you are asking me out that is so cute what is ...,forget it,14,2,"[you, are, asking, me, out, that, is, so, cute...","[forget, it]","[asking, cute, name]",[forget],...,"{'neg': 0.0, 'neu': 0.771, 'pos': 0.229, 'comp...","{'neg': 0.655, 'neu': 0.345, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound...","{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound...",0.5949,-0.2263,0.4588,-0.2263,0.3686,0.2325
4,"No, no, it's my fault - we didn't have a prope...",Cameron,no no it is my fault we did not have a proper ...,cameron,16,1,"[no, no, it, is, my, fault, we, did, not, have...",[cameron],"[fault, proper, introduction]",[cameron],...,"{'neg': 0.441, 'neu': 0.559, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.574, 'neu': 0.426, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",-0.7269,0.0000,-0.4019,0.0000,-0.7269,-0.4019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221611,"Your orders, Mr Vereker?",I'm to take the Sikali with the main column to...,your orders mr vereker,i am to take the sikali with the main column t...,4,13,"[your, orders, mr, vereker]","[i, am, to, take, the, sikali, with, the, main...","[orders, mr, vereker]","[take, sikali, main, column, river]",...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
221612,I'm to take the Sikali with the main column to...,Lord Chelmsford seems to want me to stay back ...,i am to take the sikali with the main column t...,lord chelmsford seems to want me to stay back ...,13,12,"[i, am, to, take, the, sikali, with, the, main...","[lord, chelmsford, seems, to, want, me, to, st...","[take, sikali, main, column, river]","[lord, chelmsford, seems, want, stay, back, ba...",...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.894, 'pos': 0.106, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...",0.0000,0.0772,0.0000,0.0772,0.0772,0.0772
221613,Lord Chelmsford seems to want me to stay back ...,I think Chelmsford wants a good man on the bor...,lord chelmsford seems to want me to stay back ...,i think chelmsford wants a good man on the bor...,12,23,"[lord, chelmsford, seems, to, want, me, to, st...","[i, think, chelmsford, wants, a, good, man, on...","[lord, chelmsford, seems, want, stay, back, ba...","[think, chelmsford, wants, good, man, border, ...",...,"{'neg': 0.0, 'neu': 0.894, 'pos': 0.106, 'comp...","{'neg': 0.247, 'neu': 0.588, 'pos': 0.165, 'co...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...","{'neg': 0.323, 'neu': 0.462, 'pos': 0.215, 'co...",0.0772,-0.4767,0.0772,-0.4767,-0.3995,-0.3995
221614,"Well I assure you, Sir, I have no desire to cr...","And I assure you, you do not In fact I'd be ob...",well i assure you sir i have no desire to crea...,and i assure you you do not in fact i would be...,13,22,"[well, i, assure, you, sir, i, have, no, desir...","[and, i, assure, you, you, do, not, in, fact, ...","[well, assure, sir, desire, create, difficulti...","[assure, fact, would, obliged, best, advice, s...",...,"{'neg': 0.243, 'neu': 0.265, 'pos': 0.492, 'co...","{'neg': 0.0, 'neu': 0.732, 'pos': 0.268, 'comp...","{'neg': 0.175, 'neu': 0.146, 'pos': 0.679, 'co...","{'neg': 0.0, 'neu': 0.476, 'pos': 0.524, 'comp...",0.5719,0.7650,0.7096,0.7650,1.3369,1.4746


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 191928 entries, 0 to 221615
Data columns (total 28 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Comment                                 191928 non-null  object 
 1   Reply                                   191928 non-null  object 
 2   Expanded Comment                        191928 non-null  object 
 3   Expanded Reply                          191928 non-null  object 
 4   Comment Length                          191928 non-null  int64  
 5   Reply Length                            191928 non-null  int64  
 6   Tokenized Comment                       191928 non-null  object 
 7   Tokenized Reply                         191928 non-null  object 
 8   Tokenized Comment_no_sw                 191928 non-null  object 
 9   Tokenized Reply_no_sw                   191928 non-null  object 
 10  Lemmatized Comment                      1919

In [5]:
df.columns

Index(['Comment', 'Reply', 'Expanded Comment', 'Expanded Reply',
       'Comment Length', 'Reply Length', 'Tokenized Comment',
       'Tokenized Reply', 'Tokenized Comment_no_sw', 'Tokenized Reply_no_sw',
       'Lemmatized Comment', 'Lemmatized Reply', 'Lemmatized Comment_no_sw',
       'Lemmatized Reply_no_sw', 'Lem Comment Length', 'Lem Reply Length',
       'Lem Comment_no_sw Length', 'Lem Reply_no_sw Length',
       'Sentiment Comment', 'Sentiment Reply', 'Sentiment Comment_no_sw',
       'Sentiment Reply_no_sw', 'Sentiment Comment Compound Score',
       'Sentiment Reply Compound Score',
       'Sentiment Comment_no_sw Compound Score',
       'Sentiment Reply_no_sw Compound Score', 'Sentiment Score Overall',
       'Sentiment Score_no_sw Overall'],
      dtype='object')

Here, we'll build training and testing sets for lemmatized comments and replies with and without stop-words. Then we'll start with the CountVectorizer from sklearn.

In [6]:
# Generating training and testing data with sklearn's train_test_split() function.
# sklearn's text vectorizers expect an iterable of strings rather than lists of
# tokens, so each token list is joined back into one space-separated string
# before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    df['Lemmatized Comment'].str.join(' '),
    df['Lemmatized Reply'].str.join(' '),
    test_size=0.25,
    random_state=7,
)

In [7]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(143946,) (47982,) (143946,) (47982,)


In [8]:
# Checking one of the training sets
X_train

131709                                                  yah
131075    please tell me your dawg s not try to rekindle...
120040                                   about mark carlton
10481                    mom do you want some time with dad
46688     headache huh too much sun i guess you want to ...
                                ...                        
61683                           that be your fault not mine
12307     we look all over for you what be you do back here
57399                                            want a cup
219010                               where s this come from
71203                               really what do you feel
Name: Lemmatized Comment, Length: 143946, dtype: object

In [9]:
y_train

131709        yah okay i think i will drive down there then
131075                         course not strictly business
120040                                         about benito
10481                                         yes thank you
46688                                 no no i want to go in
                                ...                        
61683                   i do not think i be go to the party
12307               oh i be just examine this rear hatchway
57399                                            predate it
219010    i work juvenile in dade county for three year ...
71203     i be uncertain i have have little experience w...
Name: Lemmatized Reply, Length: 143946, dtype: object

In [10]:
X_test

143862                                 an honor carlo pazzi
183942    sorry police order i will stay out of the way ...
155364         i think you will find it be in your interest
194481    be that what you be worried about look bad at ...
164181    oh well er brave concorde you shall not have b...
                                ...                        
11579     of course he be not real you think i would be ...
40413                                   can not you stop it
165640                                                   hi
117121     the moon be up my night to howl will you be okay
149237                       i would not be a bit surprised
Name: Lemmatized Comment, Length: 47982, dtype: object

In [11]:
y_test

143862                           no y know i never doubt it
183942                                                 shit
155364    hold on ok look willie it be plank outside ask...
194481                            you be defend what you do
164181                     i think i could pull through sir
                                ...                        
11579                                      it be a good job
40413                    if i could stop it i would stop it
165640                                           come on in
117121    oh god i forget yes yes you go poor thing you ...
149237    ridiculous of you to think of kill yourself fo...
Name: Lemmatized Reply, Length: 47982, dtype: object

In [12]:
# Resetting indexes for all my training and testing data so each split is
# labelled 0..n-1 instead of keeping the shuffled row labels it inherited from df.
# reset_index(drop=True) discards the old index directly and keeps the object a
# Series (name preserved), so no DataFrame round-trip or column drop is needed.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [13]:
# Checking format of dataset after index reset and manipulation
X_train

0                                                       yah
1         please tell me your dawg s not try to rekindle...
2                                        about mark carlton
3                        mom do you want some time with dad
4         headache huh too much sun i guess you want to ...
                                ...                        
143941                          that be your fault not mine
143942    we look all over for you what be you do back here
143943                                           want a cup
143944                               where s this come from
143945                              really what do you feel
Name: Lemmatized Comment, Length: 143946, dtype: object

In [14]:
y_train

0             yah okay i think i will drive down there then
1                              course not strictly business
2                                              about benito
3                                             yes thank you
4                                     no no i want to go in
                                ...                        
143941                  i do not think i be go to the party
143942              oh i be just examine this rear hatchway
143943                                           predate it
143944    i work juvenile in dade county for three year ...
143945    i be uncertain i have have little experience w...
Name: Lemmatized Reply, Length: 143946, dtype: object

In [15]:
X_test

0                                     an honor carlo pazzi
1        sorry police order i will stay out of the way ...
2             i think you will find it be in your interest
3        be that what you be worried about look bad at ...
4        oh well er brave concorde you shall not have b...
                               ...                        
47977    of course he be not real you think i would be ...
47978                                  can not you stop it
47979                                                   hi
47980     the moon be up my night to howl will you be okay
47981                       i would not be a bit surprised
Name: Lemmatized Comment, Length: 47982, dtype: object

In [16]:
y_test

0                               no y know i never doubt it
1                                                     shit
2        hold on ok look willie it be plank outside ask...
3                                you be defend what you do
4                         i think i could pull through sir
                               ...                        
47977                                     it be a good job
47978                   if i could stop it i would stop it
47979                                           come on in
47980    oh god i forget yes yes you go poor thing you ...
47981    ridiculous of you to think of kill yourself fo...
Name: Lemmatized Reply, Length: 47982, dtype: object

In [17]:
# Checking the second element (position 1) in the training set X_train
X_train.iloc[1]

'please tell me your dawg s not try to rekindle thing with my sister'

In [19]:
y_train.iloc[1]

'course not strictly business'

In [20]:
# Checking the second element (position 1) in the test set X_test
X_test.iloc[1]

'sorry police order i will stay out of the way i promise'

In [22]:
y_test.iloc[1]

'shit'

Having split up our data into training and test sets, we'll start fitting our vectorizers.

In [23]:
# Initializing the CountVectorizer for the data with stop-words (cv); the
# counterpart for the data without stop-words (cv_no_sw) is created further
# below. min_df=3 filters out infrequent words, i.e. words that appear in
# fewer than 3 documents
cv = CountVectorizer(min_df=3)

In [31]:
# Fit the vectorizer on the training comments only; this is the step that
# builds the vocabulary.
X_train_transformed_cv = cv.fit_transform(X_train)

# Every other split is only transformed, so it is encoded against the
# vocabulary learned from X_train above.
X_test_transformed_cv, y_train_transformed_cv, y_test_transformed_cv = (
    cv.transform(texts) for texts in (X_test, y_train, y_test)
)

In [32]:
# Checking values in the second element (row 1) that are non-zero.
# Only that one row is densified: calling .toarray() on the whole sparse matrix
# would materialize every row just to read a single one.
row_dense = X_train_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[2826, 7053, 7526, 7776, 8522, 10217, 11133, 11246, 11360, 11592, 12382, 12532]


['dawg',
 'me',
 'my',
 'not',
 'please',
 'sister',
 'tell',
 'thing',
 'to',
 'try',
 'with',
 'your']

In [33]:
# Non-zero vocabulary entries of row 1 of the test comments.
# Densify only that row rather than the whole sparse matrix.
row_dense = X_test_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[7881, 7975, 8023, 8568, 8811, 10455, 10677, 11212, 12172, 12330]


['of',
 'order',
 'out',
 'police',
 'promise',
 'sorry',
 'stay',
 'the',
 'way',
 'will']

In [36]:
# Non-zero vocabulary entries of row 1 of the training replies.
# Densify only that row rather than the whole sparse matrix.
row_dense = y_train_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[1567, 2566, 7776, 10779]


['business', 'course', 'not', 'strictly']

In [37]:
# Non-zero vocabulary entries of row 1 of the test replies.
# Densify only that row rather than the whole sparse matrix.
row_dense = y_test_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[10072]


['shit']

In [38]:
# Map each column index of the count matrices to the word it stands for.
# (scikit-learn 1.0 deprecated get_feature_names() in favor of
# get_feature_names_out(); 1.2 removed it.)
cv_vocab_dict = dict(enumerate(cv.get_feature_names()))
print("cv_vocab_size:", len(cv_vocab_dict))

cv_vocab_size: 12585


In [39]:
X_train_transformed_cv

<143946x12585 sparse matrix of type '<class 'numpy.int64'>'
	with 1240887 stored elements in Compressed Sparse Row format>

In [40]:
X_test_transformed_cv

<47982x12585 sparse matrix of type '<class 'numpy.int64'>'
	with 409304 stored elements in Compressed Sparse Row format>

In [41]:
y_train_transformed_cv

<143946x12585 sparse matrix of type '<class 'numpy.int64'>'
	with 1251188 stored elements in Compressed Sparse Row format>

In [42]:
y_test_transformed_cv

<47982x12585 sparse matrix of type '<class 'numpy.int64'>'
	with 413741 stored elements in Compressed Sparse Row format>

The 4 sparse matrices above contain the word vectors generated by the Count Vectorizer. We can do the same with the data without stop-words.

In [43]:
# Generating training/testing sets for the data with no stop-words.
# The token lists are joined into space-separated strings, and the same
# test_size / random_state as the stop-word split are used.
X_train_no_sw, X_test_no_sw, y_train_no_sw, y_test_no_sw = train_test_split(
    df['Lemmatized Comment_no_sw'].str.join(' '),
    df['Lemmatized Reply_no_sw'].str.join(' '),
    test_size=0.25,
    random_state=7,
)

In [44]:
print(X_train_no_sw.shape, X_test_no_sw.shape, y_train_no_sw.shape, y_test_no_sw.shape)

(143946,) (47982,) (143946,) (47982,)


In [49]:
# Relabel each no-stop-word split 0..n-1. reset_index(drop=True) discards the
# old index directly and keeps the object a Series (name preserved), so no
# DataFrame round-trip or column drop is needed.
X_train_no_sw = X_train_no_sw.reset_index(drop=True)
X_test_no_sw = X_test_no_sw.reset_index(drop=True)
y_train_no_sw = y_train_no_sw.reset_index(drop=True)
y_test_no_sw = y_test_no_sw.reset_index(drop=True)

In [50]:
X_train_no_sw

0                                                       yah
1                please tell dawg try rekindle thing sister
2                                              mark carlton
3                                         mom want time dad
4         headache huh much sun guess want come still ge...
                                ...                        
143941                                           fault mine
143942                                            look back
143943                                             want cup
143944                                                 come
143945                                          really feel
Name: Lemmatized Comment_no_sw, Length: 143946, dtype: object

In [51]:
X_test_no_sw

0                                        honor carlo pazzi
1                      sorry police order stay way promise
2                                      think find interest
3                                     worried look bad pta
4        oh well er brave concorde shall fatally wound ...
                               ...                        
47977    course real think would work could afford real...
47978                                                 stop
47979                                                   hi
47980                                 moon night howl okay
47981                                 would bite surprised
Name: Lemmatized Comment_no_sw, Length: 47982, dtype: object

In [52]:
y_train_no_sw

0                                      yah okay think drive
1                                  course strictly business
2                                                    benito
3                                                 yes thank
4                                                   want go
                                ...                        
143941                                       think go party
143942                             oh examine rear hatchway
143943                                              predate
143944    work juvenile dade county three year miss pers...
143945    uncertain little experience emotion unable art...
Name: Lemmatized Reply_no_sw, Length: 143946, dtype: object

In [53]:
y_test_no_sw

0                                         know never doubt
1                                                     shit
2        hold ok look willie plank outside ask say inte...
3                                                   defend
4                                     think could pull sir
                               ...                        
47977                                             good job
47978                                could stop would stop
47979                                                 come
47980        oh god forget yes yes go poor thing ought get
47981    ridiculous think kill money eight thousand dollar
Name: Lemmatized Reply_no_sw, Length: 47982, dtype: object

In [54]:
X_train_no_sw.iloc[1]

'please tell dawg try rekindle thing sister'

In [55]:
X_test_no_sw.iloc[1]

'sorry police order stay way promise'

In [56]:
y_train_no_sw.iloc[1]

'course strictly business'

In [57]:
y_test_no_sw.iloc[1]

'shit'

In [58]:
# Initializing the CountVectorizer for the data without stop-words (cv_no_sw);
# as with cv, min_df=3 filters out infrequent words, i.e. words that appear in
# fewer than 3 documents
cv_no_sw = CountVectorizer(min_df=3)

In [59]:
# Fit the vectorizer on the no-stop-word training comments only; this is the
# step that builds the vocabulary.
X_train_no_sw_transformed_cv = cv_no_sw.fit_transform(X_train_no_sw)

# The remaining splits are only transformed, so they are encoded against the
# vocabulary learned from X_train_no_sw above.
X_test_no_sw_transformed_cv, y_train_no_sw_transformed_cv, y_test_no_sw_transformed_cv = (
    cv_no_sw.transform(texts) for texts in (X_test_no_sw, y_train_no_sw, y_test_no_sw)
)

In [60]:
# Checking values in the second element (row 1) that are non-zero.
# Only that one row is densified: calling .toarray() on the whole sparse matrix
# would materialize every row just to read a single one.
row_dense = X_train_no_sw_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[2826, 8540, 10245, 11176, 11279, 11620]


['dawg', 'please', 'sister', 'tell', 'thing', 'try']

In [61]:
# Non-zero vocabulary entries of row 1 of the no-stop-word test comments.
# Densify only that row rather than the whole sparse matrix.
row_dense = X_test_no_sw_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[7987, 8584, 8832, 10484, 10712, 12196]


['order', 'police', 'promise', 'sorry', 'stay', 'way']

In [62]:
# Non-zero vocabulary entries of row 1 of the no-stop-word training replies.
# Densify only that row rather than the whole sparse matrix.
row_dense = y_train_no_sw_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[1566, 2559, 10816]


['business', 'course', 'strictly']

In [63]:
# Non-zero vocabulary entries of row 1 of the no-stop-word test replies.
# Densify only that row rather than the whole sparse matrix.
row_dense = y_test_no_sw_transformed_cv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = cv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
[vocab_terms[i] for i in nonzero_cols]

[10098]


['shit']

In [67]:
X_train_no_sw_transformed_cv

<143946x12601 sparse matrix of type '<class 'numpy.int64'>'
	with 613541 stored elements in Compressed Sparse Row format>

In [68]:
X_test_no_sw_transformed_cv

<47982x12601 sparse matrix of type '<class 'numpy.int64'>'
	with 201388 stored elements in Compressed Sparse Row format>

In [69]:
y_train_no_sw_transformed_cv

<143946x12601 sparse matrix of type '<class 'numpy.int64'>'
	with 618979 stored elements in Compressed Sparse Row format>

In [70]:
y_test_no_sw_transformed_cv

<47982x12601 sparse matrix of type '<class 'numpy.int64'>'
	with 204833 stored elements in Compressed Sparse Row format>

In [71]:
# Map each column index of the no-stop-word count matrices to its word.
# (scikit-learn 1.0 deprecated get_feature_names() in favor of
# get_feature_names_out(); 1.2 removed it.)
cv_no_sw_vocab_dict = dict(enumerate(cv_no_sw.get_feature_names()))
print("cv_no_sw_vocab_size:", len(cv_no_sw_vocab_dict))

cv_no_sw_vocab_size: 12601


The 4 sparse matrices above now contain the word vectors generated by the Count Vectorizer for the data without stop-words. Now, we can try out a different vectorizer, TF-IDF.

In [64]:
# Initializing a TfidfVectorizer for each dataset: tv for the data with
# stop-words and tv_no_sw for the data without; min_df=3 again filters out
# infrequent words (words that appear in fewer than 3 documents)
tv = TfidfVectorizer(min_df=3)
tv_no_sw = TfidfVectorizer(min_df=3)

In [66]:
# Fit each TF-IDF vectorizer on its training comments only (this builds the
# vocabulary), then encode the remaining splits against that fitted vocabulary.
X_train_transformed_tv = tv.fit_transform(X_train)
X_test_transformed_tv, y_train_transformed_tv, y_test_transformed_tv = (
    tv.transform(texts) for texts in (X_test, y_train, y_test)
)

# Same procedure for the data without stop-words.
X_train_no_sw_transformed_tv = tv_no_sw.fit_transform(X_train_no_sw)
X_test_no_sw_transformed_tv, y_train_no_sw_transformed_tv, y_test_no_sw_transformed_tv = (
    tv_no_sw.transform(texts) for texts in (X_test_no_sw, y_train_no_sw, y_test_no_sw)
)

In [72]:
# Map each column index of the TF-IDF matrices to the word it stands for.
# (scikit-learn 1.0 deprecated get_feature_names() in favor of
# get_feature_names_out(); 1.2 removed it.)
tv_vocab_dict = dict(enumerate(tv.get_feature_names()))
tv_no_sw_vocab_dict = dict(enumerate(tv_no_sw.get_feature_names()))
print("tv_vocab_size:", len(tv_vocab_dict))
print("tv_no_sw_vocab_size:", len(tv_no_sw_vocab_dict))

tv_vocab_size: 12585
tv_no_sw_vocab_size: 12601


In [75]:
# Checking values in the second element (row 1) that are non-zero: column
# indices, then their TF-IDF weights, then the matching words.
# Only that one row is densified: calling .toarray() on the whole sparse matrix
# would materialize every row just to read a single one.
row_dense = X_train_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[2826, 7053, 7526, 7776, 8522, 10217, 11133, 11246, 11360, 11592, 12382, 12532]
[0.5948923085462593, 0.18067779144282037, 0.20411788411116605, 0.13882057020867816, 0.31070970016015215, 0.38370669101748445, 0.23511199364941004, 0.26025882622475827, 0.1341573911723825, 0.2882458664421002, 0.2106319929013772, 0.2007238036036941]


['dawg',
 'me',
 'my',
 'not',
 'please',
 'sister',
 'tell',
 'thing',
 'to',
 'try',
 'with',
 'your']

In [76]:
# Non-zero entries of row 1 of the test comments: indices, weights, words.
# Densify only that row rather than the whole sparse matrix.
row_dense = X_test_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[7881, 7975, 8023, 8568, 8811, 10455, 10677, 11212, 12172, 12330]
[0.19046093805590988, 0.41205386534971333, 0.2484796884060098, 0.4077250012568682, 0.4134530444387653, 0.3233196045891979, 0.35465283849954415, 0.14529664807666048, 0.30482794893618875, 0.22509019082763915]


['of',
 'order',
 'out',
 'police',
 'promise',
 'sorry',
 'stay',
 'the',
 'way',
 'will']

In [79]:
# Non-zero entries of row 1 of the training replies: indices, weights, words.
# Densify only that row rather than the whole sparse matrix.
row_dense = y_train_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[1567, 2566, 7776, 10779]
[0.48477414136750346, 0.4544823231782565, 0.19473167408669412, 0.7214703215568109]


['business', 'course', 'not', 'strictly']

In [80]:
# Non-zero entries of row 1 of the test replies: indices, weights, words.
# Densify only that row rather than the whole sparse matrix.
row_dense = y_test_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[10072]
[1.0]


['shit']

In [81]:
# Non-zero entries of row 1 of the no-stop-word training comments: indices,
# weights, words. Densify only that row rather than the whole sparse matrix.
row_dense = X_train_no_sw_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[2826, 8540, 10245, 11176, 11279, 11620]
[0.6623763522672441, 0.34601038188394034, 0.42701909334369953, 0.2655398027211868, 0.2897823513324258, 0.32357978851071306]


['dawg', 'please', 'sister', 'tell', 'thing', 'try']

In [82]:
# Non-zero entries of row 1 of the no-stop-word test comments: indices,
# weights, words. Densify only that row rather than the whole sparse matrix.
row_dense = X_test_no_sw_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[7987, 8584, 8832, 10484, 10712, 12196]
[0.45218986287527785, 0.4472514873360692, 0.4539243490795595, 0.3546634952275756, 0.3896761299154679, 0.3343791848011361]


['order', 'police', 'promise', 'sorry', 'stay', 'way']

In [83]:
# Non-zero entries of row 1 of the no-stop-word training replies: indices,
# weights, words. Densify only that row rather than the whole sparse matrix.
row_dense = y_train_no_sw_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[1566, 2559, 10816]
[0.49423551334134924, 0.463352487545929, 0.7355513099962588]


['business', 'course', 'strictly']

In [84]:
# Non-zero entries of row 1 of the no-stop-word test replies: indices,
# weights, words. Densify only that row rather than the whole sparse matrix.
row_dense = y_test_no_sw_transformed_tv[1].toarray().ravel()
nonzero_cols = np.flatnonzero(row_dense > 0).tolist()
vocab_terms = tv_no_sw.get_feature_names()  # fetched once instead of once per index
print(nonzero_cols)
print(row_dense[nonzero_cols].tolist())
[vocab_terms[i] for i in nonzero_cols]

[10098]
[1.0]


['shit']

In [85]:
X_train_transformed_tv

<143946x12585 sparse matrix of type '<class 'numpy.float64'>'
	with 1240887 stored elements in Compressed Sparse Row format>

In [86]:
X_test_transformed_tv

<47982x12585 sparse matrix of type '<class 'numpy.float64'>'
	with 409304 stored elements in Compressed Sparse Row format>

In [87]:
y_train_transformed_tv

<143946x12585 sparse matrix of type '<class 'numpy.float64'>'
	with 1251188 stored elements in Compressed Sparse Row format>

In [88]:
y_test_transformed_tv

<47982x12585 sparse matrix of type '<class 'numpy.float64'>'
	with 413741 stored elements in Compressed Sparse Row format>

In [89]:
X_train_no_sw_transformed_tv

<143946x12601 sparse matrix of type '<class 'numpy.float64'>'
	with 613541 stored elements in Compressed Sparse Row format>

In [90]:
X_test_no_sw_transformed_tv

<47982x12601 sparse matrix of type '<class 'numpy.float64'>'
	with 201388 stored elements in Compressed Sparse Row format>

In [91]:
y_train_no_sw_transformed_tv

<143946x12601 sparse matrix of type '<class 'numpy.float64'>'
	with 618979 stored elements in Compressed Sparse Row format>

In [92]:
y_test_no_sw_transformed_tv

<47982x12601 sparse matrix of type '<class 'numpy.float64'>'
	with 204833 stored elements in Compressed Sparse Row format>

The 8 sparse matrices above contain the word vectors generated by the TF-IDF Vectorizer for the data with and without stop-words.

Another approach to vectorizing, beyond counting words and weighting their frequencies, is Word2Vec, a shallow neural-network model for learning word embeddings that was developed at Google. For Word2Vec, we'll use gensim, using the documentation as a guide (https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html).

In [93]:
# Following the gensim tutorial's example of training a model on our own dataset
import gensim.models

# Word2Vec is given tokenized sentences, so each training comment is split on
# whitespace into a list of words first.
sentences_train = list(map(str.split, X_train))
word2vec_model = gensim.models.Word2Vec(sentences=sentences_train, vector_size=300)

In [94]:
# Same procedure as for word2vec_model, but fed from X_train_no_sw.
# NOTE(review): assumes every element of X_train_no_sw is a str -- TODO confirm.
sentences_train_no_sw = [comment.split() for comment in X_train_no_sw]
# Train 300-dimensional embeddings; every other hyperparameter is left at gensim's default.
word2vec_model_no_sw = gensim.models.Word2Vec(sentences=sentences_train_no_sw, vector_size=300)

The gensim documentation provides a really nice example of visualizing word embeddings by reducing the dimensionality of the word vectors to 2 dimensions using t-SNE.

In [95]:
# Visualizing word embeddings via plotly and matplotlib and the TSNE algorithm
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling

def reduce_dimensions(model):
    """Project a model's word vectors onto two dimensions with t-SNE.

    Parameters
    ----------
    model
        Object exposing ``wv.vectors`` (the embedding matrix) and
        ``wv.index_to_key`` (the vocabulary).

    Returns
    -------
    tuple
        ``(x_vals, y_vals, labels)`` where ``x_vals`` and ``y_vals`` are lists
        holding the first and second t-SNE coordinate of every vector, and
        ``labels`` is the numpy array built from ``model.wv.index_to_key``.
    """
    target_dims = 2  # the (x, y) return shape below is tied to a 2-D embedding

    # Pull the embedding matrix and the vocabulary out of the model as numpy arrays.
    embedding_matrix = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    # random_state is pinned to 0, as in the original call.
    projector = TSNE(n_components=target_dims, random_state=0)
    projected = projector.fit_transform(embedding_matrix)

    # Split the projected array into one list per plotting axis.
    x_vals = list(projected[:, 0])
    y_vals = list(projected[:, 1])
    return x_vals, y_vals, labels

# Compute the 2-D t-SNE coordinates for word2vec_model; they are plotted at the end of this cell.
x_vals, y_vals, labels = reduce_dimensions(word2vec_model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the embedding as a plotly text scatter (each point is shown as its label).

    Parameters
    ----------
    x_vals, y_vals
        Coordinates of the points to draw.
    labels
        Text displayed at each coordinate.
    plot_in_notebook : bool, default True
        When true, initialise plotly's notebook mode and render inline via
        ``iplot``; otherwise call ``plot`` with the filename
        ``word-embedding-plot.html``.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')

def plot_with_matplotlib(x_vals, y_vals, labels, num_labels=25):
    """Scatter-plot the embedding with matplotlib and annotate a random subset of points.

    Parameters
    ----------
    x_vals, y_vals
        Coordinates of the points to draw.
    labels
        Label for each point, indexed in parallel with ``x_vals`` / ``y_vals``.
    num_labels : int, default 25
        Maximum number of points to annotate. The value is clamped to
        ``len(labels)`` so small vocabularies no longer raise ``ValueError``
        from ``random.sample``.
    """
    import matplotlib.pyplot as plt
    import random

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # A private generator seeded with 0 picks the same points as
    # random.seed(0) + random.sample did, without reseeding the global RNG.
    rng = random.Random(0)

    # Never ask for more samples than there are points (or fewer than zero).
    sample_size = max(0, min(num_labels, len(labels)))

    #
    # Label a randomly subsampled set of data points
    #
    for i in rng.sample(range(len(labels)), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Pick the plotting backend: if calling get_ipython() raises (the name is not
# available when this code runs outside an IPython/Jupyter session), fall back
# to the static matplotlib plot; otherwise use the interactive plotly plot.
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

# Plot the 2-D projection computed above for word2vec_model.
print('This is a plot for word2vec_model')
plot_function(x_vals, y_vals, labels)

This is a plot for word2vec_model


In [96]:
# Repeat the t-SNE projection and plot for word2vec_model_no_sw,
# reusing the plot_function backend selected in the previous cell.
x_vals_no_sw, y_vals_no_sw, labels_no_sw = reduce_dimensions(word2vec_model_no_sw)
print('This is a plot for word2vec_model_no_sw')
plot_function(x_vals_no_sw, y_vals_no_sw, labels_no_sw)

This is a plot for word2vec_model_no_sw


In [97]:
print(word2vec_model)

Word2Vec(vocab=8886, vector_size=300, alpha=0.025)


In [98]:
print(word2vec_model_no_sw)

Word2Vec(vocab=8844, vector_size=300, alpha=0.025)


With this, we have three different vectorizations of the mini_20k set: Count Vectorization and TF-IDF Vectorization using sklearn, and Word2Vec using gensim. To export the sparse matrices produced by the Count Vectorizer and TF-IDF Vectorizer, we'll use scipy's sparse.save_npz function.

In [99]:
from scipy.sparse import save_npz

In [100]:
# Map each output file name (relative to processed_files_dir) to the sparse
# matrix written to it. Insertion order matches the original save order.
sparse_exports = {
    # Count Vectorizer
    "/count_vec_X_train.npz": X_train_transformed_cv,
    "/count_vec_X_test.npz": X_test_transformed_cv,
    "/count_vec_y_train.npz": y_train_transformed_cv,
    "/count_vec_y_test.npz": y_test_transformed_cv,
    # Count Vectorizer, no_sw variants
    "/count_vec_no_sw_X_train.npz": X_train_no_sw_transformed_cv,
    "/count_vec_no_sw_X_test.npz": X_test_no_sw_transformed_cv,
    "/count_vec_no_sw_y_train.npz": y_train_no_sw_transformed_cv,
    "/count_vec_no_sw_y_test.npz": y_test_no_sw_transformed_cv,
    # TF-IDF
    "/tfidf_X_train.npz": X_train_transformed_tv,
    "/tfidf_X_test.npz": X_test_transformed_tv,
    "/tfidf_y_train.npz": y_train_transformed_tv,
    "/tfidf_y_test.npz": y_test_transformed_tv,
    # TF-IDF, no_sw variants
    "/tfidf_no_sw_X_train.npz": X_train_no_sw_transformed_tv,
    "/tfidf_no_sw_X_test.npz": X_test_no_sw_transformed_tv,
    "/tfidf_no_sw_y_train.npz": y_train_no_sw_transformed_tv,
    "/tfidf_no_sw_y_test.npz": y_test_no_sw_transformed_tv,
}

for npz_name, sparse_matrix in sparse_exports.items():
    save_npz(processed_files_dir + npz_name, sparse_matrix)

In [105]:
# Wrap each vocabulary dictionary in a one-column ('vocabulary') DataFrame
# so it can be pickled with pandas in the next cell.
cv_vocab_df, cv_vocab_no_sw_df, tv_vocab_df, tv_vocab_no_sw_df = (
    pd.DataFrame({'vocabulary': vocab_dict})
    for vocab_dict in (
        cv_vocab_dict,
        cv_no_sw_vocab_dict,
        tv_vocab_dict,
        tv_no_sw_vocab_dict,
    )
)

In [106]:
# Write every vocabulary DataFrame to processed_files_dir as an xz-compressed pickle.
vocab_exports = (
    (cv_vocab_df, '/count_vec_vocab.pkl'),
    (cv_vocab_no_sw_df, '/count_vec_vocab_no_sw.pkl'),
    (tv_vocab_df, '/tfidf_vocab.pkl'),
    (tv_vocab_no_sw_df, '/tfidf_vocab_no_sw.pkl'),
)

for vocab_frame, pickle_name in vocab_exports:
    vocab_frame.to_pickle(processed_files_dir + pickle_name, compression='xz')

In [107]:
# Persist both trained Word2Vec models next to the other processed files.
for trained_model, model_name in (
    (word2vec_model, '/word2vec_model.model'),
    (word2vec_model_no_sw, '/word2vec_model_no_sw.model'),
):
    trained_model.save(processed_files_dir + model_name)