# Preprocessing Data

To preprocess the dataset for our movie dialogue pairs, we could run the Count Vectorizer and TF-IDF from sklearn.

In [1]:
# Importing Dependencies for TF-IDF
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
file = "processed_movie_dialogue_pairs.csv"

In [3]:
df = pd.read_csv(file)
df

Unnamed: 0.1,Unnamed: 0,Comment,Reply,Expanded Comment,Expanded Reply,Comment Length,Reply Length,Tokenized Comment,Tokenized Reply,Tokenized Comment_no_sw,...,Sentiment Comment,Sentiment Reply,Sentiment Comment_no_sw,Sentiment Reply_no_sw,Sentiment Comment Compound Score,Sentiment Reply Compound Score,Sentiment Comment_no_sw Compound Score,Sentiment Reply_no_sw Compound Score,Sentiment Score Overall,Sentiment Score_no_sw Overall
0,0,Can we make this quick? Roxanne Korrine and An...,"Well, I thought we'd start with pronunciation,...",can we make this quick roxanne korrine and and...,well i thought we would start with pronunciati...,21,14,"['can', 'we', 'make', 'this', 'quick', 'roxann...","['well', 'i', 'thought', 'we', 'would', 'start...","['make', 'quick', 'roxanne', 'korrine', 'andre...",...,"{'neg': 0.17, 'neu': 0.83, 'pos': 0.0, 'compou...","{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...","{'neg': 0.29, 'neu': 0.71, 'pos': 0.0, 'compou...","{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound...",-0.6240,0.4588,-0.6240,0.4588,-0.1652,-0.1652
1,1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....,well i thought we would start with pronunciati...,not the hacking and gagging and spitting part ...,14,9,"['well', 'i', 'thought', 'we', 'would', 'start...","['not', 'the', 'hacking', 'and', 'gagging', 'a...","['well', 'thought', 'would', 'start', 'pronunc...",...,"{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...","{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...","{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound...","{'neg': 0.312, 'neu': 0.39, 'pos': 0.299, 'com...",0.4588,0.3182,0.4588,-0.0258,0.7770,0.4330
2,2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,not the hacking and gagging and spitting part ...,okay then how about we try out some french cui...,9,12,"['not', 'the', 'hacking', 'and', 'gagging', 'a...","['okay', 'then', 'how', 'about', 'we', 'try', ...","['hacking', 'gagging', 'spitting', 'part', 'pl...",...,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...","{'neg': 0.0, 'neu': 0.853, 'pos': 0.147, 'comp...","{'neg': 0.312, 'neu': 0.39, 'pos': 0.299, 'com...","{'neg': 0.0, 'neu': 0.725, 'pos': 0.275, 'comp...",0.3182,0.2263,-0.0258,0.2263,0.5445,0.2005
3,3,You're asking me out. That's so cute. What's y...,Forget it.,you are asking me out that is so cute what is ...,forget it,14,2,"['you', 'are', 'asking', 'me', 'out', 'that', ...","['forget', 'it']","['asking', 'cute', 'name']",...,"{'neg': 0.0, 'neu': 0.771, 'pos': 0.229, 'comp...","{'neg': 0.655, 'neu': 0.345, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound...","{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound...",0.5949,-0.2263,0.4588,-0.2263,0.3686,0.2325
4,4,"No, no, it's my fault - we didn't have a prope...",Cameron.,no no it is my fault we did not have a proper ...,cameron,16,1,"['no', 'no', 'it', 'is', 'my', 'fault', 'we', ...",['cameron'],"['fault', 'proper', 'introduction']",...,"{'neg': 0.441, 'neu': 0.559, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.574, 'neu': 0.426, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",-0.7269,0.0000,-0.4019,0.0000,-0.7269,-0.4019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193166,214669,"Your orders, Mr Vereker?",I'm to take the Sikali with the main column to...,your orders mr vereker,i am to take the sikali with the main column t...,4,13,"['your', 'orders', 'mr', 'vereker']","['i', 'am', 'to', 'take', 'the', 'sikali', 'wi...","['orders', 'mr', 'vereker']",...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
193167,214670,I'm to take the Sikali with the main column to...,Lord Chelmsford seems to want me to stay back ...,i am to take the sikali with the main column t...,lord chelmsford seems to want me to stay back ...,13,12,"['i', 'am', 'to', 'take', 'the', 'sikali', 'wi...","['lord', 'chelmsford', 'seems', 'to', 'want', ...","['take', 'sikali', 'main', 'column', 'river']",...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.894, 'pos': 0.106, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...",0.0000,0.0772,0.0000,0.0772,0.0772,0.0772
193168,214671,Lord Chelmsford seems to want me to stay back ...,I think Chelmsford wants a good man on the bor...,lord chelmsford seems to want me to stay back ...,i think chelmsford wants a good man on the bor...,12,23,"['lord', 'chelmsford', 'seems', 'to', 'want', ...","['i', 'think', 'chelmsford', 'wants', 'a', 'go...","['lord', 'chelmsford', 'seems', 'want', 'stay'...",...,"{'neg': 0.0, 'neu': 0.894, 'pos': 0.106, 'comp...","{'neg': 0.247, 'neu': 0.588, 'pos': 0.165, 'co...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...","{'neg': 0.323, 'neu': 0.462, 'pos': 0.215, 'co...",0.0772,-0.4767,0.0772,-0.4767,-0.3995,-0.3995
193169,214672,"Well I assure you, Sir, I have no desire to cr...","And I assure you, you do not In fact I'd be ob...",well i assure you sir i have no desire to crea...,and i assure you you do not in fact i would be...,13,22,"['well', 'i', 'assure', 'you', 'sir', 'i', 'ha...","['and', 'i', 'assure', 'you', 'you', 'do', 'no...","['well', 'assure', 'sir', 'desire', 'create', ...",...,"{'neg': 0.243, 'neu': 0.265, 'pos': 0.492, 'co...","{'neg': 0.0, 'neu': 0.732, 'pos': 0.268, 'comp...","{'neg': 0.175, 'neu': 0.146, 'pos': 0.679, 'co...","{'neg': 0.0, 'neu': 0.476, 'pos': 0.524, 'comp...",0.5719,0.7650,0.7096,0.7650,1.3369,1.4746


Looking at our DataFrame, aside from needing to drop the extra 'Unnamed: 0' column, it would seem that we have to convert our processed lists of strings back into ordinary list of strings.

In [4]:
df = df.drop(['Unnamed: 0'], axis=1)

In [5]:
df.columns

Index(['Comment', 'Reply', 'Expanded Comment', 'Expanded Reply',
       'Comment Length', 'Reply Length', 'Tokenized Comment',
       'Tokenized Reply', 'Tokenized Comment_no_sw', 'Tokenized Reply_no_sw',
       'Lemmatized Comment', 'Lemmatized Reply', 'Lemmatized Comment_no_sw',
       'Lemmatized Reply_no_sw', 'Lem Comment Length', 'Lem Reply Length',
       'Lem Comment_no_sw Length', 'Lem Reply_no_sw Length',
       'Sentiment Comment', 'Sentiment Reply', 'Sentiment Comment_no_sw',
       'Sentiment Reply_no_sw', 'Sentiment Comment Compound Score',
       'Sentiment Reply Compound Score',
       'Sentiment Comment_no_sw Compound Score',
       'Sentiment Reply_no_sw Compound Score', 'Sentiment Score Overall',
       'Sentiment Score_no_sw Overall'],
      dtype='object')

In [6]:
columns_to_fix = ['Tokenized Comment','Tokenized Reply',
                  'Tokenized Comment_no_sw','Tokenized Reply_no_sw',
                  'Lemmatized Comment','Lemmatized Reply',
                  'Lemmatized Comment_no_sw','Lemmatized Reply_no_sw']

for column in columns_to_fix:
    df[column] = df[column].apply(lambda x: eval(' '.join(x.split())))

While we do have a decently sized dataset (~200k), we'll limit our dataset down to 20k. If it works, we can always scale it back up. It doesn't have to be perfect right now, we just want to see if everything is working properly. So we could define this new subset as mini_20k.

In [7]:
mini_20k = df[:20000]

Here, we'll build training and testing sets for lemmatized comments and replies with and without stop-words from mini_5k. Then we'll start with the CountVectorizer from sklearn.

In [8]:
# Generating training and testing data with sklearn's train_test_split() function
# Unfortunately, sklearn's text vectorizers expects an array of strings rather than tokens
# So in this case, we'll join the them back together for our train/test splitting
X_train, X_test, y_train, y_test = train_test_split(mini_20k['Lemmatized Comment'].str.join(' '),
                                                    mini_20k['Lemmatized Reply'].str.join(' '),
                                                    test_size=0.2, random_state=7)
X_train_no_sw, X_test_no_sw, y_train_no_sw, y_test_no_sw = train_test_split(mini_20k['Lemmatized Comment_no_sw'].str.join(' '),
                                                                            mini_20k['Lemmatized Reply_no_sw'].str.join(' '),
                                                                            test_size=0.2, random_state=7)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(X_train_no_sw.shape, X_test_no_sw.shape, y_train_no_sw.shape, y_test_no_sw.shape)

(16000,) (4000,) (16000,) (4000,)
(16000,) (4000,) (16000,) (4000,)


Having split up our data into training and test sets, we'll start fitting our vectorizers.

In [10]:
# Initializing CountVectorizer for each dataset, cv for the those with stop-words
# and cv_no_sw for those without; we'll also filter out infrequent words using 
# min_df=0.0005 which will eliminate words used less than 0.005% of utterances in our sets
cv = CountVectorizer(min_df=0.0005)
cv_no_sw = CountVectorizer(min_df=0.0005)

In [11]:
# Fitting and transforming vectorizer to training data and only tranforming the testing data
cv_transformed_train = cv.fit_transform(X_train)
cv_transformed_test = cv.transform(X_test)
cv_transformed_train_no_sw = cv_no_sw.fit_transform(X_train_no_sw)
cv_transformed_test_no_sw = cv_no_sw.transform(X_test_no_sw)

In [12]:
# Placing vectors into the DataFrame for the training data
cv_transformed_train_df = pd.DataFrame(cv_transformed_train.toarray(),
                                       columns=cv.get_feature_names()).add_prefix('CV_')
cv_transformed_train_df_no_sw = pd.DataFrame(cv_transformed_train_no_sw.toarray(),
                                       columns=cv_no_sw.get_feature_names()).add_prefix('CV_')

In [13]:
cv_train_df = pd.concat([mini_20k['Lemmatized Comment'],cv_transformed_train_df], axis=1, sort=False)
cv_train_df_no_sw = pd.concat([mini_20k['Lemmatized Comment_no_sw'],cv_transformed_train_df_no_sw], axis=1, sort=False)

In [14]:
cv_train_df.head()

Unnamed: 0,Lemmatized Comment,CV_able,CV_about,CV_above,CV_absolutely,CV_accept,CV_accident,CV_accord,CV_account,CV_across,...,CV_yknow,CV_yo,CV_york,CV_you,CV_young,CV_your,CV_youre,CV_yours,CV_yourself,CV_zoo
0,"[can, we, make, this, quick, roxanne, korrine,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[well, i, think, we, would, start, with, pronu...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[not, the, hacking, and, gagging, and, spit, p...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[you, be, ask, me, out, that, be, so, cute, wh...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[no, no, it, be, my, fault, we, do, not, have,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
cv_transformed_train_df.iloc[0].sort_values(ascending=False)

CV_he          1
CV_be          1
CV_that        1
CV_you         1
CV_see         1
              ..
CV_please      0
CV_pleasure    0
CV_plenty      0
CV_plus        0
CV_able        0
Name: 0, Length: 1402, dtype: int64

In [16]:
cv_train_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,CV_able,CV_absolutely,CV_accept,CV_accident,CV_accord,CV_account,CV_across,CV_act,CV_action,...,CV_year,CV_yes,CV_yesterday,CV_yet,CV_yknow,CV_yo,CV_york,CV_young,CV_youre,CV_zoo
0,"[make, quick, roxanne, korrine, andrew, barret...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[well, thought, would, start, pronunciation, o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[hack, gag, spit, part, please]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[ask, cute, name]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[fault, proper, introduction]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
cv_transformed_train_df_no_sw.iloc[0].sort_values(ascending=False)

CV_excite    1
CV_see       1
CV_zoo       0
CV_field     0
CV_fifty     0
            ..
CV_place     0
CV_plan      0
CV_plane     0
CV_play      0
CV_able      0
Name: 0, Length: 1313, dtype: int64

In [18]:
# Placing vectors into the DataFrame for the testing data
cv_transformed_test_df = pd.DataFrame(cv_transformed_test.toarray(),
                                       columns=cv.get_feature_names()).add_prefix('CV_')
cv_transformed_test_df_no_sw = pd.DataFrame(cv_transformed_test_no_sw.toarray(),
                                       columns=cv_no_sw.get_feature_names()).add_prefix('CV_')

In [19]:
cv_test_df = pd.concat([mini_20k['Lemmatized Comment'],cv_transformed_test_df], axis=1, sort=False)
cv_test_df_no_sw = pd.concat([mini_20k['Lemmatized Comment_no_sw'],cv_transformed_test_df_no_sw], axis=1, sort=False)

In [20]:
cv_test_df.head()

Unnamed: 0,Lemmatized Comment,CV_able,CV_about,CV_above,CV_absolutely,CV_accept,CV_accident,CV_accord,CV_account,CV_across,...,CV_yknow,CV_yo,CV_york,CV_you,CV_young,CV_your,CV_youre,CV_yours,CV_yourself,CV_zoo
0,"[can, we, make, this, quick, roxanne, korrine,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[well, i, think, we, would, start, with, pronu...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[not, the, hacking, and, gagging, and, spit, p...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[you, be, ask, me, out, that, be, so, cute, wh...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[no, no, it, be, my, fault, we, do, not, have,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
cv_transformed_test_df.iloc[0].sort_values(ascending=False)

CV_stand       1
CV_up          1
CV_zoo         0
CV_fish        0
CV_finally     0
              ..
CV_pleasure    0
CV_plenty      0
CV_plus        0
CV_point       0
CV_able        0
Name: 0, Length: 1402, dtype: int64

In [22]:
cv_test_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,CV_able,CV_absolutely,CV_accept,CV_accident,CV_accord,CV_account,CV_across,CV_act,CV_action,...,CV_year,CV_yes,CV_yesterday,CV_yet,CV_yknow,CV_yo,CV_york,CV_young,CV_youre,CV_zoo
0,"[make, quick, roxanne, korrine, andrew, barret...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[well, thought, would, start, pronunciation, o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[hack, gag, spit, part, please]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[ask, cute, name]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[fault, proper, introduction]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
cv_transformed_test_df_no_sw.iloc[0].sort_values(ascending=False)

CV_stand     1
CV_zoo       0
CV_fifty     0
CV_fight     0
CV_figure    0
            ..
CV_place     0
CV_plan      0
CV_plane     0
CV_play      0
CV_able      0
Name: 0, Length: 1313, dtype: int64

Here, we have our word vectors stored in four DataFrames, cv_train_df, cv_train_df_no_sw, cv_test_df, and cv_test_df_no_sw. Now that we have a taste of word vectoring from the Count Vectorizer, we'll have a try on TF-IDF next.

In [24]:
# Initializing TF-IDF Vectorizer for each dataset, tv for the those with stop-words and
# tv_no_sw for those without
tv = TfidfVectorizer(min_df=0.0005)
tv_no_sw = TfidfVectorizer(min_df=0.0005)

In [25]:
# Fitting and transforming vectorizer to training data and only tranforming the testing data
tv_transformed_train = tv.fit_transform(X_train)
tv_transformed_test = tv.transform(X_test)
tv_transformed_train_no_sw = tv_no_sw.fit_transform(X_train_no_sw)
tv_transformed_test_no_sw = tv_no_sw.transform(X_test_no_sw)

In [26]:
# Placing vectors into the DataFrame for the training data
tv_transformed_train_df = pd.DataFrame(tv_transformed_train.toarray(),
                                       columns=tv.get_feature_names()).add_prefix('TFIDF_')
tv_transformed_train_df_no_sw = pd.DataFrame(tv_transformed_train_no_sw.toarray(),
                                       columns=tv_no_sw.get_feature_names()).add_prefix('TFIDF_')

In [27]:
tv_train_df = pd.concat([mini_20k['Lemmatized Comment'],tv_transformed_train_df], axis=1, sort=False)
tv_train_df_no_sw = pd.concat([mini_20k['Lemmatized Comment_no_sw'],tv_transformed_train_df_no_sw], axis=1, sort=False)

In [28]:
tv_train_df.head()

Unnamed: 0,Lemmatized Comment,TFIDF_able,TFIDF_about,TFIDF_above,TFIDF_absolutely,TFIDF_accept,TFIDF_accident,TFIDF_accord,TFIDF_account,TFIDF_across,...,TFIDF_yknow,TFIDF_yo,TFIDF_york,TFIDF_you,TFIDF_young,TFIDF_your,TFIDF_youre,TFIDF_yours,TFIDF_yourself,TFIDF_zoo
0,"[can, we, make, this, quick, roxanne, korrine,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.248898,0.0,0.0,0.0,0.0,0.0,0.0
1,"[well, i, think, we, would, start, with, pronu...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[not, the, hacking, and, gagging, and, spit, p...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[you, be, ask, me, out, that, be, so, cute, wh...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.197574,0.0,0.0,0.0,0.0,0.0,0.0
4,"[no, no, it, be, my, fault, we, do, not, have,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
tv_transformed_train_df.iloc[0].sort_values(ascending=False)

TFIDF_see         0.558334
TFIDF_he          0.465713
TFIDF_that        0.383615
TFIDF_not         0.333207
TFIDF_to          0.317543
                    ...   
TFIDF_please      0.000000
TFIDF_pleasure    0.000000
TFIDF_plenty      0.000000
TFIDF_plus        0.000000
TFIDF_able        0.000000
Name: 0, Length: 1402, dtype: float64

In [30]:
tv_train_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,TFIDF_able,TFIDF_absolutely,TFIDF_accept,TFIDF_accident,TFIDF_accord,TFIDF_account,TFIDF_across,TFIDF_act,TFIDF_action,...,TFIDF_year,TFIDF_yes,TFIDF_yesterday,TFIDF_yet,TFIDF_yknow,TFIDF_yo,TFIDF_york,TFIDF_young,TFIDF_youre,TFIDF_zoo
0,"[make, quick, roxanne, korrine, andrew, barret...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[well, thought, would, start, pronunciation, o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[hack, gag, spit, part, please]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[ask, cute, name]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"[fault, proper, introduction]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
tv_transformed_train_df_no_sw.iloc[0].sort_values(ascending=False)

TFIDF_excite    0.872634
TFIDF_see       0.488375
TFIDF_zoo       0.000000
TFIDF_field     0.000000
TFIDF_fifty     0.000000
                  ...   
TFIDF_place     0.000000
TFIDF_plan      0.000000
TFIDF_plane     0.000000
TFIDF_play      0.000000
TFIDF_able      0.000000
Name: 0, Length: 1313, dtype: float64

In [32]:
# Placing vectors into the DataFrame for the testing data
tv_transformed_test_df = pd.DataFrame(tv_transformed_test.toarray(),
                                       columns=tv.get_feature_names()).add_prefix('TFIDF_')
tv_transformed_test_df_no_sw = pd.DataFrame(tv_transformed_test_no_sw.toarray(),
                                       columns=tv_no_sw.get_feature_names()).add_prefix('TFIDF_')

In [33]:
tv_test_df = pd.concat([mini_20k['Lemmatized Reply'],tv_transformed_test_df], axis=1, sort=False)
tv_test_df_no_sw = pd.concat([mini_20k['Lemmatized Reply_no_sw'],tv_transformed_test_df_no_sw], axis=1, sort=False)

In [34]:
tv_test_df.head()

Unnamed: 0,Lemmatized Reply,TFIDF_able,TFIDF_about,TFIDF_above,TFIDF_absolutely,TFIDF_accept,TFIDF_accident,TFIDF_accord,TFIDF_account,TFIDF_across,...,TFIDF_yknow,TFIDF_yo,TFIDF_york,TFIDF_you,TFIDF_young,TFIDF_your,TFIDF_youre,TFIDF_yours,TFIDF_yourself,TFIDF_zoo
0,"[well, i, think, we, would, start, with, pronu...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[not, the, hacking, and, gagging, and, spit, p...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.113467,0.0,0.0,0.0,0.0,0.0,0.0
2,"[okay, then, how, about, we, try, out, some, f...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"[forget, it]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.147799,0.0,0.0,0.0,0.0,0.0,0.0
4,[cameron],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
tv_transformed_test_df.iloc[0].sort_values(ascending=False)

TFIDF_stand       0.844140
TFIDF_up          0.536123
TFIDF_zoo         0.000000
TFIDF_fish        0.000000
TFIDF_finally     0.000000
                    ...   
TFIDF_pleasure    0.000000
TFIDF_plenty      0.000000
TFIDF_plus        0.000000
TFIDF_point       0.000000
TFIDF_able        0.000000
Name: 0, Length: 1402, dtype: float64

In [36]:
tv_test_df_no_sw.head()

Unnamed: 0,Lemmatized Reply_no_sw,TFIDF_able,TFIDF_absolutely,TFIDF_accept,TFIDF_accident,TFIDF_accord,TFIDF_account,TFIDF_across,TFIDF_act,TFIDF_action,...,TFIDF_year,TFIDF_yes,TFIDF_yesterday,TFIDF_yet,TFIDF_yknow,TFIDF_yo,TFIDF_york,TFIDF_young,TFIDF_youre,TFIDF_zoo
0,"[well, thought, would, start, pronunciation, o...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"[hack, gag, spit, part, please]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"[okay, try, french, cuisine, saturday, night]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,[forget],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,[cameron],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
tv_transformed_test_df_no_sw.iloc[0].sort_values(ascending=False)

TFIDF_stand     1.0
TFIDF_zoo       0.0
TFIDF_fifty     0.0
TFIDF_fight     0.0
TFIDF_figure    0.0
               ... 
TFIDF_place     0.0
TFIDF_plan      0.0
TFIDF_plane     0.0
TFIDF_play      0.0
TFIDF_able      0.0
Name: 0, Length: 1313, dtype: float64

With TF-IDF, we have our vectors stored within our four DataFrames, tv_train_df, tv_train_df_no_sw, tv_test_df, and tv_test_df_no_sw.

Another approach to vectorizing outside of counting words and frequencies is the Word2Vec approach, a deep learning neural network developed by Google. For Word2Vec, we'll use gensim, using the documentation as a guide (https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html).

In [38]:
# We'll follow the example for training my own model using my own dataset
import gensim.models

sentences_train = [comment.split() for comment in X_train]
word2vec_model = gensim.models.Word2Vec(sentences=sentences_train, vector_size=300)

In [39]:
sentences_train_no_sw = [comment.split() for comment in X_train_no_sw]
word2vec_model_no_sw = gensim.models.Word2Vec(sentences=sentences_train_no_sw, vector_size=300)

The documentation for gensim provided a really nice example on visualizing embedded words by reducing the dimensionality of the words to 2 dimensions using tSNE.

In [40]:
# Visualizing word embeddings via plotly and matplotlib and the TSNE algorithm
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling

def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels

x_vals, y_vals, labels = reduce_dimensions(word2vec_model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')

def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

print('This is a plot for word2vec_model')
plot_function(x_vals, y_vals, labels)

This is a plot for word2vec_model


In [41]:
x_vals_no_sw, y_vals_no_sw, labels_no_sw = reduce_dimensions(word2vec_model_no_sw)
print('This is a plot for word2vec_model_no_sw')
plot_function(x_vals_no_sw, y_vals_no_sw, labels_no_sw)

This is a plot for word2vec_model_no_sw


In [43]:
print(word2vec_model)

Word2Vec(vocab=2168, vector_size=300, alpha=0.025)


In [44]:
print(word2vec_model_no_sw)

Word2Vec(vocab=2093, vector_size=300, alpha=0.025)


With this, we have three different vectorizations of the mini_5k set. CountVectorization and TF-IDF Vectorization using sklearn and Word2Vec using gensim.