# Preprocessing Data

To preprocess the dataset for our movie dialogue pairs, we could run the Count Vectorizer and TF-IDF from sklearn.

In [1]:
# Importing dependencies for parsing, splitting and vectorizing the data
import ast

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Location of the processed dialogue-pair CSV, given relative to the working directory
file = "processed_movie_dialogue_pairs.csv"

In [3]:
# Load the CSV into a DataFrame and display it (pandas truncates the rendering of large frames)
df = pd.read_csv(file)
df

Unnamed: 0.1,Unnamed: 0,Comment,Reply,Expanded Comment,Expanded Reply,Comment Length,Reply Length,Tokenized Comment,Tokenized Reply,Tokenized Comment_no_sw,...,Sentiment Comment,Sentiment Reply,Sentiment Comment_no_sw,Sentiment Reply_no_sw,Sentiment Comment Compound Score,Sentiment Reply Compound Score,Sentiment Comment_no_sw Compound Score,Sentiment Reply_no_sw Compound Score,Sentiment Score Overall,Sentiment Score_no_sw Overall
0,0,Can we make this quick? Roxanne Korrine and An...,"Well, I thought we'd start with pronunciation,...",can we make this quick roxanne korrine and and...,well i thought we would start with pronunciati...,21,14,"['can', 'we', 'make', 'this', 'quick', 'roxann...","['well', 'i', 'thought', 'we', 'would', 'start...","['make', 'quick', 'roxanne', 'korrine', 'andre...",...,"{'neg': 0.163, 'neu': 0.837, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...","{'neg': 0.29, 'neu': 0.71, 'pos': 0.0, 'compou...","{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound...",-0.6240,0.4588,-0.6240,0.4588,-0.1652,-0.1652
1,1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part ...,well i thought we would start with pronunciati...,not the hacking and gagging and spitting part ...,14,9,"['well', 'i', 'thought', 'we', 'would', 'start...","['not', 'the', 'hacking', 'and', 'gagging', 'a...","['well', 'thought', 'would', 'start', 'pronunc...",...,"{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'comp...","{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...","{'neg': 0.0, 'neu': 0.5, 'pos': 0.5, 'compound...","{'neg': 0.312, 'neu': 0.39, 'pos': 0.299, 'com...",0.4588,0.3182,0.4588,-0.0258,0.7770,0.4330
2,2,Not the hacking and gagging and spitting part ...,Okay then how 'bout we try out some French cui...,not the hacking and gagging and spitting part ...,okay then how about we try out some french cui...,9,12,"['not', 'the', 'hacking', 'and', 'gagging', 'a...","['okay', 'then', 'how', 'about', 'we', 'try', ...","['hacking', 'gagging', 'spitting', 'part', 'pl...",...,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...","{'neg': 0.0, 'neu': 0.853, 'pos': 0.147, 'comp...","{'neg': 0.312, 'neu': 0.39, 'pos': 0.299, 'com...","{'neg': 0.0, 'neu': 0.725, 'pos': 0.275, 'comp...",0.3182,0.2263,-0.0258,0.2263,0.5445,0.2005
3,3,You're asking me out That's so cute What's you...,Forget it,you are asking me out that is so cute what is ...,forget it,14,2,"['you', 'are', 'asking', 'me', 'out', 'that', ...","['forget', 'it']","['asking', 'cute', 'name']",...,"{'neg': 0.0, 'neu': 0.771, 'pos': 0.229, 'comp...","{'neg': 0.655, 'neu': 0.345, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound...","{'neg': 1.0, 'neu': 0.0, 'pos': 0.0, 'compound...",0.5949,-0.2263,0.4588,-0.2263,0.3686,0.2325
4,4,"No, no, it's my fault - we didn't have a prope...",Cameron,no no it is my fault we did not have a proper ...,cameron,16,1,"['no', 'no', 'it', 'is', 'my', 'fault', 'we', ...",['cameron'],"['fault', 'proper', 'introduction']",...,"{'neg': 0.441, 'neu': 0.559, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.574, 'neu': 0.426, 'pos': 0.0, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",-0.7269,0.0000,-0.4019,0.0000,-0.7269,-0.4019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191923,214681,"Your orders, Mr Vereker?",I'm to take the Sikali with the main column to...,your orders mr vereker,i am to take the sikali with the main column t...,4,13,"['your', 'orders', 'mr', 'vereker']","['i', 'am', 'to', 'take', 'the', 'sikali', 'wi...","['orders', 'mr', 'vereker']",...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
191924,214682,I'm to take the Sikali with the main column to...,Lord Chelmsford seems to want me to stay back ...,i am to take the sikali with the main column t...,lord chelmsford seems to want me to stay back ...,13,12,"['i', 'am', 'to', 'take', 'the', 'sikali', 'wi...","['lord', 'chelmsford', 'seems', 'to', 'want', ...","['take', 'sikali', 'main', 'column', 'river']",...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.894, 'pos': 0.106, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...",0.0000,0.0772,0.0000,0.0772,0.0772,0.0772
191925,214683,Lord Chelmsford seems to want me to stay back ...,I think Chelmsford wants a good man on the bor...,lord chelmsford seems to want me to stay back ...,i think chelmsford wants a good man on the bor...,12,23,"['lord', 'chelmsford', 'seems', 'to', 'want', ...","['i', 'think', 'chelmsford', 'wants', 'a', 'go...","['lord', 'chelmsford', 'seems', 'want', 'stay'...",...,"{'neg': 0.0, 'neu': 0.894, 'pos': 0.106, 'comp...","{'neg': 0.247, 'neu': 0.588, 'pos': 0.165, 'co...","{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'comp...","{'neg': 0.323, 'neu': 0.462, 'pos': 0.215, 'co...",0.0772,-0.4767,0.0772,-0.4767,-0.3995,-0.3995
191926,214684,"Well I assure you, Sir, I have no desire to cr...","And I assure you, you do not In fact I'd be ob...",well i assure you sir i have no desire to crea...,and i assure you you do not in fact i would be...,13,22,"['well', 'i', 'assure', 'you', 'sir', 'i', 'ha...","['and', 'i', 'assure', 'you', 'you', 'do', 'no...","['well', 'assure', 'sir', 'desire', 'create', ...",...,"{'neg': 0.243, 'neu': 0.265, 'pos': 0.492, 'co...","{'neg': 0.0, 'neu': 0.732, 'pos': 0.268, 'comp...","{'neg': 0.175, 'neu': 0.146, 'pos': 0.679, 'co...","{'neg': 0.0, 'neu': 0.476, 'pos': 0.524, 'comp...",0.5719,0.7650,0.7096,0.7650,1.3369,1.4746


Looking at our DataFrame, aside from needing to drop the extra 'Unnamed: 0' column, the tokenized and lemmatized columns were read back from the CSV as strings (for example "['forget', 'it']") rather than as lists, so we have to parse them back into ordinary lists of strings.

In [4]:
# Drop the leftover 'Unnamed: 0' column. errors='ignore' keeps this cell safe to
# re-run: once the column is gone, a plain drop would raise a KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# List the remaining column names to confirm that 'Unnamed: 0' is gone
df.columns

Index(['Comment', 'Reply', 'Expanded Comment', 'Expanded Reply',
       'Comment Length', 'Reply Length', 'Tokenized Comment',
       'Tokenized Reply', 'Tokenized Comment_no_sw', 'Tokenized Reply_no_sw',
       'Lemmatized Comment', 'Lemmatized Reply', 'Lemmatized Comment_no_sw',
       'Lemmatized Reply_no_sw', 'Lem Comment Length', 'Lem Reply Length',
       'Lem Comment_no_sw Length', 'Lem Reply_no_sw Length',
       'Sentiment Comment', 'Sentiment Reply', 'Sentiment Comment_no_sw',
       'Sentiment Reply_no_sw', 'Sentiment Comment Compound Score',
       'Sentiment Reply Compound Score',
       'Sentiment Comment_no_sw Compound Score',
       'Sentiment Reply_no_sw Compound Score', 'Sentiment Score Overall',
       'Sentiment Score_no_sw Overall'],
      dtype='object')

In [6]:
# Columns whose token lists come back from the CSV as strings and must be parsed into real lists
columns_to_fix = ['Tokenized Comment','Tokenized Reply',
                  'Tokenized Comment_no_sw','Tokenized Reply_no_sw',
                  'Lemmatized Comment','Lemmatized Reply',
                  'Lemmatized Comment_no_sw','Lemmatized Reply_no_sw']


def _parse_token_list(value):
    """Turn the string form of a token list back into a list.

    Values that are already lists are returned unchanged, so re-running this
    cell does not fail. Strings have their whitespace collapsed to single
    spaces and are then parsed with ast.literal_eval, which only accepts Python
    literals and therefore cannot execute code embedded in the CSV the way
    eval() could.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(' '.join(value.split()))


for column in columns_to_fix:
    df[column] = df[column].apply(_parse_token_list)

In [7]:
# Summarize the column dtypes and non-null counts of the converted DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191928 entries, 0 to 191927
Data columns (total 28 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   Comment                                 191928 non-null  object 
 1   Reply                                   191928 non-null  object 
 2   Expanded Comment                        191928 non-null  object 
 3   Expanded Reply                          191928 non-null  object 
 4   Comment Length                          191928 non-null  int64  
 5   Reply Length                            191928 non-null  int64  
 6   Tokenized Comment                       191928 non-null  object 
 7   Tokenized Reply                         191928 non-null  object 
 8   Tokenized Comment_no_sw                 191928 non-null  object 
 9   Tokenized Reply_no_sw                   191928 non-null  object 
 10  Lemmatized Comment                      1919

While we do have a decently sized dataset (~200k), we'll limit our dataset down to 20k. If it works, we can always scale it back up. It doesn't have to be perfect right now, we just want to see if everything is working properly. So we could define this new subset as mini_20k.

In [8]:
# Keep only the first 20,000 rows while the pipeline is being validated
mini_20k = df.head(20000)

Here, we'll build training and testing sets for lemmatized comments and replies, with and without stop-words, from mini_20k. Then we'll start with the CountVectorizer from sklearn.

In [9]:
# Generating training and testing data with sklearn's train_test_split() function.
# sklearn's text vectorizers expect one string per document rather than a list of
# tokens, so every token list is joined back into a space-separated string first.
# Both splits share the same test size and random seed.
split_options = {'test_size': 0.2, 'random_state': 7}

X_train, X_test, y_train, y_test = train_test_split(
    mini_20k['Lemmatized Comment'].str.join(' '),
    mini_20k['Lemmatized Reply'].str.join(' '),
    **split_options,
)
X_train_no_sw, X_test_no_sw, y_train_no_sw, y_test_no_sw = train_test_split(
    mini_20k['Lemmatized Comment_no_sw'].str.join(' '),
    mini_20k['Lemmatized Reply_no_sw'].str.join(' '),
    **split_options,
)

In [10]:
# Report the shape of every split: one line for the sets with stop-words, one for those without
for split in ((X_train, X_test, y_train, y_test),
              (X_train_no_sw, X_test_no_sw, y_train_no_sw, y_test_no_sw)):
    print(*(part.shape for part in split))

(16000,) (4000,) (16000,) (4000,)
(16000,) (4000,) (16000,) (4000,)


In [11]:
# Checking one of the training sets; after the split it still carries the
# shuffled row labels it had in mini_20k
X_train

8118                                                 hello
10171                         do not be fool by appearance
9399                                    oh give me a start
12669    this be where we be find most of the hemorrhag...
18809                                                 open
                               ...                        
13927                            can not a boy be a dorrit
919                                                well he
5699     it be early in the game yet mr rain jesse jame...
10742    we look all over for you what be you do back here
16921        i will go on the dole like my daddy before me
Name: Lemmatized Comment, Length: 16000, dtype: object

In [12]:
# Resetting the index of every training and testing Series so each one is labelled
# 0..n-1 again. drop=True discards the old shuffled labels directly instead of first
# turning them into an 'index' column that then has to be removed.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
X_train_no_sw = X_train_no_sw.reset_index(drop=True)
X_test_no_sw = X_test_no_sw.reset_index(drop=True)
y_train_no_sw = y_train_no_sw.reset_index(drop=True)
y_test_no_sw = y_test_no_sw.reset_index(drop=True)

In [13]:
# Checking the training set again after the index reset: same values, labels now start at 0
X_train

0                                                    hello
1                             do not be fool by appearance
2                                       oh give me a start
3        this be where we be find most of the hemorrhag...
4                                                     open
                               ...                        
15995                            can not a boy be a dorrit
15996                                              well he
15997    it be early in the game yet mr rain jesse jame...
15998    we look all over for you what be you do back here
15999        i will go on the dole like my daddy before me
Name: Lemmatized Comment, Length: 16000, dtype: object

In [14]:
# Checking the element at position 1 (the second entry) of the training set X_train
X_train.iloc[1]

'do not be fool by appearance'

Having split up our data into training and test sets, we'll start fitting our vectorizers.

In [15]:
# Initializing one CountVectorizer per dataset: cv for the comments that keep their
# stop-words and cv_no_sw for those without. Both use sklearn's defaults; the default
# min_df of 1 keeps every term that appears in at least one document, so no
# infrequent words are filtered out here.
cv, cv_no_sw = CountVectorizer(), CountVectorizer()

In [16]:
# Fitting each vectorizer on the training comments and transforming them, then only
# transforming the testing comments so they are encoded with the training vocabulary
cv_transformed_train = cv.fit_transform(X_train)
cv_transformed_test = cv.transform(X_test)
cv_transformed_train_no_sw = cv_no_sw.fit_transform(X_train_no_sw)
cv_transformed_test_no_sw = cv_no_sw.transform(X_test_no_sw)

In [17]:
# Checking the first row of the transformed training matrix in cv_transformed_train.
# Only that row is converted to a dense array; calling toarray() on the whole sparse
# matrix would allocate every row just to look at one.
cv_transformed_train[0].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
# Checking which entries of row 1 (the second training comment) are non-zero and
# which vocabulary terms they correspond to. The row is densified once and the
# feature-name list is built once, rather than per matching entry.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2; on newer
# versions use get_feature_names_out() instead.
row_counts = cv_transformed_train[1].toarray()[0]
nonzero_positions = [i for i, n in enumerate(row_counts) if n > 0]
print(nonzero_positions)

feature_names = cv.get_feature_names()
[feature_names[i] for i in nonzero_positions]

[463, 764, 1184, 2397, 3119, 5542]


['appearance', 'be', 'by', 'do', 'fool', 'not']

In [19]:
# Placing the training count vectors into DataFrames, one column per vocabulary
# term, with every column name prefixed by 'CV_'
cv_feature_columns = ['CV_' + term for term in cv.get_feature_names()]
cv_feature_columns_no_sw = ['CV_' + term for term in cv_no_sw.get_feature_names()]

cv_transformed_train_df = pd.DataFrame(cv_transformed_train.toarray(),
                                       columns=cv_feature_columns)
cv_transformed_train_df_no_sw = pd.DataFrame(cv_transformed_train_no_sw.toarray(),
                                             columns=cv_feature_columns_no_sw)

In [20]:
# Preview the first rows of the count-vector feature frame for the training comments
cv_transformed_train_df.head()

Unnamed: 0,CV_00,CV_000,CV_0100,CV_0300,CV_0630,CV_09,CV_0h,CV_10,CV_100,CV_1002,...,CV_zoloft,CV_zombie,CV_zone,CV_zoo,CV_zooie,CV_zoological,CV_zorg,CV_zorn,CV_zulu,CV_zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Attach the count features to the comments they were computed from. The pieces are
# lined up by index label, which works because the comment Series were re-indexed
# from 0 and the feature frames were built with the default 0..n-1 index.
cv_train_df = pd.concat(objs=[X_train, cv_transformed_train_df],
                        axis='columns', sort=False)
cv_train_df_no_sw = pd.concat(objs=[X_train_no_sw, cv_transformed_train_df_no_sw],
                              axis='columns', sort=False)

In [22]:
# Preview the combined frame: the comment text followed by its count features
cv_train_df.head()

Unnamed: 0,Lemmatized Comment,CV_00,CV_000,CV_0100,CV_0300,CV_0630,CV_09,CV_0h,CV_10,CV_100,...,CV_zoloft,CV_zombie,CV_zone,CV_zoo,CV_zooie,CV_zoological,CV_zorg,CV_zorn,CV_zulu,CV_zuma
0,hello,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,do not be fool by appearance,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,oh give me a start,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,this be where we be find most of the hemorrhag...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,open,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Checking that the comment stored at position 1 of the merged frame is the expected one
cv_train_df['Lemmatized Comment'].iloc[1]

'do not be fool by appearance'

In [24]:
# Recover the vocabulary terms with a positive count in row 1 of the merged frame.
# Column 0 holds the comment text, so the counts start at column 1. The feature-name
# list is built once instead of once per matching entry.
feature_names = cv.get_feature_names()
row_counts = cv_train_df.iloc[1, 1:]
[feature_names[i] for i, n in enumerate(row_counts) if n > 0]

['appearance', 'be', 'by', 'do', 'fool', 'not']

In [25]:
# Rank the count features of training row 1 from largest to smallest so the terms
# present in that comment appear first
cv_transformed_train_df.iloc[1].sort_values(ascending=False)

CV_fool         1
CV_be           1
CV_by           1
CV_not          1
CV_do           1
               ..
CV_pink         0
CV_pinkerton    0
CV_pinpoint     0
CV_pinzon       0
CV_00           0
Name: 1, Length: 9071, dtype: int64

In [26]:
# Preview the combined training frame for the comments without stop-words
cv_train_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,CV_00,CV_000,CV_0100,CV_0300,CV_0630,CV_09,CV_0h,CV_10,CV_100,...,CV_zoloft,CV_zombie,CV_zone,CV_zoo,CV_zooie,CV_zoological,CV_zorg,CV_zorn,CV_zulu,CV_zuma
0,hello,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,fool appearance,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,oh give start,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,find hemorrhage outer line cerebral cortex,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,open,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Rank the count features of training row 1 (stop-words removed) from largest to smallest
cv_transformed_train_df_no_sw.iloc[1].sort_values(ascending=False)

CV_appearance    1
CV_fool          1
CV_zuma          0
CV_filth         0
CV_financial     0
                ..
CV_pitiful       0
CV_pitt          0
CV_pitts         0
CV_pittsburgh    0
CV_00            0
Name: 1, Length: 9063, dtype: int64

In [28]:
# Placing the testing count vectors into DataFrames, using the vocabulary of the
# vectorizers fitted on the training data and prefixing every column with 'CV_'
cv_test_columns = ['CV_' + term for term in cv.get_feature_names()]
cv_test_columns_no_sw = ['CV_' + term for term in cv_no_sw.get_feature_names()]

cv_transformed_test_df = pd.DataFrame(cv_transformed_test.toarray(),
                                      columns=cv_test_columns)
cv_transformed_test_df_no_sw = pd.DataFrame(cv_transformed_test_no_sw.toarray(),
                                            columns=cv_test_columns_no_sw)

In [29]:
# Attach the count features to the testing comments; the pieces are lined up by
# index label, and both sides are labelled 0..n-1
cv_test_df = pd.concat(objs=[X_test, cv_transformed_test_df],
                       axis='columns', sort=False)
cv_test_df_no_sw = pd.concat(objs=[X_test_no_sw, cv_transformed_test_df_no_sw],
                             axis='columns', sort=False)

In [30]:
# Preview the combined testing frame: the comment text followed by its count features
cv_test_df.head()

Unnamed: 0,Lemmatized Comment,CV_00,CV_000,CV_0100,CV_0300,CV_0630,CV_09,CV_0h,CV_10,CV_100,...,CV_zoloft,CV_zombie,CV_zone,CV_zoo,CV_zooie,CV_zoological,CV_zorg,CV_zorn,CV_zulu,CV_zuma
0,be it that bad,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,i love you elaine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,he leave his sweater,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,l triptophanea neurotransmitter sometimes use ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,shoot it,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Rank the count features of testing row 1 from largest to smallest.
# NOTE(review): one-letter tokens such as 'i' do not show up, presumably because
# CountVectorizer's default token_pattern only keeps tokens of two or more characters.
cv_transformed_test_df.iloc[1].sort_values(ascending=False)

CV_elaine       1
CV_love         1
CV_you          1
CV_zuma         0
CV_financing    0
               ..
CV_pineal       0
CV_pink         0
CV_pinkerton    0
CV_pinpoint     0
CV_00           0
Name: 1, Length: 9071, dtype: int64

In [32]:
# Preview the combined testing frame for the comments without stop-words
cv_test_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,CV_00,CV_000,CV_0100,CV_0300,CV_0630,CV_09,CV_0h,CV_10,CV_100,...,CV_zoloft,CV_zombie,CV_zone,CV_zoo,CV_zooie,CV_zoological,CV_zorg,CV_zorn,CV_zulu,CV_zuma
0,bad,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,love elaine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,leave sweater,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,l triptophanea neurotransmitter sometimes use ...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,shoot,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Rank the count features of testing row 1 (stop-words removed) from largest to smallest
cv_transformed_test_df_no_sw.iloc[1].sort_values(ascending=False)

CV_elaine        1
CV_love          1
CV_zuma          0
CV_final         0
CV_financing     0
                ..
CV_pitt          0
CV_pitts         0
CV_pittsburgh    0
CV_pituitary     0
CV_00            0
Name: 1, Length: 9063, dtype: int64

Here, we have our word vectors stored in four DataFrames: cv_train_df, cv_train_df_no_sw, cv_test_df, and cv_test_df_no_sw. Now that we have had a taste of word vectorization with the Count Vectorizer, we'll try TF-IDF next.

In [34]:
# Initializing one TF-IDF vectorizer per dataset: tv for the comments that keep
# their stop-words and tv_no_sw for those without stop-words
tv, tv_no_sw = TfidfVectorizer(), TfidfVectorizer()

In [35]:
# Fitting each vectorizer on the training comments and transforming them, then only
# transforming the testing comments so they are encoded with the training vocabulary
tv_transformed_train = tv.fit_transform(X_train)
tv_transformed_test = tv.transform(X_test)
tv_transformed_train_no_sw = tv_no_sw.fit_transform(X_train_no_sw)
tv_transformed_test_no_sw = tv_no_sw.transform(X_test_no_sw)

In [36]:
# Placing the training TF-IDF vectors into DataFrames, one column per vocabulary
# term, with every column name prefixed by 'TFIDF_'
tv_feature_columns = ['TFIDF_' + term for term in tv.get_feature_names()]
tv_feature_columns_no_sw = ['TFIDF_' + term for term in tv_no_sw.get_feature_names()]

tv_transformed_train_df = pd.DataFrame(tv_transformed_train.toarray(),
                                       columns=tv_feature_columns)
tv_transformed_train_df_no_sw = pd.DataFrame(tv_transformed_train_no_sw.toarray(),
                                             columns=tv_feature_columns_no_sw)

In [37]:
# Attach the TF-IDF features to the training comments; the pieces are lined up by
# index label, and both sides are labelled 0..n-1
tv_train_df = pd.concat(objs=[X_train, tv_transformed_train_df],
                        axis='columns', sort=False)
tv_train_df_no_sw = pd.concat(objs=[X_train_no_sw, tv_transformed_train_df_no_sw],
                              axis='columns', sort=False)

In [38]:
# Preview the combined training frame: the comment text followed by its TF-IDF features
tv_train_df.head()

Unnamed: 0,Lemmatized Comment,TFIDF_00,TFIDF_000,TFIDF_0100,TFIDF_0300,TFIDF_0630,TFIDF_09,TFIDF_0h,TFIDF_10,TFIDF_100,...,TFIDF_zoloft,TFIDF_zombie,TFIDF_zone,TFIDF_zoo,TFIDF_zooie,TFIDF_zoological,TFIDF_zorg,TFIDF_zorn,TFIDF_zulu,TFIDF_zuma
0,hello,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,do not be fool by appearance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,oh give me a start,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,this be where we be find most of the hemorrhag...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,open,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Rank the TF-IDF weights of training row 1 from largest to smallest
tv_transformed_train_df.iloc[1].sort_values(ascending=False)

TFIDF_appearance    0.691769
TFIDF_fool          0.525531
TFIDF_by            0.394979
TFIDF_do            0.191062
TFIDF_not           0.189831
                      ...   
TFIDF_pink          0.000000
TFIDF_pinkerton     0.000000
TFIDF_pinpoint      0.000000
TFIDF_pinzon        0.000000
TFIDF_00            0.000000
Name: 1, Length: 9071, dtype: float64

In [40]:
# Preview the combined TF-IDF training frame for the comments without stop-words
tv_train_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,TFIDF_00,TFIDF_000,TFIDF_0100,TFIDF_0300,TFIDF_0630,TFIDF_09,TFIDF_0h,TFIDF_10,TFIDF_100,...,TFIDF_zoloft,TFIDF_zombie,TFIDF_zone,TFIDF_zoo,TFIDF_zooie,TFIDF_zoological,TFIDF_zorg,TFIDF_zorn,TFIDF_zulu,TFIDF_zuma
0,hello,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,fool appearance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,oh give start,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,find hemorrhage outer line cerebral cortex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,open,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Rank the TF-IDF weights of training row 1 (stop-words removed) from largest to smallest
tv_transformed_train_df_no_sw.iloc[1].sort_values(ascending=False)

TFIDF_appearance    0.796280
TFIDF_fool          0.604928
TFIDF_zuma          0.000000
TFIDF_filth         0.000000
TFIDF_financial     0.000000
                      ...   
TFIDF_pitiful       0.000000
TFIDF_pitt          0.000000
TFIDF_pitts         0.000000
TFIDF_pittsburgh    0.000000
TFIDF_00            0.000000
Name: 1, Length: 9063, dtype: float64

In [42]:
# Placing the testing TF-IDF vectors into DataFrames, using the vocabulary of the
# vectorizers fitted on the training data and prefixing every column with 'TFIDF_'
tv_test_columns = ['TFIDF_' + term for term in tv.get_feature_names()]
tv_test_columns_no_sw = ['TFIDF_' + term for term in tv_no_sw.get_feature_names()]

tv_transformed_test_df = pd.DataFrame(tv_transformed_test.toarray(),
                                      columns=tv_test_columns)
tv_transformed_test_df_no_sw = pd.DataFrame(tv_transformed_test_no_sw.toarray(),
                                            columns=tv_test_columns_no_sw)

In [43]:
# Attach the TF-IDF features to the testing comments; the pieces are lined up by
# index label, and both sides are labelled 0..n-1
tv_test_df = pd.concat(objs=[X_test, tv_transformed_test_df],
                       axis='columns', sort=False)
tv_test_df_no_sw = pd.concat(objs=[X_test_no_sw, tv_transformed_test_df_no_sw],
                             axis='columns', sort=False)

In [44]:
# Preview the combined testing frame: the comment text followed by its TF-IDF features
tv_test_df.head()

Unnamed: 0,Lemmatized Comment,TFIDF_00,TFIDF_000,TFIDF_0100,TFIDF_0300,TFIDF_0630,TFIDF_09,TFIDF_0h,TFIDF_10,TFIDF_100,...,TFIDF_zoloft,TFIDF_zombie,TFIDF_zone,TFIDF_zoo,TFIDF_zooie,TFIDF_zoological,TFIDF_zorg,TFIDF_zorn,TFIDF_zulu,TFIDF_zuma
0,be it that bad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,i love you elaine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,he leave his sweater,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,l triptophanea neurotransmitter sometimes use ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,shoot it,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Rank the TF-IDF weights of testing row 1 from largest to smallest
tv_transformed_test_df.iloc[1].sort_values(ascending=False)

TFIDF_elaine       0.776058
TFIDF_love         0.592684
TFIDF_you          0.215544
TFIDF_zuma         0.000000
TFIDF_financing    0.000000
                     ...   
TFIDF_pineal       0.000000
TFIDF_pink         0.000000
TFIDF_pinkerton    0.000000
TFIDF_pinpoint     0.000000
TFIDF_00           0.000000
Name: 1, Length: 9071, dtype: float64

In [46]:
# Preview the combined TF-IDF testing frame for the comments without stop-words
tv_test_df_no_sw.head()

Unnamed: 0,Lemmatized Comment_no_sw,TFIDF_00,TFIDF_000,TFIDF_0100,TFIDF_0300,TFIDF_0630,TFIDF_09,TFIDF_0h,TFIDF_10,TFIDF_100,...,TFIDF_zoloft,TFIDF_zombie,TFIDF_zone,TFIDF_zoo,TFIDF_zooie,TFIDF_zoological,TFIDF_zorg,TFIDF_zorn,TFIDF_zulu,TFIDF_zuma
0,bad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,love elaine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,leave sweater,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,l triptophanea neurotransmitter sometimes use ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,shoot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Rank the TF-IDF weights of testing row 1 (stop-words removed) from largest to smallest
tv_transformed_test_df_no_sw.iloc[1].sort_values(ascending=False)

TFIDF_elaine        0.794739
TFIDF_love          0.606951
TFIDF_zuma          0.000000
TFIDF_final         0.000000
TFIDF_financing     0.000000
                      ...   
TFIDF_pitt          0.000000
TFIDF_pitts         0.000000
TFIDF_pittsburgh    0.000000
TFIDF_pituitary     0.000000
TFIDF_00            0.000000
Name: 1, Length: 9063, dtype: float64

With TF-IDF, we have our vectors stored within our four DataFrames, tv_train_df, tv_train_df_no_sw, tv_test_df, and tv_test_df_no_sw.

Another approach to vectorizing, beyond counting words and their frequencies, is Word2Vec: a shallow neural-network model developed at Google that learns a dense embedding for each word. For Word2Vec, we'll use gensim, using the documentation as a guide (https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html).

In [48]:
# Following the gensim example for training our own model on our own dataset
import gensim.models

# Word2Vec takes tokenized sentences, so each training comment string is split
# back into its whitespace-separated tokens before training 300-dimensional vectors
sentences_train = list(map(str.split, X_train))
word2vec_model = gensim.models.Word2Vec(sentences=sentences_train, vector_size=300)

In [49]:
# Same training procedure for the comments without stop-words: split each string
# into tokens, then train 300-dimensional vectors
sentences_train_no_sw = list(map(str.split, X_train_no_sw))
word2vec_model_no_sw = gensim.models.Word2Vec(sentences=sentences_train_no_sw, vector_size=300)

The documentation for gensim provided a really nice example on visualizing embedded words by reducing the dimensionality of the words to 2 dimensions using tSNE.

In [50]:
# Visualizing word embeddings via plotly and matplotlib and the TSNE algorithm
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling

def reduce_dimensions(model):
    """Project a trained model's word vectors down to two dimensions with t-SNE.

    Parameters
    ----------
    model
        Object exposing ``wv.vectors`` and ``wv.index_to_key``, such as the
        gensim Word2Vec models trained in this notebook.

    Returns
    -------
    tuple
        ``(x_vals, y_vals, labels)``: two lists holding the first and second
        t-SNE coordinate of every vector, and an array of the vocabulary keys
        in the same order.
    """
    target_dims = 2  # number of dimensions to embed into

    # pull the vectors and their words out of the model as numpy arrays
    word_vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # random_state is pinned so every call seeds t-SNE the same way
    embedded = TSNE(n_components=target_dims, random_state=0).fit_transform(word_vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels

# 2-D t-SNE coordinates and word labels for the model trained on comments with stop-words
x_vals, y_vals, labels = reduce_dimensions(word2vec_model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the words as a plotly text scatter at their 2-D coordinates.

    Parameters
    ----------
    x_vals, y_vals
        Coordinates of each word.
    labels
        Text drawn at each coordinate, in the same order as the coordinates.
    plot_in_notebook : bool, default True
        When true, the figure is rendered inline with ``iplot`` after notebook
        mode is initialised; otherwise ``plot`` is called with the filename
        'word-embedding-plot.html'.
    """
    import plotly.graph_objs as go
    from plotly.offline import init_notebook_mode, iplot, plot

    # mode='text' shows each label itself rather than a marker
    data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(data, filename='word-embedding-plot')

def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the 2-D word coordinates with matplotlib and label a random subset.

    Parameters
    ----------
    x_vals, y_vals
        Coordinates of each word.
    labels
        Word for each coordinate, in the same order as the coordinates.

    Notes
    -----
    Up to 25 points are annotated. The global ``random`` generator is seeded
    with 0 first, so the same points are picked on every call.
    """
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Label a random subsample of at most 25 data points. random.sample raises
    # ValueError when asked for more items than exist, so the request is capped
    # at the number of labels to support small vocabularies.
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, min(25, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Choose the plotting backend: plotly when get_ipython() can be called (i.e. the
# code runs under IPython/Jupyter), matplotlib when that call fails.
plot_function = plot_with_plotly
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib

print('This is a plot for word2vec_model')
plot_function(x_vals, y_vals, labels)

This is a plot for word2vec_model


In [51]:
# Repeat the reduction and plot for the model trained on comments without stop-words,
# reusing the plotting backend selected earlier
x_vals_no_sw, y_vals_no_sw, labels_no_sw = reduce_dimensions(word2vec_model_no_sw)
print('This is a plot for word2vec_model_no_sw')
plot_function(x_vals_no_sw, y_vals_no_sw, labels_no_sw)

This is a plot for word2vec_model_no_sw


In [52]:
# Print the model summary (vocabulary size, vector size and learning rate)
print(word2vec_model)

Word2Vec(vocab=2186, vector_size=300, alpha=0.025)


In [53]:
# Print the summary of the model trained without stop-words for comparison
print(word2vec_model_no_sw)

Word2Vec(vocab=2088, vector_size=300, alpha=0.025)


With this, we have three different vectorizations of the mini_20k set: Count Vectorization and TF-IDF Vectorization using sklearn, and Word2Vec using gensim.