<h2> 3.6 Featurizing text data with tfidf weighted word-vectors </h2>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# exctract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [2]:
# Load the raw question-pair data.
df = pd.read_csv("train.csv")

# Force both question columns to Python 3 `str`, so every row is text
# (a non-string cell such as a missing NaN value becomes its string form).
# Background on the Python 2 -> 3 unicode handling: https://stackoverflow.com/a/6812069
df['question1'] = df['question1'].map(str)
df['question2'] = df['question2'].map(str)

In [3]:
# Preview the first rows of the loaded question pairs.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Take a random subsample of 100k rows; random_state=1 fixes the draw so it is reproducible.
# NOTE(review): this rebinds `df`, so the full frame is no longer reachable after this cell.
df = df.sample(n=100000, random_state=1)
# Display the (rows, columns) of the subsample.
df.shape

(100000, 6)

In [5]:
# 70/30 train/test split, stratified on the label so both parts keep the
# same duplicate / non-duplicate ratio; the seed makes the split repeatable.
from sklearn.model_selection import train_test_split
tr, ts = train_test_split(
    df,
    test_size=0.3,
    random_state=1,
    stratify=df['is_duplicate'].values,
)

In [6]:
# Shapes of the train and test splits.
tr.shape, ts.shape

((70000, 6), (30000, 6))

In [7]:
# Preview the train split (the index keeps the original row labels).
tr.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
149483,149483,235456,235457,How do I translate in Android?,"How would you translate ""螳螂捕蝉, 黄雀在后""?",0
146085,146085,179014,230839,Can a pet bird be trained to live without a ca...,How do airports keep birds away?,0
337094,337094,44878,204218,What is the best way to teach a child how to s...,How do you teach your kid to swim?,1
115033,115033,187657,187658,How do I add a location to my business page on...,"Can a Facebook Page check-in to a Place? Or, w...",0
190104,190104,289081,289082,What purpose did the Roman Colosseum have?,What purpose does the Colosseum serve?,1


In [8]:
# Fit TF-IDF on the *training* questions only (question1 and question2 pooled),
# so no text from the test split influences the learned IDF weights.
# TfidfVectorizer is already imported in the first cell of the notebook.
questions = list(tr['question1']) + list(tr['question2'])

# lowercase=False keeps the original casing of every word in the vocabulary.
tfidf = TfidfVectorizer(lowercase=False)
# Only the fitted vocabulary and idf_ are used below; the transformed matrix is not needed.
tfidf.fit(questions)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# exists from 1.0 onwards. Support both so the cell runs on old and new installs.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# dict key: word, value: IDF weight of that word (idf_ holds IDF only, not tf * idf)
word2tfidf = dict(zip(feature_names, tfidf.idf_))

- After computing the TF-IDF scores, we convert each question into a weighted average of its word2vec vectors, using these scores as the weights.
- Here we use a pre-trained GloVe model, which comes free with spaCy: https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [9]:
# Load the spaCy English pipeline from a local model directory.
# NOTE(review): the path points at the small `en_core_web_sm` 2.2.0 package,
# not `en_vectors_web_lg`; the vectors it yields here are 96-dimensional
# (see the feature-count printout further down), not 384.
nlp = spacy.load('en_core_web_sm-2.2.0/en_core_web_sm/en_core_web_sm-2.2.0')

# Vector width, probed once on a one-word text so that an empty document can
# still receive a zero vector (indexing doc[0] on an empty doc raises IndexError).
vec_dim = len(nlp('question')[0].vector)

# IDF-weighted vectors for the 'question1' column of the train split.
# tqdm prints the progress bar: https://github.com/noamraph/tqdm
vecs1 = []
for qu1 in tqdm(list(tr['question1'])):
    doc1 = nlp(qu1)
    # The feature is the SUM over tokens of (token vector * idf weight).
    # The earlier version added each weighted vector to every row of a
    # (n_tokens, dim) matrix and then took mean(axis=0), which reduces to this
    # same sum (up to float rounding). It is NOT divided by the total idf
    # weight, i.e. it is a weighted sum rather than a normalised average.
    mean_vec1 = np.zeros(vec_dim)
    for word1 in doc1:
        # Tokens missing from the TF-IDF vocabulary contribute with weight 0.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1 += word1.vector * idf
    vecs1.append(mean_vec1)

# One vector (numpy array) per row, stored as an object column.
tr['q1_feats_m'] = list(vecs1)

100%|███████████████████████████████████████████████████████████████████████████| 70000/70000 [07:11<00:00, 162.19it/s]


In [10]:
# IDF-weighted vectors for the 'question2' column of the train split.

# Vector width, probed once on a one-word text so that an empty document can
# still receive a zero vector (indexing doc[0] on an empty doc raises IndexError).
vec_dim = len(nlp('question')[0].vector)

vecs2 = []
for qu2 in tqdm(list(tr['question2'])):
    doc2 = nlp(qu2)
    # The feature is the SUM over tokens of (token vector * idf weight).
    # The earlier version added each weighted vector to every row of a
    # (n_tokens, dim) matrix and then took mean(axis=0), which reduces to this
    # same sum (up to float rounding). It is not normalised by the total weight.
    mean_vec2 = np.zeros(vec_dim)
    for word2 in doc2:
        # Tokens missing from the TF-IDF vocabulary contribute with weight 0.
        idf = word2tfidf.get(str(word2), 0)
        mean_vec2 += word2.vector * idf
    vecs2.append(mean_vec2)

# One vector (numpy array) per row, stored as an object column.
tr['q2_feats_m'] = list(vecs2)

100%|███████████████████████████████████████████████████████████████████████████| 70000/70000 [07:12<00:00, 161.69it/s]


In [11]:
# IDF-weighted vectors for the 'question1' column of the TEST split.
# The IDF weights in word2tfidf were fitted on the train questions only.
# tqdm prints the progress bar: https://github.com/noamraph/tqdm

# Vector width, probed once on a one-word text so that an empty document can
# still receive a zero vector (indexing doc[0] on an empty doc raises IndexError).
vec_dim = len(nlp('question')[0].vector)

vecs1 = []
for qu1 in tqdm(list(ts['question1'])):
    doc1 = nlp(qu1)
    # The feature is the SUM over tokens of (token vector * idf weight).
    # The earlier version added each weighted vector to every row of a
    # (n_tokens, dim) matrix and then took mean(axis=0), which reduces to this
    # same sum (up to float rounding). It is not normalised by the total weight.
    mean_vec1 = np.zeros(vec_dim)
    for word1 in doc1:
        # Tokens missing from the TF-IDF vocabulary contribute with weight 0.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1 += word1.vector * idf
    vecs1.append(mean_vec1)

# One vector (numpy array) per row, stored as an object column.
ts['q1_feats_m'] = list(vecs1)

100%|███████████████████████████████████████████████████████████████████████████| 30000/30000 [03:05<00:00, 161.72it/s]


In [12]:
# IDF-weighted vectors for the 'question2' column of the TEST split.
# The IDF weights in word2tfidf were fitted on the train questions only.

# Vector width, probed once on a one-word text so that an empty document can
# still receive a zero vector (indexing doc[0] on an empty doc raises IndexError).
vec_dim = len(nlp('question')[0].vector)

vecs2 = []
for qu2 in tqdm(list(ts['question2'])):
    doc2 = nlp(qu2)
    # The feature is the SUM over tokens of (token vector * idf weight).
    # The earlier version added each weighted vector to every row of a
    # (n_tokens, dim) matrix and then took mean(axis=0), which reduces to this
    # same sum (up to float rounding). It is not normalised by the total weight.
    mean_vec2 = np.zeros(vec_dim)
    for word2 in doc2:
        # Tokens missing from the TF-IDF vocabulary contribute with weight 0.
        idf = word2tfidf.get(str(word2), 0)
        mean_vec2 += word2.vector * idf
    vecs2.append(mean_vec2)

# One vector (numpy array) per row, stored as an object column.
ts['q2_feats_m'] = list(vecs2)

100%|███████████████████████████████████████████████████████████████████████████| 30000/30000 [03:06<00:00, 161.01it/s]


In [13]:
# Both splits now carry the two extra vector columns (q1_feats_m, q2_feats_m).
tr.shape, ts.shape

((70000, 8), (30000, 8))

In [14]:
# Load the feature tables written by the earlier notebooks:
#   nlp_features_train.csv - NLP features
#   df_basicfe_train.csv   - simple preprocessing features
# Fail fast when a file is missing: every later cell needs both frames, so
# only printing a hint would merely postpone the failure to a NameError.
if not os.path.isfile('nlp_features_train.csv'):
    raise FileNotFoundError("download nlp_features_train.csv from drive or run previous notebook")
dfnlp = pd.read_csv("nlp_features_train.csv",encoding='latin-1')

if not os.path.isfile('df_basicfe_train.csv'):
    raise FileNotFoundError("download df_basicfe_train.csv from drive or run previous notebook")
dfppro = pd.read_csv("df_basicfe_train.csv",encoding='latin-1')

In [15]:
# Draw the same-sized (100k) sample from both feature tables with the same
# fixed seed, so that re-running the cell always selects the same rows.
# NOTE(review): this presumably picks the same rows as the raw-text sample
# only if these CSVs have the same length and row order as train.csv - confirm.
df1, df2 = (
    feature_table.sample(n=100000, random_state=1)
    for feature_table in (dfnlp, dfppro)
)
df1.shape, df2.shape

((100000, 21), (100000, 17))

In [16]:
# Split both feature tables 70/30 with the same seed and label stratification
# that was used for the raw-text split.
df_tr1, df_ts1 = train_test_split(
    df1, test_size=0.3, random_state=1, stratify=df1['is_duplicate'].values
)
df_tr2, df_ts2 = train_test_split(
    df2, test_size=0.3, random_state=1, stratify=df2['is_duplicate'].values
)
df_tr1.shape, df_tr2.shape, df_ts1.shape, df_ts2.shape

((70000, 21), (70000, 17), (30000, 21), (30000, 17))

In [17]:
# Remove the question ids and raw text from the feature tables.
# The NLP table keeps 'is_duplicate' (the label); the basic table drops it too,
# so the label is not duplicated when the two tables are merged on 'id'.
# NOTE(review): the frames are rebound, so running this cell twice raises KeyError.
df_tr1 = df_tr1.drop(columns=['qid1','qid2','question1','question2'])
df_ts1 = df_ts1.drop(columns=['qid1','qid2','question1','question2'])
df_tr2 = df_tr2.drop(columns=['qid1','qid2','question1','question2','is_duplicate'])
df_ts2 = df_ts2.drop(columns=['qid1','qid2','question1','question2','is_duplicate'])

In [28]:
# Expand the per-row vectors into wide frames (one column per vector
# dimension), keeping the original row index of each split.
# `df3` is a scratch frame: first the train split, then rebound to the test split.

# --- train split -> df3_q1 / df3_q2 ---
df3 = tr.drop(columns=['qid1','qid2','question1','question2','is_duplicate'])
df3_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

# --- test split -> df4_q1 / df4_q2 ---
df3 = ts.drop(columns=['qid1','qid2','question1','question2','is_duplicate'])
df4_q1 = pd.DataFrame(df3['q1_feats_m'].tolist(), index=df3.index)
df4_q2 = pd.DataFrame(df3['q2_feats_m'].tolist(), index=df3.index)

In [29]:
# Advanced NLP features of the train split (still includes the 'is_duplicate' label).
df_tr1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
149483,149483,0,0.499975,0.249994,0.499975,0.249994,0.333328,0.333328,0.0,1.0,0.0,6.0,65,42,56,63,0.354839
146085,146085,0,0.249994,0.124998,0.0,0.0,0.166664,0.066666,1.0,0.0,9.0,10.5,36,35,33,44,0.181818
337094,337094,1,0.666644,0.399992,0.399992,0.333328,0.499994,0.333331,1.0,0.0,4.0,10.0,68,61,56,65,0.257143
115033,115033,0,0.599988,0.374995,0.28571,0.22222,0.416663,0.238094,0.0,0.0,9.0,16.5,66,54,40,52,0.175439
190104,190104,1,0.666644,0.666644,0.666644,0.499988,0.666656,0.57142,0.0,1.0,1.0,6.5,83,79,79,76,0.358974


In [30]:
# Advanced NLP features of the test split (still includes the 'is_duplicate' label).
df_ts1.head()

Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
303972,303972,0,0.833319,0.555549,0.999986,0.636358,0.857137,0.599997,0.0,0.0,6.0,17.0,95,69,50,68,0.555556
72206,72206,1,0.499992,0.499992,0.333322,0.199996,0.363633,0.333331,1.0,0.0,1.0,11.5,65,61,58,58,0.216667
106335,106335,1,0.999983,0.857131,0.999986,0.874989,0.999992,0.866661,0.0,1.0,2.0,14.0,100,92,92,100,0.985507
268194,268194,0,0.583328,0.538457,0.636358,0.583328,0.499998,0.466665,0.0,0.0,2.0,29.0,76,65,54,56,0.153285
33364,33364,0,0.333322,0.166664,0.0,0.0,0.166664,0.090908,0.0,0.0,5.0,8.5,50,48,48,55,0.181818


In [31]:
# Basic features of the train split.
df_tr2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
149483,149483,1,1,30,37,6,6,2.0,12.0,0.166667,2,0
146085,146085,2,1,66,32,15,6,1.0,20.0,0.05,3,1
337094,337094,3,4,50,34,12,8,4.0,19.0,0.210526,7,1
115033,115033,1,1,56,96,12,19,4.0,27.0,0.148148,2,0
190104,190104,1,1,42,38,7,6,4.0,13.0,0.307692,2,0


In [32]:
# Basic features of the test split.
df_ts2.head()

Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
303972,303972,1,1,60,102,13,19,11.0,31.0,0.354839,2,0
72206,72206,3,2,61,59,10,11,3.0,19.0,0.157895,5,1
106335,106335,1,1,79,68,15,12,10.0,27.0,0.37037,2,0
268194,268194,1,1,133,140,26,30,11.0,49.0,0.22449,2,0
33364,33364,1,1,32,58,6,11,1.0,17.0,0.058824,2,0


In [33]:
# Question 1 tf-idf weighted word vectors, train split (one column per dimension).
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
149483,-35.97082,15.233681,-83.94503,1.821328,33.741198,44.038382,-10.648363,-7.947617,49.464797,-49.422297,...,-12.500348,36.124373,-23.047068,46.039429,-60.616644,-14.279125,1.455213,-50.320586,0.711188,-40.428919
146085,-85.020419,46.710926,-125.288743,-75.608168,64.469008,115.124894,20.742659,158.109579,-102.035777,-23.369245,...,-166.052082,63.268312,-30.95924,50.607657,-32.15372,-24.559688,61.468883,-91.269326,-125.749991,-35.600178
337094,-33.065192,-20.178773,-77.886197,-120.081859,43.659595,49.50405,-0.36568,7.494594,-37.327102,-67.218828,...,-41.573335,61.387205,-54.091007,61.049627,-16.642347,-56.006326,84.121395,-91.712856,-2.291135,73.972443
115033,-72.944806,-51.25323,-80.234965,-67.835021,104.177971,58.757157,-34.426265,13.135962,35.04246,-3.93522,...,-70.013921,81.913096,41.23677,71.686704,-77.300308,-24.61709,25.184338,-113.626988,-2.613044,-39.260321
190104,-29.736965,56.128536,-25.453947,-74.479012,-19.861213,40.580078,-90.739224,58.63896,4.982964,-60.430177,...,-7.056295,77.465605,15.612841,27.9028,-65.106652,20.54373,40.426446,-107.896188,-46.825787,53.716711


In [35]:
# Question 1 tf-idf weighted word vectors, test split (one column per dimension).
df4_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
303972,-35.086464,30.876545,-80.416684,-106.096214,47.108919,29.962707,-74.211906,76.222053,-52.817481,-26.348005,...,-128.259679,86.027187,-15.743384,14.056966,0.832484,52.07639,13.275423,-141.048609,-109.166131,-5.137598
72206,1.685434,-36.735866,-196.028876,-140.657811,-43.790299,-9.156165,6.992758,-19.499774,-33.655322,-12.672375,...,-123.773371,57.898876,-66.728941,-9.11992,-0.521965,-26.317729,32.656572,-50.538371,16.680108,81.163604
106335,-111.489742,22.646898,-196.95207,-150.55791,126.045632,76.895359,-19.473369,73.001146,-117.053125,-64.015255,...,-124.569062,173.180535,6.944051,44.30668,-11.721601,88.717609,71.904813,-135.637861,-55.131707,-76.251671
268194,-41.903405,-199.639366,-153.416367,-153.148267,-19.763226,99.206988,8.039918,34.153714,-109.584684,23.843086,...,-179.908821,62.718726,-18.323001,128.238268,-68.955381,107.098512,165.807786,-116.754054,-123.739345,-69.575305
33364,-76.174544,-46.639612,-59.099957,-51.663911,27.144306,25.683623,-2.553842,-23.118165,-23.407722,-58.68586,...,2.305467,57.517286,20.301194,121.262572,-42.335856,42.225121,85.550532,-61.283083,-38.63779,10.311416


In [34]:
# Question 2 tf-idf weighted word vectors, train split (one column per dimension).
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
149483,-33.364869,-5.495585,-87.12227,-43.383327,43.409313,8.358389,-9.309176,37.756822,41.297043,-72.184926,...,-24.634324,107.105604,-13.405043,89.338409,-76.582135,81.455045,-19.58937,-53.108553,-42.871799,50.53386
146085,-1.638967,39.372217,-87.395557,-33.371412,18.65948,44.3817,15.707307,36.774686,-23.186097,-87.426189,...,79.361151,47.821262,-7.425674,66.802753,-13.87027,-94.35944,46.116241,-67.960821,-47.144685,-56.180332
337094,-39.340501,-53.392284,-44.010859,-74.687828,43.536204,60.482706,-26.103947,-34.727356,46.506368,-86.78762,...,1.134499,58.792493,-44.008875,28.522468,21.318031,-37.849338,17.109098,-174.242894,5.341458,-8.428313
115033,-122.364968,35.770705,-87.295173,-171.742449,127.268932,139.834739,-8.106615,30.278763,-150.290404,-143.008661,...,-196.619944,156.978629,163.932592,189.862594,-93.918032,-37.407774,54.468218,-167.026241,-149.009252,16.012905
190104,-9.906318,17.387377,-17.30345,-41.914522,41.149178,47.610587,-85.934912,74.996205,-38.504456,-89.565776,...,4.403876,102.640468,23.684351,62.863966,-1.83352,3.923125,69.741323,-117.703127,-18.816606,6.477832


In [36]:
# Question 2 tf-idf weighted word vectors, test split (one column per dimension).
df4_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,86,87,88,89,90,91,92,93,94,95
303972,-94.364372,82.517343,-207.351414,-123.893591,85.709126,88.650925,-86.536333,154.028366,-12.134686,-10.402775,...,-144.991185,222.946762,7.811561,-37.996282,-67.199225,-21.514719,39.070854,-166.003348,-184.055494,5.645808
72206,0.680685,-65.595772,-17.650448,-48.919356,70.625238,111.967245,-40.600397,-7.394346,-7.762665,-91.553182,...,39.345638,84.567552,-28.47992,26.782872,15.819899,0.026018,-9.827171,-171.505975,-69.218734,68.290094
106335,-117.09996,33.590637,-171.654092,-91.793823,134.984977,95.909769,0.91168,68.959064,-109.523601,-50.480547,...,-131.088148,135.252509,-29.426335,28.865457,14.432708,23.914983,39.755017,-105.470546,-41.418499,-25.264727
268194,58.616156,-77.750569,-253.6617,-201.832817,109.530422,183.692623,18.180422,199.613442,-59.008171,-41.821395,...,-279.122597,196.022612,-99.552281,86.235594,80.645449,-50.810555,113.096337,-237.795976,-53.090529,-100.635119
33364,-77.772708,-12.111855,-121.301187,-31.978529,73.371403,92.359309,-14.790487,17.243972,-18.429706,32.780807,...,-77.334339,98.339507,46.067258,-36.468406,-7.591038,-32.041288,-63.03463,-96.130894,11.901086,38.803704


In [25]:
# Column count of every train feature frame, followed by their plain sum.
# The sum counts each frame's columns independently, so the merged table can
# end up with a different width because the merge key is shared between frames.
feature_counts = [
    ("Number of features in nlp dataframe :", df_tr1.shape[1]),
    ("Number of features in preprocessed dataframe :", df_tr2.shape[1]),
    ("Number of features in question1 w2v  dataframe :", df3_q1.shape[1]),
    ("Number of features in question2 w2v  dataframe :", df3_q2.shape[1]),
]
for label, n_columns in feature_counts:
    print(label, n_columns)
print("Number of features in final dataframe  :", sum(n_columns for _, n_columns in feature_counts))

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 96
Number of features in question2 w2v  dataframe : 96
Number of features in final dataframe  : 221


In [27]:
# Build the final train feature table and store it as CSV.
# The work is skipped when the output file already exists.
if not os.path.isfile('tr_finalfeatures_tfidf_w2v.csv'):
    # Attach the pair 'id' to the tf-idf word-vector frames. The assignment
    # aligns on the row index, which the vector frames share with the nlp/basic
    # frames. .assign returns copies, so df3_q1 / df3_q2 themselves stay
    # unchanged and the cells that display or count their columns give the
    # same result no matter when they are run.
    q1_vecs = df3_q1.assign(id=df_tr1['id'])
    q2_vecs = df3_q2.assign(id=df_tr1['id'])
    # A row label missing from df_tr1 would silently produce a NaN id and a
    # broken merge below, so stop with an explicit error instead.
    if q1_vecs['id'].isna().any() or q2_vecs['id'].isna().any():
        raise ValueError("index of the tfidf-w2v train frames does not match df_tr1; cannot assign 'id'")

    # Merge the train basic and nlp feature
    df1  = df_tr1.merge(df_tr2, on='id',how='left')
    print('Total df1 features: {0}'.format(df1.shape))

    # Merge the train tfidf-w2v question1 and question2
    df2  = q1_vecs.merge(q2_vecs, on='id',how='left')
    print('Total df2 features: {0}'.format(df2.shape))

    # Merge above two dataframe
    result  = df1.merge(df2, on='id',how='left')

    print('Total features: {0}'.format(result.shape))
    result.to_csv('tr_finalfeatures_tfidf_w2v.csv')

Total df1 features: (70000, 28)
Total df2 features: (70000, 193)
Total features: (70000, 220)


In [37]:
# Build the final test feature table and store it as CSV.
# The work is skipped when the output file already exists.
if not os.path.isfile('ts_finalfeatures_tfidf_w2v.csv'):
    # Attach the pair 'id' to the tf-idf word-vector frames. The assignment
    # aligns on the row index, which the vector frames share with the nlp/basic
    # frames. .assign returns copies, so df4_q1 / df4_q2 themselves stay
    # unchanged and the cells that display them give the same result no matter
    # when they are run.
    q1_vecs = df4_q1.assign(id=df_ts1['id'])
    q2_vecs = df4_q2.assign(id=df_ts1['id'])
    # A row label missing from df_ts1 would silently produce a NaN id and a
    # broken merge below, so stop with an explicit error instead.
    if q1_vecs['id'].isna().any() or q2_vecs['id'].isna().any():
        raise ValueError("index of the tfidf-w2v test frames does not match df_ts1; cannot assign 'id'")

    # Merge the test basic and nlp feature
    df1  = df_ts1.merge(df_ts2, on='id',how='left')
    print('Total df1 features: {0}'.format(df1.shape))

    # Merge the test tfidf-w2v question1 and question2
    df2  = q1_vecs.merge(q2_vecs, on='id',how='left')
    print('Total df2 features: {0}'.format(df2.shape))

    # Merge above two dataframe
    result  = df1.merge(df2, on='id',how='left')
    print('Total features: {0}'.format(result.shape))
    result.to_csv('ts_finalfeatures_tfidf_w2v.csv')

Total df1 features: (30000, 28)
Total df2 features: (30000, 193)
Total features: (30000, 220)
