## 5.4 Featurizing text data with tfidf weighted word-vectors

In [None]:
# Mount Google Drive at /content/drive so the data files read and written
# below (under /content/drive/MyDrive/...) are reachable. Colab-only.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import time
import warnings
import numpy as np
from nltk.corpus import stopwords
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")
import sys
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

# extract word2vec vectors
# https://github.com/explosion/spaCy/issues/1721
# http://landinghub.visualstudio.com/visual-cpp-build-tools
import spacy

In [None]:
# Load the Quora training pairs from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/Projects/Quora/Data/train.csv")

# Coerce both question columns to plain Python 3 `str`, so every value
# (missing ones included) is text by the time it reaches the vectorizer
# and spaCy. Background: https://stackoverflow.com/a/6812069
df['question1'] = df['question1'].apply(str)
df['question2'] = df['question2'].apply(str)

In [None]:
# Preview the first rows of the loaded question pairs.
df.head()

Unnamed: 0.1,Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,140651,223482,72855,Does the Quran state that the earth is flat?,How do I prove the flat earth theory?,0
1,1,350470,479204,479205,"My car back glass has been broken, how can I c...",Which is the best car insurance company in India?,0
2,2,103454,170990,170991,Why do my teeth shift even though I wear my re...,Orthodontics: Will my retainer continue to str...,0
3,3,377861,509201,509202,I've been eating white rice with tuna and 1 av...,Tuna is one of the most convenient protein sou...,0
4,4,40424,73144,73145,Why is oil the predominate way of heating home...,What do New Yorkers think of the rest of the U...,0


In [None]:
# Drop the "Unnamed: 0" column seen in df.head() (presumably an index column
# written by an earlier to_csv -- TODO confirm).
# Rebinding with errors="ignore" instead of inplace=True keeps the cell safe
# to re-run: a second execution no longer raises KeyError once the column is gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Fit a single vocabulary over both question columns so that a word gets the
# same IDF weight whichever side of a pair it appears on.
questions = list(df['question1']) + list(df['question2'])

tfidf = TfidfVectorizer(lowercase=False, )
# Only the learned vocabulary and IDF weights are used below, so fit() is
# enough; the document-term matrix from fit_transform() was being discarded.
tfidf.fit(questions)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, "get_feature_names_out"):
    vocab_terms = tfidf.get_feature_names_out()
else:
    vocab_terms = tfidf.get_feature_names()

# dict key: vocabulary term, value: its IDF weight (tfidf.idf_).
# Note these are corpus-level IDF values, not per-document tf-idf scores.
word2tfidf = dict(zip(vocab_terms, tfidf.idf_))

- After computing the TF-IDF scores, we convert each question into a weighted average of its word2vec vectors, using these scores as weights.
- Here we use a pre-trained GloVe model that ships with spaCy: https://spacy.io/usage/vectors-similarity
- It is trained on Wikipedia and is therefore stronger in terms of word semantics.

In [None]:
!python -m spacy download en_core_web_lg
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
!python -m spacy download en

2022-08-27 10:25:36.841012: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.0/en_core_web_lg-3.4.0-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 7.7 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
2022-08-27 10:26:18.850849: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/pub

In [None]:
# Load the large English spaCy pipeline (en_core_web_lg). Its token vectors
# are 300-dimensional, which the check at the bottom of this cell displays.
nlp = spacy.load('en_core_web_lg')
# Width of the vector table; taken from the vocab so an empty doc cannot
# break the accumulator allocation (the old doc1[0] lookup raised IndexError).
vec_dim = nlp.vocab.vectors_length

vecs1 = []
# tqdm (https://github.com/tqdm/tqdm) renders the progress bar
for qu1 in tqdm(list(df['question1'])):
    doc1 = nlp(qu1)
    # IDF-weighted SUM of the token vectors. Despite the name, the previous
    # [len(doc), dim] matrix followed by .mean(axis=0) produced exactly this
    # sum (every row was identical), so the feature values are unchanged up
    # to floating-point rounding -- only the wasted work is gone.
    mean_vec1 = np.zeros(vec_dim)
    for word1 in doc1:
        # Tokens missing from the TF-IDF vocabulary contribute nothing.
        # dict.get replaces the bare `except:` so real errors are not hidden.
        idf = word2tfidf.get(str(word1), 0)
        mean_vec1 += word1.vector * idf
    vecs1.append(mean_vec1)
df['q1_feats_m'] = list(vecs1)

# Sanity check: display the dimensionality of a single word vector.
x=nlp('man')
len(x.vector)

100%|██████████| 80000/80000 [12:29<00:00, 106.71it/s]


300

In [None]:
# Build the IDF-weighted vector for every question2, mirroring question1.
# The accumulator used to be shaped with len(doc1) -- stale state left over
# from the question1 cell -- instead of anything belonging to doc2. Sizing it
# from the vector table removes that hidden cross-cell dependency and also
# tolerates an empty doc (doc2[0] raised IndexError).
vec_dim = nlp.vocab.vectors_length

vecs2 = []
for qu2 in tqdm(list(df['question2'])):
    doc2 = nlp(qu2)
    # IDF-weighted SUM of the token vectors. The old matrix + .mean(axis=0)
    # yielded the same values (all rows were identical), so the features are
    # unchanged up to floating-point rounding.
    mean_vec2 = np.zeros(vec_dim)
    for word2 in doc2:
        # Tokens missing from the TF-IDF vocabulary contribute nothing.
        # dict.get replaces the bare `except:` so real errors are not hidden.
        idf = word2tfidf.get(str(word2), 0)
        mean_vec2 += word2.vector * idf
    vecs2.append(mean_vec2)
df['q2_feats_m'] = list(vecs2)

100%|██████████| 80000/80000 [12:02<00:00, 110.65it/s]


In [None]:
# Persist the frame, including the q1_feats_m / q2_feats_m vector columns
# added above, to Drive.
# NOTE(review): the default to_csv also writes the index as an extra unnamed
# column, and the array-valued columns are stored as text -- confirm that is
# what downstream readers expect.
df.to_csv("/content/drive/MyDrive/Projects/Quora/Data/df_with_tfidf_w2v.csv")

In [None]:
# Load the two feature tables that are expected to exist already on Drive:
#   nlp_features_train.csv                 -> dfnlp  (NLP features)
#   df_fe_without_preprocessing_train.csv  -> dfppro (simple features, no preprocessing)
# When a file is absent, only a hint is printed and the frame is left undefined.
nlp_features_path = '/content/drive/MyDrive/Projects/Quora/Data/nlp_features_train.csv'
if not os.path.isfile(nlp_features_path):
    print("download nlp_features_train.csv from drive or run previous notebook")
else:
    dfnlp = pd.read_csv(nlp_features_path, encoding='latin-1')

basic_features_path = '/content/drive/MyDrive/Projects/Quora/Data/df_fe_without_preprocessing_train.csv'
if not os.path.isfile(basic_features_path):
    print("download df_fe_without_preprocessing_train.csv from drive or run previous notebook")
else:
    dfppro = pd.read_csv(basic_features_path, encoding='latin-1')

In [None]:
# Strip the raw question/id columns from each source so only features remain.
# The label (is_duplicate) is kept in df1 only.
question_cols = ['qid1', 'qid2', 'question1', 'question2']
question_and_label_cols = question_cols + ['is_duplicate']

df1 = dfnlp.drop(columns=question_cols)
df2 = dfppro.drop(columns=question_and_label_cols)
df3 = df.drop(columns=question_and_label_cols)

# Expand each per-row vector into one column per dimension, keeping df3's index.
df3_q1 = pd.DataFrame(df3['q1_feats_m'].values.tolist(), index=df3.index)
df3_q2 = pd.DataFrame(df3['q2_feats_m'].values.tolist(), index=df3.index)

In [None]:
# Preview df1: the NLP feature table (dfnlp without the raw question/id columns).
df1.head()

Unnamed: 0.1,Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,first_word_eq,abs_len_diff,mean_len,token_set_ratio,token_sort_ratio,fuzz_ratio,fuzz_partial_ratio,longest_substr_ratio
0,0,140651,0,0.499988,0.499988,0.249994,0.249994,0.374995,0.33333,0.0,0.0,1.0,8.5,67,66,46,46,0.184211
1,1,350470,0,0.599988,0.374995,0.0,0.0,0.33333,0.214284,0.0,0.0,5.0,11.5,57,50,23,40,0.22
2,2,103454,0,0.199998,0.181817,0.44444,0.333331,0.260868,0.239999,0.0,0.0,2.0,24.0,57,59,36,47,0.145299
3,3,377861,0,0.30769,0.266665,0.0,0.0,0.129032,0.129032,0.0,0.0,0.0,31.0,47,45,35,42,0.07483
4,4,40424,0,0.333328,0.249997,0.499988,0.399992,0.333331,0.285712,1.0,0.0,2.0,13.0,56,57,51,54,0.25


In [None]:
# Drop the "Unnamed: 0" column seen in df1.head().
# Rebinding with errors="ignore" instead of inplace=True keeps the cell safe
# to re-run: a second execution no longer raises KeyError once the column is gone.
df1 = df1.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Preview df2: the features loaded from df_fe_without_preprocessing_train.csv
# (dfppro without the raw question/id and label columns).
df2.head()

Unnamed: 0.1,Unnamed: 0,id,freq_qid1,freq_qid2,q1len,q2len,q1_n_words,q2_n_words,word_Common,word_Total,word_share,freq_q1+q2,freq_q1-q2
0,0,140651,1,1,44,37,9,8,2.0,16.0,0.125,2,0
1,1,350470,1,1,74,49,14,9,1.0,23.0,0.043478,2,0
2,2,103454,1,2,116,138,23,24,5.0,41.0,0.121951,3,1
3,3,377861,1,1,147,144,29,29,2.0,51.0,0.039216,2,0
4,4,40424,1,1,79,59,14,12,4.0,23.0,0.173913,2,0


In [None]:
# Drop the "Unnamed: 0" column seen in df2.head().
# Rebinding with errors="ignore" instead of inplace=True keeps the cell safe
# to re-run: a second execution no longer raises KeyError once the column is gone.
df2 = df2.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Preview the question1 IDF-weighted word-vector features (one column per vector dimension).
df3_q1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-6.988441,136.699944,-64.962242,20.36091,53.041836,29.885409,29.842797,137.623206,-26.18576,4.918103,...,53.311781,26.632405,31.411602,-50.244483,-69.979159,44.159736,7.612701,-99.815037,-35.019941,13.689423
1,57.439391,73.474271,-345.37683,-45.978133,231.126044,67.376631,148.759427,370.451036,71.177448,146.218071,...,105.860512,-227.328902,38.794714,98.353986,-158.062079,-29.902804,-42.089963,-67.943578,-145.414598,96.416114
2,-27.169425,421.893477,-478.078846,-130.702125,201.496696,35.241516,-3.999662,662.491184,-210.086991,159.738621,...,136.429602,24.846999,223.781298,-227.510228,-287.893507,-28.617904,-93.935821,-16.023693,-523.807581,28.379155
3,-89.639229,-40.366389,-451.323366,67.61708,304.08465,-229.659922,154.073569,555.175209,-205.829925,150.167173,...,238.218881,-124.155453,101.27158,-206.799933,-422.841403,179.811404,187.300439,-35.93389,-404.81514,255.676317
4,-236.9747,99.283254,-262.79808,145.704787,328.20675,38.163662,54.575788,238.210706,85.459172,118.729166,...,15.544833,-20.633789,-78.907012,42.198698,-261.903408,73.478417,77.485029,-249.991332,-61.058012,95.324792


In [None]:
# Preview the question2 IDF-weighted word-vector features (one column per vector dimension).
df3_q2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-34.115527,80.969918,-22.877679,-6.415809,-2.166809,43.137141,24.145206,148.995495,-57.182735,74.233633,...,80.475596,-39.830429,72.818719,-5.28267,-57.8668,-34.641398,3.752136,75.783305,-40.521464,46.867214
1,-17.849913,29.668285,-124.85446,-21.46937,135.486501,25.988836,51.685142,101.562422,71.950542,13.018574,...,-14.732463,-150.78122,53.148583,73.516857,-173.928576,-21.251884,-23.844391,-89.964827,-10.483215,70.213638
2,-188.418771,350.403472,-348.036999,-7.075688,396.282342,-27.324447,56.58153,782.914326,-54.66001,166.901965,...,136.21285,-142.54917,27.160261,-137.058516,-223.45566,37.129406,-13.365226,-70.400027,-575.673654,40.029448
3,80.540976,390.646902,-497.244401,-183.268348,412.617537,116.35009,139.852291,463.450724,-413.705747,-33.112804,...,337.048848,-188.137862,239.545886,-147.171152,-352.394971,152.980919,124.964462,199.469344,-405.938959,338.605843
4,-183.082525,75.220117,-84.167647,175.695247,311.344178,-7.739691,53.554088,115.075061,56.086958,35.370595,...,116.865741,13.774787,58.744463,13.403347,-111.818721,29.643483,2.331652,-140.133979,-96.233456,29.628152


In [None]:
# Report how many columns each feature frame contributes, then their plain sum.
nlp_cols = len(df1.columns)
basic_cols = len(df2.columns)
q1_cols = len(df3_q1.columns)
q2_cols = len(df3_q2.columns)

print("Number of features in nlp dataframe :", nlp_cols)
print("Number of features in preprocessed dataframe :", basic_cols)
print("Number of features in question1 w2v  dataframe :", q1_cols)
print("Number of features in question2 w2v  dataframe :", q2_cols)
print("Number of features in final dataframe  :", nlp_cols + basic_cols + q1_cols + q2_cols)

Number of features in nlp dataframe : 17
Number of features in preprocessed dataframe : 12
Number of features in question1 w2v  dataframe : 300
Number of features in question2 w2v  dataframe : 300
Number of features in final dataframe  : 629


In [None]:
# Build and store the final feature table, but only if final_features.csv does
# not exist yet (an existing file is never refreshed by this cell).
if not os.path.isfile('/content/drive/MyDrive/Projects/Quora/Data/final_features.csv'):
    # Attach the pair id to both vector frames so they can be joined on it.
    # NOTE(review): this assignment aligns on the row index, so it assumes df1
    # and the vector frames share the same index/row order -- TODO confirm.
    df3_q1['id']=df1['id']
    df3_q2['id']=df1['id']
    # From here on df1 and df2 are rebound: df1 becomes NLP + basic features,
    # df2 becomes question1 + question2 vectors. Statement order matters.
    df1  = df1.merge(df2, on='id',how='left')
    # NOTE(review): both vector frames use the same column labels, so the
    # merged columns presumably get pandas' default suffixes -- verify the
    # resulting names against whatever reads final_features.csv.
    df2  = df3_q1.merge(df3_q2, on='id',how='left')
    # Left-join everything on id and write the result to Drive.
    result  = df1.merge(df2, on='id',how='left')
    result.to_csv('/content/drive/MyDrive/Projects/Quora/Data/final_features.csv')