In [None]:
# Mount Google Drive at /content/drive (Colab-only) so the project files referenced below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch into the project workspace; every relative path used later in this notebook resolves against it.
%cd /content/drive/MyDrive/Home_Depot_Case_Study/Workspace3
!pwd

/content/drive/MyDrive/Home_Depot_Case_Study/Workspace3
/content/drive/MyDrive/Home_Depot_Case_Study/Workspace3


In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
from matplotlib_venn import venn2
import regex as re
from collections import Counter
!pip install nltk 
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
from collections import Counter
from nltk.stem import PorterStemmer 
from wordcloud import WordCloud, STOPWORDS 
from prettytable import PrettyTable
from sklearn.feature_extraction.text import CountVectorizer 
from textblob import TextBlob
from scipy.stats import spearmanr
import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


**Reading the Data**

In [None]:
# Load the preprocessed train/test frames.
# Sanity check: the leading index labels should be [59217, 49176, 26412, 3850, 48569] for train
# and [70761, 65893, 18905, 32031, 67726] for test.
# NOTE(review): cleaned_df2_* is presumably the non-stemmed variant used for the W2V features — confirm.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only load pickles produced by this project.
cleaned_df_train = pd.read_pickle('preprocessing/cleaned_df_train.pkl')
cleaned_df2_train = pd.read_pickle('preprocessing/cleaned_df2_train.pkl')

cleaned_df_test = pd.read_pickle('preprocessing/cleaned_df_test.pkl')
cleaned_df2_test = pd.read_pickle('preprocessing/cleaned_df2_test.pkl')

In [None]:
# Report (rows, columns) for each loaded frame, in the order they were read.
for frame in (cleaned_df_train, cleaned_df2_train, cleaned_df_test, cleaned_df2_test):
  print(frame.shape)

(59253, 8)
(59253, 8)
(14814, 8)
(14814, 8)


# Feature Engineering 

In [None]:
import math
import re
import xgboost as xgb
from xgboost import XGBRegressor
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from numpy.linalg import norm
import pickle
from tqdm.notebook import tqdm
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Feature Set 1 - Set Theoretic Features

- common words and their count between query and document fields (title, brand, description)
- Cosine coefficient and Jaccard coefficient between query and document fields
- Length of the query 
- Lengths of document fields 
- whether the last word in query is in text fields 

## Creating the Features

In [None]:
# Work on copies so the feature columns added below do not leak into the cleaned source frames.
data1_train = cleaned_df_train.copy()
data1_test = cleaned_df_test.copy()

# Shapes should match the cleaned frames at this point (no features added yet).
print(data1_train.shape)
print(data1_test.shape)

(59253, 8)
(14814, 8)


**BASIC SET OPERATIONS**

In [None]:
def common_words(df, col1, col2):
  """
  Return the words shared by col1 and col2 for every row of df.

  Each cell is split on whitespace and treated as a set of words, so repeated
  words inside a cell count once.

  Parameters
  ----------
  df : DataFrame holding both columns; every cell must be a string.
  col1, col2 : names of the two text columns to compare.

  Returns
  -------
  list of str, one entry per row in row order. Each entry is the shared words
  joined by a single space in sorted order ('' when nothing is shared).
  """
  common_list = []
  # zip over the two columns avoids building a Series per row as iterrows() does
  for text1, text2 in zip(df[col1], df[col2]):
    shared = set(text1.split()) & set(text2.split())
    # sorted() keeps the output reproducible: iteration order of a set of
    # strings can change from one interpreter run to the next
    common_list.append(' '.join(sorted(shared)))
  return common_list

def cosine_similarity_sent(sent1, sent2):
  """
  Set-based cosine similarity of two sentences.

  Both sentences are split on whitespace into sets of distinct words; the
  result is |A & B| / (sqrt(|A|) * sqrt(|B|)). Returns 0.0 when either
  sentence has no words.
  """
  words1, words2 = set(sent1.split()), set(sent2.split())
  scale = math.sqrt(len(words1)) * math.sqrt(len(words2))

  # a zero scale means at least one side is empty
  if scale:
    return len(words1 & words2) / scale
  return 0.0

def jacquard_coefficient_sent(sent1, sent2):
  """
  Jaccard coefficient of two sentences viewed as sets of words.

  Computed as |A & B| / |A | B| after splitting both sentences on whitespace.
  Returns 0.0 when both sentences are empty.
  """
  words1, words2 = set(sent1.split()), set(sent2.split())
  union_size = len(words1 | words2)

  # an empty union only happens when neither sentence has any word
  if union_size:
    return len(words1 & words2) / union_size
  return 0.0

In [None]:
# Every feature family is built for the corrected query first and the raw query second,
# each against title / description / brand, so columns are created in the same order as before.
search_variants = [('', 'corrected_search'), ('r_', 'raw_search')]
doc_fields = [('ST', 'title'), ('SD', 'description'), ('SB', 'brand')]

#COMMON WORDS
for tag, search_col in search_variants:
  for suffix, field in doc_fields:
    data1_train['common_' + tag + suffix] = common_words(data1_train, search_col, field)

#NUM OF COMMON WORDS
for tag, search_col in search_variants:
  for suffix, field in doc_fields:
    data1_train['num_common_' + tag + suffix] = [
        len(words.split()) for words in data1_train['common_' + tag + suffix]
    ]

#COSINE COEFFICIENT, then JACQUARD COEFFICIENT
for feat_name, similarity in (('cosine_', cosine_similarity_sent),
                              ('jacquard_', jacquard_coefficient_sent)):
  for tag, search_col in search_variants:
    for suffix, field in doc_fields:
      data1_train[feat_name + tag + suffix] = [
          similarity(query, text)
          for query, text in zip(data1_train[search_col], data1_train[field])
      ]

#LENGTH OF DOCUMENT FIELDS (whitespace-separated word counts)
for len_col, text_col in (('len_description', 'description'),
                          ('len_brand', 'brand'),
                          ('len_title', 'title'),
                          ('len_search', 'corrected_search'),
                          ('len_r_search', 'raw_search')):
  data1_train[len_col] = [len(text.split()) for text in data1_train[text_col]]

#PRESENCE OF LAST WORD IN DOC FIELDS
# NOTE: query.split()[-1] assumes every query has at least one word.
for tag, search_col in search_variants:
  for suffix, field in doc_fields:
    data1_train['islast_' + tag + suffix] = [
        query.split()[-1] in text.split()
        for query, text in zip(data1_train[search_col], data1_train[field])
    ]

#converting all the boolean column to int
bool_cols = ['islast_ST', 'islast_SD', 'islast_SB', 'islast_r_ST', 'islast_r_SD', 'islast_r_SB']
for col in bool_cols:
  data1_train[col] = data1_train[col].astype(int)

In [None]:
# Same feature recipe as the train cell, applied to the test frame.
# Every feature family is built for the corrected query first and the raw query second,
# each against title / description / brand, so columns are created in the same order as before.
search_variants = [('', 'corrected_search'), ('r_', 'raw_search')]
doc_fields = [('ST', 'title'), ('SD', 'description'), ('SB', 'brand')]

#COMMON WORDS
for tag, search_col in search_variants:
  for suffix, field in doc_fields:
    data1_test['common_' + tag + suffix] = common_words(data1_test, search_col, field)

#NUM OF COMMON WORDS
for tag, search_col in search_variants:
  for suffix, field in doc_fields:
    data1_test['num_common_' + tag + suffix] = [
        len(words.split()) for words in data1_test['common_' + tag + suffix]
    ]

#COSINE COEFFICIENT, then JACQUARD COEFFICIENT
for feat_name, similarity in (('cosine_', cosine_similarity_sent),
                              ('jacquard_', jacquard_coefficient_sent)):
  for tag, search_col in search_variants:
    for suffix, field in doc_fields:
      data1_test[feat_name + tag + suffix] = [
          similarity(query, text)
          for query, text in zip(data1_test[search_col], data1_test[field])
      ]

#LENGTH OF DOCUMENT FIELDS (whitespace-separated word counts)
for len_col, text_col in (('len_description', 'description'),
                          ('len_brand', 'brand'),
                          ('len_title', 'title'),
                          ('len_search', 'corrected_search'),
                          ('len_r_search', 'raw_search')):
  data1_test[len_col] = [len(text.split()) for text in data1_test[text_col]]

#PRESENCE OF LAST WORD IN DOC FIELDS
# NOTE: query.split()[-1] assumes every query has at least one word.
for tag, search_col in search_variants:
  for suffix, field in doc_fields:
    data1_test['islast_' + tag + suffix] = [
        query.split()[-1] in text.split()
        for query, text in zip(data1_test[search_col], data1_test[field])
    ]

#converting all the boolean column to int
bool_cols = ['islast_ST', 'islast_SD', 'islast_SB', 'islast_r_ST', 'islast_r_SD', 'islast_r_SB']
for col in bool_cols:
  data1_test[col] = data1_test[col].astype(int)

In [None]:
# Both frames should have gained the same number of feature columns.
for frame in (data1_train, data1_test):
  print(frame.shape)

(59253, 43)
(14814, 43)


In [None]:
# Column names for feature set 1, assembled family by family.
# The three document-length columns do not depend on the query variant, so they
# appear in both the corrected and the raw list.
pair_tags = ['ST', 'SD', 'SB']
doc_len_feats = ['len_description', 'len_brand', 'len_title']

corrected_feat_set = (
    ['num_common_' + tag for tag in pair_tags]
    + ['cosine_' + tag for tag in pair_tags]
    + ['jacquard_' + tag for tag in pair_tags]
    + doc_len_feats + ['len_search']
    + ['islast_' + tag for tag in pair_tags]
)

raw_feat_set = (
    ['num_common_r_' + tag for tag in pair_tags]
    + ['cosine_r_' + tag for tag in pair_tags]
    + ['jacquard_r_' + tag for tag in pair_tags]
    + doc_len_feats + ['len_r_search']
    + ['islast_r_' + tag for tag in pair_tags]
)

# Combined set: all corrected features followed by the raw features not already listed.
feat_set1_comb = corrected_feat_set + [name for name in raw_feat_set if name not in corrected_feat_set]

print(len(corrected_feat_set), len(raw_feat_set), len(feat_set1_comb))

16 16 29


In [None]:
# Persist the full train frame plus the three column subsets (corrected-query, raw-query, combined).
data1_train.to_pickle('featurization/train/feature_set1/data1_train.pkl')
data1_train[corrected_feat_set].to_pickle('featurization/train/feature_set1/data1_train_corrected.pkl')
data1_train[raw_feat_set].to_pickle('featurization/train/feature_set1/data1_train_raw.pkl')
data1_train[feat_set1_comb].to_pickle('featurization/train/feature_set1/data1_train_comb.pkl')

In [None]:
# Persist the full test frame plus the same three column subsets as for train.
data1_test.to_pickle('featurization/test/feature_set1/data1_test.pkl')
data1_test[corrected_feat_set].to_pickle('featurization/test/feature_set1/data1_test_corrected.pkl')
data1_test[raw_feat_set].to_pickle('featurization/test/feature_set1/data1_test_raw.pkl')
data1_test[feat_set1_comb].to_pickle('featurization/test/feature_set1/data1_test_comb.pkl')

# Feature Set 2 - VSM Based Features

* LSI on term document matrices of query, title and description separately and use the low-rank approximations as features directly. 
* Apply LSI on full text (combined title and description). Then, transform query into the ‘concept’ space and calculate cosine similarity between them. 
* Other similarity measures are the Jaccard coefficient and the inner product, computed on the same vectors.

## Creating the Features

In [None]:
# Fresh copies of the cleaned frames for the VSM (feature set 2) columns.
data2_train = cleaned_df_train.copy()
data2_test = cleaned_df_test.copy()

**LSI ON INDIVIDUAL DOCUMENT FIELDS**

In [None]:
# TF-IDF over the corrected search queries; the token pattern keeps single-character tokens.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=True, token_pattern=r"(?u)\b\w+\b")
X_search_train = tfidf_vectorizer.fit_transform(data2_train['corrected_search'])
print(X_search_train.shape) 

# Reduce the query term matrix to 2000 latent dimensions (LSI); fitted on train only.
svd_model = TruncatedSVD(n_components=2000, algorithm='randomized', n_iter=100, random_state=122)
truncated_search_train = svd_model.fit_transform(X_search_train)
print(truncated_search_train.shape)

# Share of variance retained by the kept components.
tsvd_var_ratios = svd_model.explained_variance_ratio_
print('variance explained', tsvd_var_ratios.sum())

# Save the fitted vectorizer and SVD so the same transform can be re-applied later.
# NOTE(review): models are written under 'Final/featurization/' while feature frames go under
# 'featurization/...' — confirm both directories are intended.
with open('Final/featurization/F2_tfidf_search.pkl','wb') as f:
  pickle.dump(tfidf_vectorizer, f)
with open('Final/featurization/F2_tsvd_search.pkl','wb') as f:
  pickle.dump(svd_model, f)

(59253, 5011)
(59253, 2000)
variance explained 0.8961633433416117


In [None]:
# Apply the query vectorizer/SVD fitted on train to the test queries (transform only, no refit).
# Must run before the title cell below, which rebinds tfidf_vectorizer and svd_model.
X_search_test = tfidf_vectorizer.transform(data2_test['corrected_search'])
print(X_search_test.shape)

truncated_search_test = svd_model.transform(X_search_test)
print(truncated_search_test.shape)

(14814, 5011)
(14814, 2000)


In [None]:
#TRAIN
# TF-IDF + 1000-component LSI on product titles.
# Rebinds tfidf_vectorizer / svd_model: from here on they hold the title models, not the query models.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=True, token_pattern=r"(?u)\b\w+\b")
X_title_train = tfidf_vectorizer.fit_transform(data2_train['title'])
print(X_title_train.shape) 

svd_model = TruncatedSVD(n_components=1000, algorithm='randomized', n_iter=100, random_state=122)
truncated_title_train = svd_model.fit_transform(X_title_train)
print(truncated_title_train.shape)

# Share of variance retained by the kept components.
tsvd_var_ratios = svd_model.explained_variance_ratio_
print('variance explained', tsvd_var_ratios.sum())

# Save the fitted title models for later reuse.
with open('Final/featurization/F2_tfidf_title.pkl','wb') as f:
  pickle.dump(tfidf_vectorizer, f)
with open('Final/featurization/F2_tsvd_title.pkl','wb') as f:
  pickle.dump(svd_model, f)

(59253, 13609)
(59253, 1000)
variance explained 0.6826035592293663


In [None]:
#TEST
# Project test titles with the title models fitted on train.
# Must run before the description cell below rebinds tfidf_vectorizer and svd_model.
X_title_test = tfidf_vectorizer.transform(data2_test['title'])
print(X_title_test.shape)

truncated_title_test = svd_model.transform(X_title_test)
print(truncated_title_test.shape)

(14814, 13609)
(14814, 1000)


In [None]:
#TRAIN
# TF-IDF + 1000-component LSI on product descriptions.
# Rebinds tfidf_vectorizer / svd_model: from here on they hold the description models.
tfidf_vectorizer = TfidfVectorizer(smooth_idf=True, token_pattern=r"(?u)\b\w+\b")
X_desc_train = tfidf_vectorizer.fit_transform(data2_train['description'])
print(X_desc_train.shape) 

svd_model = TruncatedSVD(n_components=1000, algorithm='randomized', n_iter=100, random_state=122)
truncated_desc_train = svd_model.fit_transform(X_desc_train)
print(truncated_desc_train.shape)

# Share of variance retained by the kept components.
tsvd_var_ratios = svd_model.explained_variance_ratio_
print('variance explained', tsvd_var_ratios.sum())

# Save the fitted description models for later reuse.
with open('Final/featurization/F2_tfidf_desc.pkl','wb') as f:
  pickle.dump(tfidf_vectorizer, f)
with open('Final/featurization/F2_tsvd_desc.pkl','wb') as f:
  pickle.dump(svd_model, f)

(59253, 27980)
(59253, 1000)
variance explained 0.6721646260276368


In [None]:
#TEST
# Project test descriptions with the description models fitted on train (transform only).
X_desc_test = tfidf_vectorizer.transform(data2_test['description'])
print(X_desc_test.shape)

truncated_desc_test = svd_model.transform(X_desc_test)
print(truncated_desc_test.shape)

(14814, 27980)
(14814, 1000)


In [None]:
# Concatenate the per-field LSI vectors column-wise in the order query | title | description
# (2000 + 1000 + 1000 columns) and keep the original row index so rows can be matched back.
trun_arr = np.hstack((truncated_search_train,truncated_title_train,truncated_desc_train))
truncated_df_train = pd.DataFrame(trun_arr, index=cleaned_df_train.index)
print(truncated_df_train.shape)
truncated_df_train.to_pickle('featurization/train/feature_set2/truncated_df_train.pkl')

(59253, 4000)


In [None]:
# Same column-wise stacking (query | title | description) for the test split; trun_arr is reused.
trun_arr = np.hstack((truncated_search_test,truncated_title_test,truncated_desc_test))
truncated_df_test = pd.DataFrame(trun_arr, index=cleaned_df_test.index)
print(truncated_df_test.shape)
truncated_df_test.to_pickle('featurization/test/feature_set2/truncated_df_test.pkl')

(14814, 4000)


**LSI ON COMBINED TITLE AND DESCRIPTION**

In [None]:
# One "full text" document per product: title followed by description, separated by a space.
title_desc_train = data2_train["title"].astype(str) + ' ' + data2_train["description"].astype(str)
title_desc_test = data2_test["title"].astype(str) + ' ' + data2_test["description"].astype(str)

In [None]:
# TF-IDF on the combined title+description text; min_df=2 drops terms seen in only one document.
vectorizer = TfidfVectorizer(smooth_idf=True, token_pattern=r"(?u)\b\w+\b", min_df=2)
X_title_desc_train = vectorizer.fit_transform(title_desc_train)
print(X_title_desc_train.shape) # check shape of the document-term matrix

# SVD represent documents and terms in vectors 
# Rebinds svd_model: it now holds the full-text "concept space" used for the similarity features.
svd_model = TruncatedSVD(n_components=2000, algorithm='randomized', n_iter=100, random_state=122)
truncated_title_desc_train = svd_model.fit_transform(X_title_desc_train)
print(truncated_title_desc_train.shape)

# Share of variance retained by the kept components.
tsvd_var_ratios = svd_model.explained_variance_ratio_
print('variance explained', tsvd_var_ratios.sum())

# Save the fitted full-text models for later reuse.
with open('Final/featurization/F2_tfidf_lsi.pkl','wb') as f:
  pickle.dump(vectorizer, f)
with open('Final/featurization/F2_tsvd_lsi.pkl','wb') as f:
  pickle.dump(svd_model, f)

(59253, 20806)
(59253, 2000)
variance explained 0.8093126258029886


In [None]:
# Map the train queries into the full-text concept space: same vocabulary (vectorizer) and
# same SVD as the title+description documents, so query and document vectors are comparable.
X_search_ = vectorizer.transform(data2_train['corrected_search'])
print(X_search_.shape) # check shape of the document-term matrix

transformed_search_train = svd_model.transform(X_search_)
transformed_search_train.shape

(59253, 20806)


(59253, 2000)

In [None]:
# Project the test title+description documents with the models fitted on train (transform only).
X_title_desc_test = vectorizer.transform(title_desc_test)
print(X_title_desc_test.shape)

truncated_title_desc_test = svd_model.transform(X_title_desc_test)
print(truncated_title_desc_test.shape)

(14814, 20806)
(14814, 2000)


In [None]:
# Map the test queries into the same full-text concept space; X_search_ is reused and now holds test data.
X_search_ = vectorizer.transform(data2_test['corrected_search'])
print(X_search_.shape) # check shape of the document-term matrix

transformed_search_test = svd_model.transform(X_search_)
transformed_search_test.shape

(14814, 20806)


(14814, 2000)

**Calculating similarity between them**

In [None]:
def cosine_similarity_vec(a, b):
  """
  Cosine similarity of two vectors: dot(a, b) / (||a|| * ||b||).

  Returns 0 when either vector has zero norm.
  """
  scale = norm(a)*norm(b)
  if scale == 0:
    return 0
  return np.dot(a, b)/scale

def jacquard_similarity_vec(a, b):
  """
  Jaccard-style similarity of two real-valued vectors:
  dot(a, b) / (||a||^2 + ||b||^2 - dot(a, b)).

  Returns 0 when the denominator is zero.
  """
  overlap = np.dot(a,b)
  total = norm(a)**2 + norm(b)**2 - overlap
  if total == 0:
    return 0
  return overlap/total

def inner_product_vec(a, b):
  """Plain (un-normalised) dot product of two vectors."""
  product = np.dot(a,b)
  return product

In [None]:
# Row i of truncated_title_desc_train (document) is compared with row i of
# transformed_search_train (query); both live in the same LSI concept space.
n_rows = len(transformed_search_train)

cos_sim = [cosine_similarity_vec(truncated_title_desc_train[i], transformed_search_train[i])
           for i in range(n_rows)]
print(len(cos_sim))
data2_train['lsi_cos_sim'] = cos_sim

jaq_sim = [jacquard_similarity_vec(truncated_title_desc_train[i], transformed_search_train[i])
           for i in range(n_rows)]
print(len(jaq_sim))
data2_train['lsi_jaq_sim'] = jaq_sim

inn_prod = [inner_product_vec(truncated_title_desc_train[i], transformed_search_train[i])
            for i in range(n_rows)]
print(len(inn_prod))
data2_train['lsi_inn_prod'] = inn_prod

# Keep only the three similarity columns; the text columns are not needed downstream of this cell.
lsi_cols = ['lsi_cos_sim', 'lsi_jaq_sim', 'lsi_inn_prod']
data2_train = data2_train[lsi_cols]
print(data2_train.shape)
data2_train.head(2)

59253
59253
59253
(59253, 3)


Unnamed: 0,lsi_cos_sim,lsi_jaq_sim,lsi_inn_prod
59217,0.371404,0.211167,0.195933
49176,0.336645,0.19993,0.24643


In [None]:
# Row i of truncated_title_desc_test (document) is compared with row i of
# transformed_search_test (query); both live in the same LSI concept space.
n_rows = len(transformed_search_test)

cos_sim = [cosine_similarity_vec(truncated_title_desc_test[i], transformed_search_test[i])
           for i in range(n_rows)]
print(len(cos_sim))
data2_test['lsi_cos_sim'] = cos_sim

jaq_sim = [jacquard_similarity_vec(truncated_title_desc_test[i], transformed_search_test[i])
           for i in range(n_rows)]
print(len(jaq_sim))
data2_test['lsi_jaq_sim'] = jaq_sim

inn_prod = [inner_product_vec(truncated_title_desc_test[i], transformed_search_test[i])
            for i in range(n_rows)]
print(len(inn_prod))
data2_test['lsi_inn_prod'] = inn_prod

# Keep only the three similarity columns; the text columns are not needed downstream of this cell.
lsi_cols = ['lsi_cos_sim', 'lsi_jaq_sim', 'lsi_inn_prod']
data2_test = data2_test[lsi_cols]
print(data2_test.shape)
data2_test.head(2)

14814
14814
14814
(14814, 3)


Unnamed: 0,lsi_cos_sim,lsi_jaq_sim,lsi_inn_prod
70761,0.148432,0.069174,0.04878
65893,0.553608,0.365412,0.320622


In [None]:
# Persist the three LSI similarity columns for each split.
data2_train.to_pickle('featurization/train/feature_set2/data2_train.pkl')
data2_test.to_pickle('featurization/test/feature_set2/data2_test.pkl')

# Feature Set 3 - Probabilistic features

- Language model with Dirichlet, Absolute Discounting and Jelinek-Mercer smoothing
- BM25 ranking function 
- Query and fields represented as tf-idf Word2Vec
- (sum, min, max) of (tf, normalized tf, tf-idf) for query in each text field 

## Creating the features

In [None]:
# Fresh copies of the cleaned frames for the probabilistic (feature set 3) columns.
data3_train = cleaned_df_train.copy()
data3_test = cleaned_df_test.copy()

**LANGUAGE MODEL**

**Note**

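* In the code for JM smoothing, probability = lambd\*p_ml + (1-lambd)\*p_c, where p_ml is the within-document estimate and p_c the collection estimate. So a larger lambd leans more on the document itself and overfits. We have set lambd to 0.9, i.e. a collection weight of 0.1.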
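* Also, the JM and Dirichlet scores are computed as the sum of log(prob) over the query words (not -log(prob)), so they are typically negative and a higher score means a better match; negating them would give features with a negative correlation with the relevance score.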
* In Dirichlet smoothing, the value of mu is set to 12, 106 and 1.5 for title, description and brand respectively, which are their average lengths in words.

In [None]:
def lmir_fit(corpus):
  """
  Collect collection-level word statistics for the language-model scores.

  corpus : iterable of strings (one document each).

  Returns a dict with
    'freq_dict'   : Counter mapping each whitespace-separated word to its count over the whole corpus
    'total_words' : total number of words in the corpus
  """
  all_words = ' '.join(corpus).split()
  return {
      'freq_dict': Counter(all_words),
      'total_words': len(all_words),
  }

def lmir_jm_score(query, doc, params, lambd):
  """
  Query log-likelihood under a Jelinek-Mercer smoothed document language model.

  For every query word: p = lambd*p_ml + (1-lambd)*p_c + eps, where p_ml is the
  word's relative frequency in doc, p_c its relative frequency in the fitted
  corpus (0 for unseen words) and eps = 0.0001/total_words keeps log() finite.

  query, doc : whitespace-separated strings
  params     : dict from lmir_fit ('freq_dict', 'total_words')
  lambd      : weight of the document model (1-lambd goes to the corpus model)

  Returns the sum of log(p) over the query words, or None when query or doc is empty.
  """
  query_terms = query.split()
  doc_terms = doc.split()
  if not doc_terms or not query_terms:
    return None

  corpus_counts = params['freq_dict']
  total = params['total_words']
  eps = 0.0001/total

  score = 0
  for term in query_terms:
    p_ml = doc_terms.count(term) / len(doc_terms)
    p_c = corpus_counts[term] / total if term in corpus_counts else 0
    score += np.log(lambd*p_ml + (1-lambd)*p_c + eps)
  return score

def lmir_dir_score(query, doc, params, mu):
  """
  Query log-likelihood under a Dirichlet-smoothed document language model.

  Same mixture as the JM score, but the document weight depends on document
  length: lambd = len(doc) / (len(doc) + mu), so longer documents rely more
  on their own word frequencies.

  query, doc : whitespace-separated strings
  params     : dict from lmir_fit ('freq_dict', 'total_words')
  mu         : Dirichlet prior strength

  Returns the sum of log(p) over the query words, or None when query or doc is empty.
  """
  query_terms = query.split()
  doc_terms = doc.split()
  if not doc_terms or not query_terms:
    return None

  corpus_counts = params['freq_dict']
  total = params['total_words']
  eps = 0.0001/total
  doc_len = len(doc_terms)
  # the document weight is the same for every query word
  lambd = doc_len / (doc_len + mu)

  score = 0
  for term in query_terms:
    p_ml = doc_terms.count(term) / doc_len
    p_c = corpus_counts[term] / total if term in corpus_counts else 0
    score += np.log(lambd*p_ml + (1-lambd)*p_c + eps)
  return score

def lmir_abs_score(query, doc, alpha):
  """
  Score a query against a document using smoothed document word probabilities.

  Every word count in doc is increased by alpha; query words missing from doc
  get a pseudo-count of alpha. Each query word's probability is its smoothed
  count divided by the sum of all smoothed counts, and the score is the SUM of
  these probabilities (not a log-likelihood).

  query, doc : whitespace-separated strings
  alpha      : pseudo-count added to every word

  Returns the score, or None when query or doc is empty. Returning None matches
  lmir_jm_score / lmir_dir_score; previously this case raised
  UnboundLocalError because `score` was only assigned inside the `if`.
  """
  query = query.split()
  doc = doc.split()
  if len(doc) == 0 or len(query) == 0:
    return None

  temp_dict = {k:v+alpha for k,v in Counter(doc).items()}
  # query words absent from the document only carry the pseudo-count
  for word in query:
    temp_dict.setdefault(word, alpha)

  denominator = sum(temp_dict.values())
  score = 0
  for word in query:
    score += temp_dict[word] / denominator
  return score

In [None]:
# Language-model features per document field. Collection statistics are fitted on the
# TRAIN column only and then used to score both train and test rows.
# mu is the Dirichlet prior for that field; JM uses lambd=0.9 and the AD score alpha=0.01 throughout.
lm_params = {}
for suffix, field, mu in (('ST', 'title', 12), ('SB', 'brand', 1.5), ('SD', 'description', 106)):
  corpus = data3_train[field].values
  field_params = lmir_fit(corpus)
  lm_params[field] = field_params

  # train first, then test, each getting JM / Dirichlet / AD in that order
  for frame in (data3_train, data3_test):
    frame['JM_' + suffix] = frame.apply(
        lambda row: lmir_jm_score(row['corrected_search'], row[field], field_params, 0.9), axis=1)
    frame['Dir_' + suffix] = frame.apply(
        lambda row: lmir_dir_score(row['corrected_search'], row[field], field_params, mu), axis=1)
    frame['AD_' + suffix] = frame.apply(
        lambda row: lmir_abs_score(row['corrected_search'], row[field], 0.01), axis=1)

# Named handles for the fitted statistics (pickled in the next cell).
params_title = lm_params['title']
params_brand = lm_params['brand']
params_desc = lm_params['description']

In [None]:
# Persist the fitted collection statistics (one per field) so the LM features can be recomputed later.
with open('Final/featurization/F3_LM_params_title.pkl','wb') as f:
  pickle.dump(params_title, f)
with open('Final/featurization/F3_LM_params_brand.pkl','wb') as f:
  pickle.dump(params_brand, f)
with open('Final/featurization/F3_LM_params_desc.pkl','wb') as f:
  pickle.dump(params_desc, f)

**BM25 RANKING FUNCTION**

* The default parameters k and b are generally set to 1.5 and 0.75 respectively. With a bit of hyperparameter tuning, we found k=0.1 and b=0.5 to work well. To compare settings, we checked the correlation of the BM25 features with relevance.

* Also, normalized tf didn't work well here. We got a correlation coefficient (with relevance) of 0.2 for the normalized tf and of about 0.245 for the simple tf.



In [None]:
def bm25_fit(corpus):
  """
  Fit the corpus statistics needed by bm25_score.

  corpus : sequence of document strings.

  Returns a dict with
    'idf_dict' : term -> idf, taken from a TfidfVectorizer fitted with smooth_idf=False
    'avgdl'    : mean document length in whitespace-separated words
    'N'        : number of documents
  """
  tfidf_model = TfidfVectorizer(smooth_idf=False, token_pattern=r"(?u)\b\w+\b")
  tfidf_model.fit(corpus)
  # vocabulary_ maps each term to its column index in idf_. It is available in every
  # scikit-learn release, unlike get_feature_names(), which was removed in 1.2.
  # Sorting by column index keeps the dict in feature order.
  idf_dict = {
      term: tfidf_model.idf_[col]
      for term, col in sorted(tfidf_model.vocabulary_.items(), key=lambda item: item[1])
  }
  avgdl = np.mean([len(doc.split()) for doc in corpus])
  params = {'idf_dict': idf_dict, 
            'avgdl' : avgdl,
            'N' : len(corpus)}
  return params

def bm25_score(query, doc, params, k=0.1, b=0.5):
  """Okapi BM25 score of `doc` for `query`.

  For each whitespace-separated query word the contribution is

      idf * tf * (k + 1) / (tf + k * (1 - b + b * dl / avgdl))

  and the contributions are summed over the query words (a repeated query
  word contributes once per repetition).

  Parameters
  ----------
  query : str
      Search string; split on whitespace.
  doc : str
      Document text. tf is `doc.count(word)`, i.e. a substring count on the
      raw text, so a query word also matches inside longer document words.
  params : dict
      Output of bm25_fit: 'idf_dict', 'avgdl' and 'N'.
  k, b : float
      BM25 saturation and length-normalisation parameters.
      NOTE(review): the defaults were chosen before the denominator was
      corrected (k used to scale only the (1 - b) term); re-validate them.

  Returns
  -------
  number
      The summed score; 0 for an empty query.
  """
  idf_dict = params['idf_dict']
  avgdl = params['avgdl']
  N = params['N']

  # Loop-invariant pieces: document length, its normalisation factor, and the
  # idf used for query words that were never seen when fitting.
  dl = len(doc.split())
  length_norm = 1 - b + b*dl/avgdl
  unseen_idf = np.log(N+1)

  score_query = 0
  for word in query.split():
    tf = doc.count(word)
    idf = idf_dict.get(word, unseen_idf)
    # k must scale the whole length-normalisation term, not just (1 - b).
    score_query += idf*(tf*(k+1))/(tf + k*length_norm)

  return score_query

In [None]:
# Fit one set of BM25 statistics per product text field, on the training
# frame only; the same statistics then score both train and test rows.
params_bm25_title = bm25_fit(data3_train['title'])
params_bm25_desc = bm25_fit(data3_train['description'])
params_bm25_brand = bm25_fit(data3_train['brand'])

# (feature column, text field, fitted statistics); columns are added in the
# order title, description, brand.
bm25_fields = (
  ('bm25_ST', 'title', params_bm25_title),
  ('bm25_SD', 'description', params_bm25_desc),
  ('bm25_SB', 'brand', params_bm25_brand),
)
for feature, field, field_params in bm25_fields:
  for frame in (data3_train, data3_test):
    frame[feature] = frame.apply(
        lambda row: bm25_score(row['corrected_search'], row[field], field_params),
        axis=1)

In [None]:
# Persist the fitted BM25 statistics, one pickle file per text field
# (title, description, brand -- written in that order).
bm25_param_files = (
  ('Final/featurization/F3_bm25_params_title.pkl', params_bm25_title),
  ('Final/featurization/F3_bm25_params_desc.pkl', params_bm25_desc),
  ('Final/featurization/F3_bm25_params_brand.pkl', params_bm25_brand),
)
for path, fitted_params in bm25_param_files:
  with open(path, 'wb') as f:
    pickle.dump(fitted_params, f)

**QUERY AND FIELDS REPRESENTED AS W2V**

* For this we use the un-stemmed data stored in `cleaned_df2`.

In [None]:
# Quick look at the (rows, columns) of the un-stemmed train and test frames.
for frame in (cleaned_df2_train, cleaned_df2_test):
    print(frame.shape)

(59253, 8)
(14814, 8)


In [None]:
# Load the pickled word-vector lookup (presumably word -> 300-d vector; the
# cells below index it by word and accumulate into np.zeros(300)).
# NOTE: pickle.load can execute arbitrary code -- only load a file you trust.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)

# Set of words that have a vector, for fast membership checks.
glove_words = set(model.keys())

In [None]:
# Fit idf on the training search queries.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(cleaned_df2_train['corrected_search'])
# Build a term -> idf dictionary. vocabulary_ maps each term to its index in
# idf_; using it avoids get_feature_names(), which was removed in
# scikit-learn 1.2, and works on older releases too. Sorting by index keeps
# the key order the old call produced.
vocabulary = tfidf_model.vocabulary_
dictionary = dict(zip(sorted(vocabulary, key=vocabulary.get), list(tfidf_model.idf_)))
tfidf_words = set(dictionary)

search_tfidf_w2v_train = []  # one weighted-average vector per training search query
for sentence in tqdm(cleaned_df2_train['corrected_search']):
    vector = np.zeros(300)  # running weighted sum; assumes the loaded word vectors are 300-d
    tf_idf_weight = 0  # running sum of the weights added to `vector`
    for word in sentence.split():
        # only words that have both a word vector and an idf contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # weight = idf * raw count of `word` in the query string. The count
            # is a substring count and is NOT divided by the query length; a
            # word occurring several times is also visited once per occurrence.
            tf_idf = dictionary[word]*(sentence.count(word))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    # normalise to a weighted average; stays all-zero if no word contributed
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    search_tfidf_w2v_train.append(vector)

# Persist the fitted idf table and vocabulary for the search field.
params = {'dictionary':dictionary, 'tfidf_words':tfidf_words}
with open('Final/featurization/F3_tfidf_w2v_params_search.pkl','wb') as f:
  pickle.dump(params, f)

print(len(search_tfidf_w2v_train))
print(len(search_tfidf_w2v_train[0]))

HBox(children=(FloatProgress(value=0.0, max=59253.0), HTML(value='')))


59253
300


In [None]:
# Weighted-average word vector for every test search query, reusing the idf
# table (`dictionary`) and vocabulary (`tfidf_words`) fitted on the training
# search queries in the previous cell.
search_tfidf_w2v_test = []
for query in tqdm(cleaned_df2_test['corrected_search']):
    weighted_sum = np.zeros(300)  # accumulator for the weighted word vectors
    weight_total = 0              # sum of the weights accumulated so far
    for token in query.split():
        # skip tokens that lack either a word vector or an idf
        if token not in glove_words or token not in tfidf_words:
            continue
        # idf times the raw (substring) count of the token in the query
        weight = dictionary[token]*(query.count(token))
        weighted_sum += (model[token] * weight)
        weight_total += weight
    # turn the weighted sum into a weighted average when anything contributed
    if weight_total != 0:
        weighted_sum /= weight_total
    search_tfidf_w2v_test.append(weighted_sum)

print(len(search_tfidf_w2v_test))
print(len(search_tfidf_w2v_test[0]))

HBox(children=(FloatProgress(value=0.0, max=14814.0), HTML(value='')))


14814
300


In [None]:
# Fit idf on the training product titles.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(cleaned_df2_train['title'])
# Build a term -> idf dictionary. vocabulary_ maps each term to its index in
# idf_; using it avoids get_feature_names(), which was removed in
# scikit-learn 1.2, and works on older releases too. Sorting by index keeps
# the key order the old call produced.
vocabulary = tfidf_model.vocabulary_
dictionary = dict(zip(sorted(vocabulary, key=vocabulary.get), list(tfidf_model.idf_)))
tfidf_words = set(dictionary)

title_tfidf_w2v_train = []  # one weighted-average vector per training title
for sentence in tqdm(cleaned_df2_train['title']):
    vector = np.zeros(300)  # running weighted sum; assumes the loaded word vectors are 300-d
    tf_idf_weight = 0  # running sum of the weights added to `vector`
    for word in sentence.split():
        # only words that have both a word vector and an idf contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # weight = idf * raw count of `word` in the title string. The count
            # is a substring count and is NOT divided by the title length; a
            # word occurring several times is also visited once per occurrence.
            tf_idf = dictionary[word]*(sentence.count(word))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    # normalise to a weighted average; stays all-zero if no word contributed
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    title_tfidf_w2v_train.append(vector)

# Persist the fitted idf table and vocabulary for the title field.
params = {'dictionary':dictionary, 'tfidf_words':tfidf_words}
with open('Final/featurization/F3_tfidf_w2v_params_title.pkl','wb') as f:
  pickle.dump(params, f)

print(len(title_tfidf_w2v_train))
print(len(title_tfidf_w2v_train[0]))

HBox(children=(FloatProgress(value=0.0, max=59253.0), HTML(value='')))


59253
300


In [None]:
# Weighted-average word vector for every test title, reusing the idf table
# (`dictionary`) and vocabulary (`tfidf_words`) fitted on the training titles
# in the previous cell.
title_tfidf_w2v_test = []
for title in tqdm(cleaned_df2_test['title']):
    weighted_sum = np.zeros(300)  # accumulator for the weighted word vectors
    weight_total = 0              # sum of the weights accumulated so far
    for token in title.split():
        # skip tokens that lack either a word vector or an idf
        if token not in glove_words or token not in tfidf_words:
            continue
        # idf times the raw (substring) count of the token in the title
        weight = dictionary[token]*(title.count(token))
        weighted_sum += (model[token] * weight)
        weight_total += weight
    # turn the weighted sum into a weighted average when anything contributed
    if weight_total != 0:
        weighted_sum /= weight_total
    title_tfidf_w2v_test.append(weighted_sum)

print(len(title_tfidf_w2v_test))
print(len(title_tfidf_w2v_test[0]))

HBox(children=(FloatProgress(value=0.0, max=14814.0), HTML(value='')))


14814
300


In [None]:
# Fit idf on the training product descriptions.
tfidf_model = TfidfVectorizer()
tfidf_model.fit(cleaned_df2_train['description'])
# Build a term -> idf dictionary. vocabulary_ maps each term to its index in
# idf_; using it avoids get_feature_names(), which was removed in
# scikit-learn 1.2, and works on older releases too. Sorting by index keeps
# the key order the old call produced.
vocabulary = tfidf_model.vocabulary_
dictionary = dict(zip(sorted(vocabulary, key=vocabulary.get), list(tfidf_model.idf_)))
tfidf_words = set(dictionary)

desc_tfidf_w2v_train = []  # one weighted-average vector per training description
for sentence in tqdm(cleaned_df2_train['description']):
    vector = np.zeros(300)  # running weighted sum; assumes the loaded word vectors are 300-d
    tf_idf_weight = 0  # running sum of the weights added to `vector`
    for word in sentence.split():
        # only words that have both a word vector and an idf contribute
        if (word in glove_words) and (word in tfidf_words):
            vec = model[word]
            # weight = idf * raw count of `word` in the description string. The
            # count is a substring count and is NOT divided by the description
            # length; a repeated word is also visited once per occurrence.
            tf_idf = dictionary[word]*(sentence.count(word))
            vector += (vec * tf_idf)
            tf_idf_weight += tf_idf
    # normalise to a weighted average; stays all-zero if no word contributed
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    desc_tfidf_w2v_train.append(vector)

# Persist the fitted idf table and vocabulary for the description field.
params = {'dictionary':dictionary, 'tfidf_words':tfidf_words}
with open('Final/featurization/F3_tfidf_w2v_params_desc.pkl','wb') as f:
  pickle.dump(params, f)

print(len(desc_tfidf_w2v_train))
print(len(desc_tfidf_w2v_train[0]))

HBox(children=(FloatProgress(value=0.0, max=59253.0), HTML(value='')))


59253
300


In [None]:
# Weighted-average word vector for every test description, reusing the idf
# table (`dictionary`) and vocabulary (`tfidf_words`) fitted on the training
# descriptions in the previous cell.
desc_tfidf_w2v_test = []
for description in tqdm(cleaned_df2_test['description']):
    weighted_sum = np.zeros(300)  # accumulator for the weighted word vectors
    weight_total = 0              # sum of the weights accumulated so far
    for token in description.split():
        # skip tokens that lack either a word vector or an idf
        if token not in glove_words or token not in tfidf_words:
            continue
        # idf times the raw (substring) count of the token in the description
        weight = dictionary[token]*(description.count(token))
        weighted_sum += (model[token] * weight)
        weight_total += weight
    # turn the weighted sum into a weighted average when anything contributed
    if weight_total != 0:
        weighted_sum /= weight_total
    desc_tfidf_w2v_test.append(weighted_sum)

print(len(desc_tfidf_w2v_test))
print(len(desc_tfidf_w2v_test[0]))

HBox(children=(FloatProgress(value=0.0, max=14814.0), HTML(value='')))


14814
300


In [None]:
#train
arr1 = np.array(search_tfidf_w2v_train)
arr2 = np.array(title_tfidf_w2v_train)
arr3 = np.array(desc_tfidf_w2v_train)
tfidf_w2v_df_train = pd.DataFrame(np.hstack((arr1, arr2, arr3)), index=cleaned_df2_train.index)
print(tfidf_w2v_df_train.shape)

#test
arr1 = np.array(search_tfidf_w2v_test)
arr2 = np.array(title_tfidf_w2v_test)
arr3 = np.array(desc_tfidf_w2v_test)
tfidf_w2v_df_test = pd.DataFrame(np.hstack((arr1, arr2, arr3)), index=cleaned_df2_test.index)
print(tfidf_w2v_df_test.shape)

(59253, 900)
(14814, 900)


In [None]:
# Write the train and test tfidf-weighted word-vector frames to disk.
w2v_outputs = (
    (tfidf_w2v_df_train, 'featurization/train/feature_set3/tfidf_w2v_df_train.pkl'),
    (tfidf_w2v_df_test, 'featurization/test/feature_set3/tfidf_w2v_df_test.pkl'),
)
for frame, path in w2v_outputs:
    frame.to_pickle(path)

**FEATURES: (sum, min, max) of (tf, tf-idf, idf) of the query terms in each text field** (tf here is the raw count of the query term in the field text)

TITLE

In [None]:
# Fit idf on the training titles; \w+ keeps single-character tokens.
tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_model.fit(cleaned_df_train['title'])
# Build term -> idf from vocabulary_ (term -> index into idf_). This avoids
# get_feature_names(), which was removed in scikit-learn 1.2, and works on
# older releases too. Sorting by index keeps the key order the old call gave.
vocabulary = tfidf_model.vocabulary_
idf_dict = dict(zip(sorted(vocabulary, key=vocabulary.get), list(tfidf_model.idf_)))
N = len(cleaned_df_train)

# Persist the fitted idf table and the corpus size for the title field.
params = {'idf_dict':idf_dict, 'N':N}
with open('Final/featurization/F3_SmM_params_title.pkl','wb') as f:
  pickle.dump(params, f)

# Per-row max / min / sum of tf, idf and tf*idf over the query words.
max_tf = []
max_idf = []
max_tfidf = []

min_tf = []
min_idf = []
min_tfidf = []

sum_tf = []
sum_idf = []
sum_tfidf = []

unseen_idf = np.log(N+1)  # idf used for query words absent from idf_dict

for ind, row in cleaned_df_train.iterrows():
  search = row['corrected_search']
  text = row['title']
  tf_vals = []
  idf_vals = []
  tfidf_vals = []
  for word in search.split():
    tf = text.count(word)  # raw substring count of the query word in the title
    idf = idf_dict.get(word, unseen_idf)

    tf_vals.append(tf)
    idf_vals.append(idf)
    tfidf_vals.append(tf*idf)

  # NOTE: max()/min() raise ValueError if the query has no words.
  max_tf.append(max(tf_vals))
  min_tf.append(min(tf_vals))
  sum_tf.append(sum(tf_vals))

  max_idf.append(max(idf_vals))
  min_idf.append(min(idf_vals))
  sum_idf.append(sum(idf_vals))

  max_tfidf.append(max(tfidf_vals))
  min_tfidf.append(min(tfidf_vals))
  sum_tfidf.append(sum(tfidf_vals))

# The lists are assigned positionally, so data3_train must have the same row
# order as cleaned_df_train.
data3_train['max_tf_ST'] = max_tf
data3_train['max_idf_ST'] = max_idf
data3_train['max_tfidf_ST'] = max_tfidf

data3_train['min_tf_ST'] = min_tf
data3_train['min_idf_ST'] = min_idf
data3_train['min_tfidf_ST'] = min_tfidf

data3_train['sum_tf_ST'] = sum_tf
data3_train['sum_idf_ST'] = sum_idf
data3_train['sum_tfidf_ST'] = sum_tfidf

In [None]:
# Query-term statistics of each test search against its product title, using
# the title `idf_dict` and corpus size `N` fitted on the training set in the
# previous cell.
fallback_idf = np.log(N+1)  # idf used for query words absent from idf_dict

# One list per output column, in the order the columns are added to data3_test.
title_stats = {
  'max_tf_ST': [], 'max_idf_ST': [], 'max_tfidf_ST': [],
  'min_tf_ST': [], 'min_idf_ST': [], 'min_tfidf_ST': [],
  'sum_tf_ST': [], 'sum_idf_ST': [], 'sum_tfidf_ST': [],
}

for _, record in cleaned_df_test.iterrows():
  query_words = record['corrected_search'].split()
  field_text = record['title']

  # tf is the raw substring count of each query word in the title text
  term_tfs = [field_text.count(w) for w in query_words]
  term_idfs = [idf_dict[w] if w in idf_dict else fallback_idf for w in query_words]
  term_tfidfs = [tf_w*idf_w for tf_w, idf_w in zip(term_tfs, term_idfs)]

  # NOTE: max()/min() raise ValueError if the query has no words.
  row_values = (
    max(term_tfs), max(term_idfs), max(term_tfidfs),
    min(term_tfs), min(term_idfs), min(term_tfidfs),
    sum(term_tfs), sum(term_idfs), sum(term_tfidfs),
  )
  for collected, value in zip(title_stats.values(), row_values):
    collected.append(value)

# Positional assignment: data3_test must share cleaned_df_test's row order.
for column, collected in title_stats.items():
  data3_test[column] = collected

BRAND

In [None]:
# Fit idf on the training brands; \w+ keeps single-character tokens.
tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_model.fit(cleaned_df_train['brand'])
# Build term -> idf from vocabulary_ (term -> index into idf_). This avoids
# get_feature_names(), which was removed in scikit-learn 1.2, and works on
# older releases too. Sorting by index keeps the key order the old call gave.
vocabulary = tfidf_model.vocabulary_
idf_dict = dict(zip(sorted(vocabulary, key=vocabulary.get), list(tfidf_model.idf_)))
N = len(cleaned_df_train)

# Persist the fitted idf table and the corpus size for the brand field.
params = {'idf_dict':idf_dict, 'N':N}
with open('Final/featurization/F3_SmM_params_brand.pkl','wb') as f:
  pickle.dump(params, f)

# Per-row max / min / sum of tf, idf and tf*idf over the query words.
max_tf = []
max_idf = []
max_tfidf = []

min_tf = []
min_idf = []
min_tfidf = []

sum_tf = []
sum_idf = []
sum_tfidf = []

unseen_idf = np.log(N+1)  # idf used for query words absent from idf_dict

for ind, row in cleaned_df_train.iterrows():
  search = row['corrected_search']
  text = row['brand']
  tf_vals = []
  idf_vals = []
  tfidf_vals = []
  for word in search.split():
    tf = text.count(word)  # raw substring count of the query word in the brand
    idf = idf_dict.get(word, unseen_idf)

    tf_vals.append(tf)
    idf_vals.append(idf)
    tfidf_vals.append(tf*idf)

  # NOTE: max()/min() raise ValueError if the query has no words.
  max_tf.append(max(tf_vals))
  min_tf.append(min(tf_vals))
  sum_tf.append(sum(tf_vals))

  max_idf.append(max(idf_vals))
  min_idf.append(min(idf_vals))
  sum_idf.append(sum(idf_vals))

  max_tfidf.append(max(tfidf_vals))
  min_tfidf.append(min(tfidf_vals))
  sum_tfidf.append(sum(tfidf_vals))

# The lists are assigned positionally, so data3_train must have the same row
# order as cleaned_df_train.
data3_train['max_tf_SB'] = max_tf
data3_train['max_idf_SB'] = max_idf
data3_train['max_tfidf_SB'] = max_tfidf

data3_train['min_tf_SB'] = min_tf
data3_train['min_idf_SB'] = min_idf
data3_train['min_tfidf_SB'] = min_tfidf

data3_train['sum_tf_SB'] = sum_tf
data3_train['sum_idf_SB'] = sum_idf
data3_train['sum_tfidf_SB'] = sum_tfidf

In [None]:
# Query-term statistics of each test search against its product brand, using
# the brand `idf_dict` and corpus size `N` fitted on the training set in the
# previous cell.
fallback_idf = np.log(N+1)  # idf used for query words absent from idf_dict

# One list per output column, in the order the columns are added to data3_test.
brand_stats = {
  'max_tf_SB': [], 'max_idf_SB': [], 'max_tfidf_SB': [],
  'min_tf_SB': [], 'min_idf_SB': [], 'min_tfidf_SB': [],
  'sum_tf_SB': [], 'sum_idf_SB': [], 'sum_tfidf_SB': [],
}

for _, record in cleaned_df_test.iterrows():
  query_words = record['corrected_search'].split()
  field_text = record['brand']

  # tf is the raw substring count of each query word in the brand text
  term_tfs = [field_text.count(w) for w in query_words]
  term_idfs = [idf_dict[w] if w in idf_dict else fallback_idf for w in query_words]
  term_tfidfs = [tf_w*idf_w for tf_w, idf_w in zip(term_tfs, term_idfs)]

  # NOTE: max()/min() raise ValueError if the query has no words.
  row_values = (
    max(term_tfs), max(term_idfs), max(term_tfidfs),
    min(term_tfs), min(term_idfs), min(term_tfidfs),
    sum(term_tfs), sum(term_idfs), sum(term_tfidfs),
  )
  for collected, value in zip(brand_stats.values(), row_values):
    collected.append(value)

# Positional assignment: data3_test must share cleaned_df_test's row order.
for column, collected in brand_stats.items():
  data3_test[column] = collected

DESCRIPTION

In [None]:
# Fit idf on the training descriptions; \w+ keeps single-character tokens.
tfidf_model = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
tfidf_model.fit(cleaned_df_train['description'])
# Build term -> idf from vocabulary_ (term -> index into idf_). This avoids
# get_feature_names(), which was removed in scikit-learn 1.2, and works on
# older releases too. Sorting by index keeps the key order the old call gave.
vocabulary = tfidf_model.vocabulary_
idf_dict = dict(zip(sorted(vocabulary, key=vocabulary.get), list(tfidf_model.idf_)))
N = len(cleaned_df_train)

# Persist the fitted idf table and the corpus size for the description field.
params = {'idf_dict':idf_dict, 'N':N}
with open('Final/featurization/F3_SmM_params_desc.pkl','wb') as f:
  pickle.dump(params, f)


# Per-row max / min / sum of tf, idf and tf*idf over the query words.
max_tf = []
max_idf = []
max_tfidf = []

min_tf = []
min_idf = []
min_tfidf = []

sum_tf = []
sum_idf = []
sum_tfidf = []

unseen_idf = np.log(N+1)  # idf used for query words absent from idf_dict

for ind, row in cleaned_df_train.iterrows():
  search = row['corrected_search']
  text = row['description']
  tf_vals = []
  idf_vals = []
  tfidf_vals = []
  for word in search.split():
    tf = text.count(word)  # raw substring count of the query word in the description
    idf = idf_dict.get(word, unseen_idf)

    tf_vals.append(tf)
    idf_vals.append(idf)
    tfidf_vals.append(tf*idf)

  # NOTE: max()/min() raise ValueError if the query has no words.
  max_tf.append(max(tf_vals))
  min_tf.append(min(tf_vals))
  sum_tf.append(sum(tf_vals))

  max_idf.append(max(idf_vals))
  min_idf.append(min(idf_vals))
  sum_idf.append(sum(idf_vals))

  max_tfidf.append(max(tfidf_vals))
  min_tfidf.append(min(tfidf_vals))
  sum_tfidf.append(sum(tfidf_vals))

# The lists are assigned positionally, so data3_train must have the same row
# order as cleaned_df_train.
data3_train['max_tf_SD'] = max_tf
data3_train['max_idf_SD'] = max_idf
data3_train['max_tfidf_SD'] = max_tfidf

data3_train['min_tf_SD'] = min_tf
data3_train['min_idf_SD'] = min_idf
data3_train['min_tfidf_SD'] = min_tfidf

data3_train['sum_tf_SD'] = sum_tf
data3_train['sum_idf_SD'] = sum_idf
data3_train['sum_tfidf_SD'] = sum_tfidf

In [None]:
# Query-term statistics of each test search against its product description,
# using the description `idf_dict` and corpus size `N` fitted on the training
# set in the previous cell.
fallback_idf = np.log(N+1)  # idf used for query words absent from idf_dict

# One list per output column, in the order the columns are added to data3_test.
desc_stats = {
  'max_tf_SD': [], 'max_idf_SD': [], 'max_tfidf_SD': [],
  'min_tf_SD': [], 'min_idf_SD': [], 'min_tfidf_SD': [],
  'sum_tf_SD': [], 'sum_idf_SD': [], 'sum_tfidf_SD': [],
}

for _, record in cleaned_df_test.iterrows():
  query_words = record['corrected_search'].split()
  field_text = record['description']

  # tf is the raw substring count of each query word in the description text
  term_tfs = [field_text.count(w) for w in query_words]
  term_idfs = [idf_dict[w] if w in idf_dict else fallback_idf for w in query_words]
  term_tfidfs = [tf_w*idf_w for tf_w, idf_w in zip(term_tfs, term_idfs)]

  # NOTE: max()/min() raise ValueError if the query has no words.
  row_values = (
    max(term_tfs), max(term_idfs), max(term_tfidfs),
    min(term_tfs), min(term_idfs), min(term_tfidfs),
    sum(term_tfs), sum(term_idfs), sum(term_tfidfs),
  )
  for collected, value in zip(desc_stats.values(), row_values):
    collected.append(value)

# Positional assignment: data3_test must share cleaned_df_test's row order.
for column, collected in desc_stats.items():
  data3_test[column] = collected

In [None]:
# Keep everything from the 9th column onwards (presumably the engineered
# features, with the first 8 being the original cleaned columns -- TODO
# confirm), report the shapes, then write both frames to disk.
features_train = data3_train.iloc[:,8:]
features_test = data3_test.iloc[:,8:]

print(features_train.shape)
print(features_test.shape)

features_train.to_pickle('featurization/train/feature_set3/data3_train.pkl')
features_test.to_pickle('featurization/test/feature_set3/data3_test.pkl')

(59253, 39)
(14814, 39)
