In [1]:
import os
import re
import time
import string
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine
from gensim.models import Word2Vec, KeyedVectors

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

import warnings
warnings.simplefilter("ignore")

pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 100

  from pandas import Panel


In [2]:
def get_data():
    train = pd.read_csv("../data/train_ne.csv")
    test = pd.read_csv("../data/test_ne.csv")
    train.dropna(inplace=True)
    print("Train Shape : {}\nTest Shape :  {}".format(train.shape, test.shape))
    
    return train, test

In [3]:
train, test = get_data()
target = 'category'

Train Shape : (1199559, 5)
Test Shape :  (92, 4)


In [4]:
train.head()

Unnamed: 0,title,description,category,fold_id,text
0,ziczac black red euro 44,clothing related products b2c shoes shoe laces,R,1,ziczac black red euro 44 clothing related products b2c shoes shoe laces
1,9x9 resista 484938,publishing printing printing services,S,1,9x9 resista 484938 publishing printing printing services
2,halle pant short inseam 013049561d0010001 02,clothing related products b2c general,R,1,halle pant short inseam 013049561d0010001 02 clothing related products b2c general
3,harry houser travel expenses meals,security personnel,S,1,harry houser travel expenses meals security personnel
4,tee time 740078609 greens fee composite,admissions green fees privately owned golf course,R,1,tee time 740078609 greens fee composite admissions green fees privately owned golf course


In [5]:
test['description'][test['description'] == "none"] = np.nan
test.isnull().sum()

title           0
description    16
category        0
text            0
dtype: int64

### We have 16 descriptions null i.e ~18% (from Notebook 0).

## Imputing Descriptions

We have seen from the above EDA that description holds a lot of weight in classifying products and services.

So I here hypothsize what can be the ways we can impute the descriptions : 

1. Using Mean embedding of title to find nearest title in Train and then imputing the description of the mapped title in the train set.
1. Using Doc2Vec to find similar documents 
1. Find a keyword in title to find the optimum description from train set ex : jean. Could be done by an custom-NER system.

In [6]:
train.nunique()

title          1193949
description       1279
category             2
fold_id             10
text           1194957
dtype: int64

In [7]:
test.nunique()

title          89
description    59
category        2
text           92
dtype: int64

### As we have very less unique descriptions in train we hope to learn some-mapping as :  

    [title --> description]
    
    
I'm here using the cosine similarity between the sum of word vectors formed between train and test titles.

The vectors here are the custom-trained Word2Vec model.

In [8]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index2word)  # words known to the model

    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec, model[word])
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec

In [9]:
train['title_tokenized'] = train['title'].progress_apply(lambda x: x.split())
test['title_tokenized'] = test['title'].progress_apply(lambda x: x.split())

train_subsample = train[train['fold_id'] == 1]
train_subsample.shape

100%|██████████| 1199559/1199559 [00:04<00:00, 246969.83it/s]
100%|██████████| 92/92 [00:00<00:00, 100829.88it/s]


(119951, 6)

In [10]:
col = 'title'
values = train_subsample['{}_tokenized'.format(col)].values.tolist()
model = Word2Vec(values, min_count=10)
print("Length of Vocabulary : {}".format(len(model.wv.vocab)))

model.most_similar("shirt")

Length of Vocabulary : 5814


[('men', 0.9733667969703674),
 ('jacket', 0.9719110727310181),
 ('long', 0.970596969127655),
 ('button', 0.9678511023521423),
 ('ladies', 0.9666380882263184),
 ('polo', 0.9661392569541931),
 ('fleece', 0.9636504650115967),
 ('sleeve', 0.9608524441719055),
 ('women', 0.9590590596199036),
 ('boys', 0.9587820172309875)]

In [11]:
train_sentence_embs = np.zeros((train_subsample.shape[0], 100))

for i in tqdm_notebook(range(train_subsample.shape[0])):
    train_sentence_embs[i] = make_feature_vec(train['title_tokenized'].iloc[i], model, 100)

HBox(children=(FloatProgress(value=0.0, max=119951.0), HTML(value='')))




In [12]:
test_sentence_embs = np.zeros((test.shape[0], 100))

for i in tqdm_notebook(range(test.shape[0])):
    test_sentence_embs[i] = make_feature_vec(test['title_tokenized'].iloc[i], model, 100)

HBox(children=(FloatProgress(value=0.0, max=92.0), HTML(value='')))




In [13]:
def calc_cosine_similarity(train_vecs, test_vec):
    start_time = time.time()
    result = np.full(len(train_vecs), -9999, dtype=float)
    for idx, _ in enumerate(range(len(train_vecs))):
        r = 1 - cosine(train_vecs[idx], test_vec)
        if str(r) != "nan":
            result[idx] = r
    
    print("Time Taken : {:.2f}".format(time.time() - start_time))
    
    return np.argmax(result), result[np.argmax(result)]

In [14]:
null_idxs = test[test['description'].isnull()].index.tolist()

mapping = {}

for i in null_idxs:
    item = test['title'].iloc[i]
    mapping[i] = {}
    mapping[i]['test_title'] = item
    
    max_idx, max_sim = calc_cosine_similarity(train_sentence_embs, test_sentence_embs[i])
    print("Index : {} \t Similarity : {}".format(max_idx, max_sim))

    mapping[i]['train_title'] = train_subsample['title'].iloc[max_idx]
    mapping[i]['mapped_desc'] = train_subsample['description'].iloc[max_idx]
    
    print("Test Item : {}".format(item))
    print("Train Title Item matched : {}".format(mapping[i]['train_title']))
    print("Train Description mapped : {}".format(mapping[i]['mapped_desc']))
    print("--"*50)

Time Taken : 7.83
Index : 41275 	 Similarity : 0.97726827899959
Test Item : carpet repairs
Train Title Item matched : repaired mechanical problem modified blocker bolt
Train Description mapped : repair performed tpp equipment parts labor separately stated
----------------------------------------------------------------------------------------------------
Time Taken : 8.39
Index : 4608 	 Similarity : 0.9706081579707684
Test Item : vct floor refinishing
Train Title Item matched : fs436 us26d floor stop
Train Description mapped : hardware sold medical facility
----------------------------------------------------------------------------------------------------
Time Taken : 8.90
Index : 43561 	 Similarity : 0.8863722690210839
Test Item : clean carpet clean windows scrub buff vct floors
Train Title Item matched : front porch actor windows ba
Train Description mapped : computer software implementation prewritten software electronically downloaded
----------------------------------------------

In [15]:
for key, value in mapping.items():
    test['description'].iloc[key] = value['mapped_desc']

In [16]:
test['text'] = test['title'] + " " + test['description']
test.to_csv("../data/test_ne_imputed.csv", index=False)