In [1]:
import os
import re
import gc
import string
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

from gensim.models import KeyedVectors

import warnings
warnings.simplefilter("ignore")

pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 100

  from pandas import Panel


In [2]:
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

import unidecode
import codecs
import spacy

In [3]:
def get_data(train_path="../data/v2/train_10_skf.csv", test_path="../data/test.csv"):
    """Load the train and test frames from disk and print their shapes.

    Parameters
    ----------
    train_path : str, optional
        Location of the comma-separated training file. Defaults to the path
        this notebook has always used.
    test_path : str, optional
        Location of the tab-separated test file. Defaults to the path this
        notebook has always used.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` exactly as parsed by ``pd.read_csv``.
    """
    train = pd.read_csv(train_path)
    # The test file is tab-separated, unlike the training file.
    test = pd.read_csv(test_path, sep="\t")

    print("Train Shape : {}\nTest Shape :  {}".format(train.shape, test.shape))

    return train, test

In [4]:
# Load both frames from the default locations baked into get_data().
train, test = get_data()
# Column name kept for later use; presumably the label column (confirm against downstream cells).
target = 'category'

Train Shape : (1199998, 5)
Test Shape :  (92, 3)


In [5]:
train.head(3)

Unnamed: 0,title,description,category,fold_id,source
0,ZicZac // Black + Red (Euro: 44),Clothing & related products (B2C) - Shoes and shoe laces,R,1,valid
1,9X9 RESISTA/484938,Publishing/Printing - Printing Services,S,1,train
2,Halle Pant - Short Inseam 013049561D0010001_ 02,Clothing & related products (B2C) - General,R,1,train


### TODO : LATER

import the imputed dataset for test

In [6]:
# Replace every missing value in the test frame with the literal string "none" (mutates `test` in place).
test.fillna("none", inplace=True)

### Text Cleaning  (For frequency-methods): 

This kind of **regular text cleaning** is usually useful with frequency-based methods and with embeddings trained on our own data, but with pre-trained embeddings we have to preprocess differently (shown later).

Steps : 

1. Lower
1. Don't split camelcase
1. Dealing with contractions
1. removing special characters
1. removing stop-words
1. lemmatization ? 
1. tokenize 

In [7]:
# WordNet lemmatizer instance; the lemmatization line inside treat_text is currently commented out.
wnl = WordNetLemmatizer()

# Contraction -> expanded form. Keys are matched against whitespace-separated
# tokens, so both "I'd" and "i'd" style spellings are listed explicitly.
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}
# NLTK's English stop-word list, held as a set so membership tests are hash lookups.
stop_words = set(stopwords.words('english'))

def treat_text(X):
    """Normalise one raw text value for frequency-based models.

    Steps, in order: decode literal escape sequences, transliterate to ASCII,
    expand contractions, lowercase, keep only alphanumeric runs and drop
    English stop words.

    Parameters
    ----------
    X : str
        Raw text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty).
    """
    # Decoding
    try:
        # The unicode_escape codec reads its input as Latin-1, so handing it a
        # str that contains real non-ASCII characters mangles them before
        # unidecode ever sees them. Encoding with latin-1 + backslashreplace
        # first keeps those characters intact while literal escape sequences
        # in the text are still turned into the characters they denote.
        unescaped = X.encode("latin-1", "backslashreplace").decode("unicode_escape")
    except Exception:
        # Best effort: text with malformed escapes (e.g. a trailing backslash)
        # is used as-is. `Exception` instead of a bare except so that
        # KeyboardInterrupt can still stop a long progress_apply run.
        unescaped = X
    decoded = unidecode.unidecode(unescaped)

    # Handling apostrophes
    apostrophe_handled = re.sub("’", "'", decoded)
    # Exact-case lookup first, then a lowercase lookup: the text is lowercased
    # below anyway, so "Can't" must expand the same way "can't" does.
    X = " ".join(
        contraction_mapping.get(t, contraction_mapping.get(t.lower(), t))
        for t in apostrophe_handled.split(" ")
    )

    # Keeping only text + numbers and lowered.
    X = re.findall(r"[a-zA-Z0-9]+", X.lower())

    # Removing stopwords
    X = [word for word in X if (word not in stop_words)]

    # Lemming (currently disabled)
    # X = [wnl.lemmatize(word) for word in X]

    return " ".join(X)

In [8]:
# Show the first three raw titles next to their cleaned form as a quick sanity check.
print("Sample Preprocessing \n")
for i in range(3):
    item = train['title'].iloc[i]
    print("Original Text : {}\nPreprocessed Text : {}\n".format(item, treat_text(item)))

Sample Preprocessing 

Original Text : ZicZac // Black + Red (Euro: 44)
Preprocessed Text : ziczac black red euro 44

Original Text : 9X9 RESISTA/484938
Preprocessed Text : 9x9 resista 484938

Original Text : Halle Pant - Short Inseam 013049561D0010001_ 02
Preprocessed Text : halle pant short inseam 013049561d0010001 02



In [9]:
# Clean title and description of each frame in place (title first, then
# description), then derive the combined `text` column from the cleaned values.
for data in [train, test]:
    for column in ('title', 'description'):
        data[column] = data[column].progress_apply(treat_text)
    data['text'] = data['title'] + " " + data['description']

100%|██████████| 1199998/1199998 [00:22<00:00, 54194.49it/s]
100%|██████████| 1199998/1199998 [00:19<00:00, 61314.93it/s]
100%|██████████| 92/92 [00:00<00:00, 44180.90it/s]
100%|██████████| 92/92 [00:00<00:00, 18130.71it/s]


In [10]:
train.head()

Unnamed: 0,title,description,category,fold_id,source,text
0,ziczac black red euro 44,clothing related products b2c shoes shoe laces,R,1,valid,ziczac black red euro 44 clothing related products b2c shoes shoe laces
1,9x9 resista 484938,publishing printing printing services,S,1,train,9x9 resista 484938 publishing printing printing services
2,halle pant short inseam 013049561d0010001 02,clothing related products b2c general,R,1,train,halle pant short inseam 013049561d0010001 02 clothing related products b2c general
3,harry houser travel expenses meals,security personnel,S,1,train,harry houser travel expenses meals security personnel
4,tee time 740078609 greens fee composite,admissions green fees privately owned golf course,R,1,valid,tee time 740078609 greens fee composite admissions green fees privately owned golf course


In [12]:
# Persist both frames as CSV without writing the index column.
train.to_csv("../data/v2/train_ne.csv", index=False)
test.to_csv("../data/v2/test_ne.csv", index=False)

In [13]:
# Drop the frames and force a garbage-collection pass; the data is reloaded from disk further down.
del train, test
gc.collect()

41

### Text Preprocessing : (For Embedding)

I am using google-news-vectors to show how the preprocessing for pre-trained embeddings is different.

In [7]:
def build_vocab(sentences, verbose=True):
    """Count how often each token occurs across a collection of token lists.

    Parameters
    ----------
    sentences : iterable of iterables
        Each element is iterated and every item it yields is counted as one token.
    verbose : bool, optional
        When False the tqdm progress bar is disabled.

    Returns
    -------
    dict
        Maps token -> number of occurrences, in first-seen order.
    """
    counts = {}
    for tokens in tqdm(sentences, disable=not verbose):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

    return counts

#### Re-initialising the data

In [8]:
# Reload the raw frames from disk.
train, test = get_data()

# Only the test frame gets its missing values filled here; train is used as loaded.
test.fillna("none", inplace=True)

# Combined title + description text, built from the raw columns.
train['text'] = train['title'] + " " + train['description']
test['text'] = test['title'] + " " + test['description']

Train Shape : (1199998, 4)
Test Shape :  (92, 3)


In [9]:
# string.punctuation contains "\", "]", "^" and "-", all of which are special
# inside a character class. Interpolated raw, the "\]" pair is read as an
# escaped "]" and the backslash itself silently drops out of the set, so the
# string is escaped before it is placed between the brackets.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split text on whitespace, with each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with spaces before the
    whitespace split, so "a,b" yields ["a", ",", "b"].

    Parameters
    ----------
    s : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; empty for empty or all-whitespace input.
    """
    return re_tok.sub(r' \1 ', s).split()

In [10]:
# Tokenize the raw combined text of each frame, keeping the results as plain arrays of token lists.
train_sentences = train['text'].progress_apply(tokenize).values
test_sentences = test['text'].progress_apply(tokenize).values

# Token -> occurrence count for each frame.
train_vocab = build_vocab(train_sentences)
test_vocab = build_vocab(test_sentences)

100%|██████████| 1199998/1199998 [00:35<00:00, 33559.24it/s]
100%|██████████| 92/92 [00:00<00:00, 21777.53it/s]
100%|██████████| 1199998/1199998 [00:09<00:00, 128000.74it/s]
100%|██████████| 92/92 [00:00<00:00, 43611.66it/s]


In [11]:
# Sample vocabulary: the first five tokens (in first-seen order) with their counts
print({k: train_vocab[k] for k in list(train_vocab)[: 5]})

{'ZicZac': 1, '/': 1031538, 'Black': 33463, '+': 18240, 'Red': 4638}


### Importing the pre-trained embedding

In [12]:
%%time

# Location of the GoogleNews word2vec file in binary (.bin) format.
news_path = '../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'
# binary=True because the file is the binary word2vec format, not the text one.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

CPU times: user 36.5 s, sys: 10.3 s, total: 46.9 s
Wall time: 46.9 s


OOV : out of vocabulary

In [13]:
import operator

def check_coverage(vocab, embedding_index):
    """Report how much of a vocabulary is covered by a pre-trained embedding.

    Prints two percentages: the share of unique vocab words that have an
    embedding, and the share of all token occurrences (each word weighted by
    its count) that have an embedding. An empty vocab reports 0.00 for both.

    Parameters
    ----------
    vocab : dict
        Maps word -> occurrence count (the shape build_vocab returns).
    embedding_index
        Object indexable by word that raises KeyError for unknown words.

    Returns
    -------
    list of (word, count) tuples
        Words without an embedding, ordered by count from highest to lowest.
    """
    oov = {}
    found_words = 0
    found_len = 0
    not_found_len = 0

    for word in tqdm(vocab):
        try:
            # Only the success of the lookup matters; the vector itself is not
            # kept, which avoids holding one entry per found word.
            embedding_index[word]
        except KeyError:
            # A missing word is the only expected failure; anything else
            # (including KeyboardInterrupt) should surface instead of being
            # counted as out-of-vocabulary.
            oov[word] = vocab[word]
            not_found_len += vocab[word]
        else:
            found_words += 1
            found_len += vocab[word]

    total_len = found_len + not_found_len
    # Guard both ratios so an empty vocab does not raise ZeroDivisionError.
    vocab_pct = (found_words / len(vocab)) * 100 if vocab else 0.0
    text_pct = (found_len / total_len) * 100 if total_len else 0.0

    print("Found embeddings for {:.2f} % of vocab".format(vocab_pct))
    print("Found embeddings for {:.2f} % of all text".format(text_pct))

    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the ordering of equal-count words exactly as it has always been.
    sorted_oov = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_oov

In [14]:
# Coverage of each vocab by the pre-trained embedding; the returned lists hold the uncovered words.
print("Train")
train_oov = check_coverage(train_vocab, embeddings_index)
print("Test")
test_oov = check_coverage(test_vocab, embeddings_index)

  2%|▏         | 19606/1208022 [00:00<00:06, 196056.45it/s]

Train


100%|██████████| 1208022/1208022 [00:05<00:00, 230316.86it/s]
100%|██████████| 949/949 [00:00<00:00, 161116.96it/s]

Found embeddings for 8.99 % of vocab
Found embeddings for 57.38 % of all text
Test
Found embeddings for 93.47 % of vocab
Found embeddings for 74.12 % of all text





First let's understand the terms : 

1. Vocab Coverage : This is the percentage of unique tokens/words in our own vocab that are also found in the pre-trained embedding.

![Vocab Coverage](https://latex.codecogs.com/gif.latex?VocabCoverage%20%3D%20%5Cfrac%7BMatching%20Terms%20In%20Pretrained%20Embedding%7D%7BTotalTermsInOurVocab%7D)

2. All Text Coverage : This is the percentage of all token occurrences in our text (each word counted as many times as it appears) that are found in the pre-trained embedding.

![All Text Coverage](https://latex.codecogs.com/gif.latex?AllTextCoverage%20%3D%20%5Cfrac%7BMatchedTokensInOurVocab%7D%7BTotalTokensInOurVocab%7D)

## Our main aim now is to increase the vocab coverage by understanding how the underlying pre-trained embedding was trained.

In [15]:
train_oov[:10]

[('-', 2208672),
 ('/', 1031538),
 ('(', 927001),
 (':', 926898),
 (')', 925636),
 ('.', 476453),
 (',', 332092),
 ('of', 165727),
 ('and', 146635),
 ('2019', 83006)]

Punctuation seems to be a problem here. Let's fix that.

In [16]:
'?' in embeddings_index

False

In [17]:
'&' in embeddings_index

True

In [18]:
# Single-pass character table for clean_text: most punctuation is deleted,
# "/", "-" and "'" become a space, and "&" is kept but padded with spaces.
# The space/pad entries are added last so they win over the deletion entries.
_CLEAN_TEXT_TABLE = {
    **dict.fromkeys(map(ord, '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’')),
    **dict.fromkeys(map(ord, "/-'"), ' '),
    ord('&'): ' & ',
}

def clean_text(x):
    """Strip punctuation from a value ahead of a pre-trained embedding lookup.

    The value is converted with ``str`` first. "/", "-" and "'" are replaced by
    a space, "&" is surrounded by spaces, and the remaining punctuation
    characters are removed outright.

    Parameters
    ----------
    x : object
        Value to clean; anything ``str`` accepts.

    Returns
    -------
    str
        The cleaned text (whitespace is not collapsed).
    """
    return str(x).translate(_CLEAN_TEXT_TABLE)

In [19]:
# Strip punctuation from the combined text of both frames (overwrites the `text` column).
train["text"] = train["text"].progress_apply(clean_text)
test["text"] = test["text"].progress_apply(clean_text)

# Re-tokenize and recount on the cleaned text.
train_sentences = train["text"].progress_apply(tokenize)
test_sentences = test["text"].progress_apply(tokenize)
train_vocab = build_vocab(train_sentences)
test_vocab = build_vocab(test_sentences)

# Coverage after cleaning: train first, then test.
oov = check_coverage(train_vocab, embeddings_index)
oov_test = check_coverage(test_vocab, embeddings_index)

100%|██████████| 1199998/1199998 [00:17<00:00, 68887.66it/s]
100%|██████████| 92/92 [00:00<00:00, 30047.97it/s]
100%|██████████| 1199998/1199998 [00:15<00:00, 77716.92it/s]
100%|██████████| 92/92 [00:00<00:00, 34060.90it/s]
100%|██████████| 1199998/1199998 [00:08<00:00, 137588.34it/s]
100%|██████████| 92/92 [00:00<00:00, 47457.38it/s]
100%|██████████| 1277988/1277988 [00:05<00:00, 232069.82it/s]
100%|██████████| 932/932 [00:00<00:00, 189009.35it/s]

Found embeddings for 8.26 % of vocab
Found embeddings for 79.29 % of all text
Found embeddings for 93.56 % of vocab
Found embeddings for 89.69 % of all text





It seems the stopwords [of, and, to] are missing from the embedding, and numbers > 10 were masked to "##".

Let's fix this.

But fixing the numbers may not be good, as we saw in the EDA notebook that most *product* categories have sizes given as numbers; for shirts, for example, the size could be [42, 40], etc.


In [22]:
# Write both frames to the current working directory as CSV without the index column.
train.to_csv("train_w2v.csv", index=False)
test.to_csv("test_w2v.csv", index=False)

Unnamed: 0,title,description,category,fold_id,text
0,ZicZac // Black + Red (Euro: 44),Clothing & related products (B2C) - Shoes and shoe laces,R,1,ZicZac Black Red Euro 44 Clothing & related products B2C Shoes and shoe laces
1,9X9 RESISTA/484938,Publishing/Printing - Printing Services,S,1,9X9 RESISTA 484938 Publishing Printing Printing Services
2,Halle Pant - Short Inseam 013049561D0010001_ 02,Clothing & related products (B2C) - General,R,1,Halle Pant Short Inseam 013049561D0010001 02 Clothing & related products B2C General
3,Harry Houser Travel Expenses - Meals,Security - personnel,S,1,Harry Houser Travel Expenses Meals Security personnel
4,Tee Time: 740078609 : Greens Fee - Composite,Admissions - Green Fees for Privately Owned Golf Course,R,1,Tee Time 740078609 Greens Fee Composite Admissions Green Fees for Privately Owned Golf Course


### TODO : 

1. Handle Contractions
1. Handle  Word-Corrections
1. This embedding is trained on US English, so common words written in UK English could be converted to their US English form 
        For example : the keywords [Instagram, Facebook, etc] are mapped to [social media]
1. A very time-consuming coverage increase can be done (below), but I didn't use it; I think a custom-trained embedding would prove more beneficial than a pre-trained one.