Inspired by https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

In [None]:
import pandas as pd
from tqdm import tqdm
import string
tqdm.pandas()
import numpy as np

In [None]:
# Load the parsed training and test sets and report the shape of each frame.
train = pd.read_csv("parsed.csv")
test = pd.read_csv("parsed_test.csv")
print("Train shape : ",train.shape)
print("Test shape : ",test.shape)

In [None]:
def build_vocab(sentences, verbose=True):
    """
    Count how often each whitespace-separated token occurs in ``sentences``.

    Entries that are not strings (for example the float NaN that pandas uses
    for empty cells) are skipped instead of raising ``AttributeError``.

    :param sentences: iterable of text strings (e.g. a pandas Series)
    :param verbose: show a tqdm progress bar while counting
    :return: dictionary mapping each word to its count, in first-seen order
    """
    from collections import Counter

    # tqdm(..., disable=True) is a plain pass-through, so skip it when quiet.
    iterator = tqdm(sentences) if verbose else sentences
    counts = Counter()
    for sentence in iterator:
        if isinstance(sentence, str):
            counts.update(sentence.split())
    # Hand back a plain dict, which is what callers have always received.
    return dict(counts)

In [None]:
# Count word frequencies in the training texts and peek at the first five entries.
vocab = build_vocab(train['texts'])
print({k: vocab[k] for k in list(vocab)[:5]})

In [None]:
# Path of the embedding file that is parsed into embeddings_index below.
EMBEDDING_FILE = 'glove.840B.300d.txt'
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where ``vector`` is ``arr`` as a float32 numpy array."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {token: float32 vector}. Each line is split on
# single spaces: the first field is the token, the rest are its coefficients.
# The file is opened explicitly as UTF-8 inside a context manager so decoding
# does not depend on the platform default and the handle is closed after loading.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

In [None]:
import operator 

def check_coverage(vocab, embeddings_index, verbose=True):
    """
    Report how much of ``vocab`` is covered by ``embeddings_index``.

    Prints the share of distinct words and the share of all word occurrences
    that have an embedding (0.00% for both when there is nothing to count).

    :param vocab: dictionary mapping each word to its count
    :param embeddings_index: container of known words; only membership
        (``word in embeddings_index``) is used
    :param verbose: show a tqdm progress bar while scanning the vocabulary
    :return: list of ``(word, count)`` tuples for the words without an
        embedding, sorted by count with the most frequent first
    """
    oov = {}
    known_words = 0  # distinct words that have an embedding
    known_count = 0  # occurrences of words that have an embedding
    oov_count = 0    # occurrences of words that do not
    # tqdm is only needed for the progress bar, so bypass it when quiet.
    for word in (tqdm(vocab) if verbose else vocab):
        count = vocab[word]
        if word in embeddings_index:
            known_words += 1
            known_count += count
        else:
            oov[word] = count
            oov_count += count

    total_count = known_count + oov_count
    # Guard both ratios: an empty vocabulary would otherwise divide by zero.
    vocab_ratio = known_words / len(vocab) if vocab else 0.0
    text_ratio = known_count / total_count if total_count else 0.0
    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Stable ascending sort followed by a reversal: among equal counts the
    # word seen last comes first, which sorted(..., reverse=True) would not do.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    return sorted_x

In [None]:
# Measure embedding coverage of the training vocabulary built before any cleaning.
oov = check_coverage(vocab,embeddings_index)

In [None]:
# Inspect the 50 most frequent words that have no embedding.
oov[:50]

In [None]:
# For every ASCII punctuation character, print whether it has its own embedding.
for punct in string.punctuation:
    print(punct,punct in embeddings_index)

In [None]:
# Tokens that remove_words filters out of every text.
to_remove = ['a', 'to', 'of', 'and']
def remove_words(x):
    """Return ``str(x)`` with every token listed in ``to_remove`` dropped.

    The text is split on whitespace, tokens are compared case-sensitively,
    and the surviving tokens are re-joined with single spaces.
    """
    kept = [token for token in str(x).split() if token not in to_remove]
    return " ".join(kept)

# One-pass translation table: line breaks become a single space and every
# ASCII punctuation character is padded with a space on each side.
_PUNCT_TABLE = str.maketrans({
    '\n': ' ',
    '\r': ' ',
    **{punct: f' {punct} ' for punct in string.punctuation},
})


def remove_punc(x):
    """Isolate punctuation in ``str(x)`` and lower-case the result.

    Despite the name, punctuation is not deleted: each ``string.punctuation``
    character is surrounded by spaces so it becomes its own token, and
    newline / carriage-return characters are replaced by a space.

    A single ``str.translate`` pass replaces the former chain of one
    ``str.replace`` call per character; the output is identical because no
    replacement introduces a character that another rule would rewrite.

    :param x: value to normalise; converted with ``str`` first
    :return: the padded, lower-cased text
    """
    return str(x).translate(_PUNCT_TABLE).lower()

def clean_text(x):
    """Run ``x`` through ``remove_punc`` and then ``remove_words``, returning the result."""
    return remove_words(remove_punc(x))

In [None]:
# Clean the text column of both frames (with a progress bar), then rebuild
# the vocabulary from the cleaned training texts.
train["texts"] = train["texts"].progress_apply(clean_text)
test["texts"] = test["texts"].progress_apply(clean_text)
vocab = build_vocab(train["texts"])

In [None]:
# Re-check embedding coverage on the vocabulary of the cleaned training texts.
oov = check_coverage(vocab,embeddings_index)

In [None]:
# Inspect the 50 most frequent words still lacking an embedding after cleaning.
oov[:50]

In [None]:
# Write the cleaned frames to CSV without the index column.
train.to_csv('cleaned_train_punc_brute.csv',index=False)
test.to_csv('cleaned_test_punc_brute.csv',index=False)

In [None]:
# Display the cleaned test frame.
test

In [None]:
# Display the cleaned training frame.
train