https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

In [1]:
from collections import Counter
import gc
from pathlib import Path
import re
import time

import numpy as np
import pandas as pd
import gensim

from tqdm import tqdm
tqdm.pandas()

In [2]:
data_dir = Path.home() / 'Desktop/kaggle/data/quora'

train_all = pd.read_csv(data_dir / 'train.csv.zip')
test_all = pd.read_csv(data_dir / 'test.csv.zip')

print("Train shape : ", train_all.shape)
print("Test shape : ", test_all.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [3]:
def build_vocab(sentences, verbose =  True):
    """
    :param sentences: list of list of words
    :return: dictionary of words and their count
    """
    vocab = Counter()
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            vocab[word] += 1
    return vocab

In [4]:
sentences = train_all["question_text"].progress_apply(lambda x: x.split()).values
vocab1 = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:03<00:00, 348410.42it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 234003.75it/s]


In [5]:
vocab1.most_common(5)

[('the', 653948),
 ('What', 417802),
 ('to', 403183),
 ('a', 402682),
 ('in', 363131)]

In [6]:
news_path = data_dir / 'GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(news_path, binary=True)

In [7]:
def check_coverage(vocab, embeddings_index):
    covered_vocab = set()
    oov = Counter()
    covered_count = 0
    all_count = 0
    for word, count in tqdm(vocab.items()):
        all_count += count
        if word in embeddings_index:
            covered_vocab.add(word)
            covered_count += count
        else:
            oov[word] = count

    print('Found embeddings for {:.2%} of vocab'.format(len(covered_vocab) / len(vocab)))
    print('Found embeddings for  {:.2%} of all words'.format(covered_count / all_count))
    return oov

In [8]:
oov = check_coverage(vocab1, embeddings_index)

100%|██████████| 508823/508823 [00:00<00:00, 972744.37it/s]

Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all words





In [9]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

cnt = 0
for word in embeddings_index.vocab:
    if len(word) not in [1]:
        continue
    if '#' in word:
        continue
    if not is_ascii(word):
        continue
    if not re.fullmatch(r'[^a-zA-Z0-9]+', word):
        continue
    print(word)
    cnt += 1
    if cnt > 100:
        break        

@
+
%
&
>
_
^
$
~
`
*
=


In [10]:
punct_covered = re.compile(r'(^|\b|\s)([#=+*%_`>^~@$&])($|\b|\s)')

for s in ['a+b', 'aa+bb+c', 'a++b']:
    print(punct_covered.sub(lambda m: ' {} '.format(m.group(2)), s))

a + b
aa + bb + c
a++b


In [11]:
punct_unknown = re.compile(r'(^|\b|\s)[?!.,:;"\'/|(){}\[\]\\-]($|\b|\s)')

for s in ['{b}', 'a\\b', 'a[b]c(d)e!', '"The"', 'it, ']:
    print(punct_unknown.sub(' ^ ', s))

 ^ b ^ 
a ^ b
a ^ b ^ c ^ d ^ e ^ 
 ^ The ^ 
it ^ 


In [12]:
punct_numbers = re.compile(r'([0-9]{2,})')

for s in ['1', '11', '111', '1111']:
    print(punct_numbers.sub(lambda m: '#' * len(m.group(1)), s))

1
##
###
####


In [13]:
rewrite_dict = {
    "What's": 'What is',
    "I'm": 'I am',
    "Shouldn't": 'Should not',
    "Wouldn't": 'Would not',
    "Couldn't": 'Could not',
    "Wasn't": 'Was not',
    "Aren't": 'Are not',
    "don't": 'do not',
    "can't": 'cannot',
    'colour':'color',
    'centre':'center',
    'didnt':'did not',
    "didn't": 'did not',
    'doesnt':'does not',
    "doesn't": 'does not',
    'isnt':'is not',
    "isn't": 'is not',
    'shouldnt':'should not',
    "shouldn't": 'should not',
    'behaviour': 'behavior',
    'favourite':'favorite',
    'travelling':'traveling',
    'counselling':'counseling',
    'theatre':'theater',
    'cancelled':'canceled',
    'labour':'labor',
    'organisation':'organization',
    'programme': 'program',
    'wwii':'world war 2',
    'citicise':'criticize',
    'instagram': 'social medium',
    'whatsapp': 'social medium',
    'snapchat': 'social medium',
    'Snapchat': 'social medium',
}

rewrite_re = re.compile('(%s)' % '|'.join(rewrite_dict.keys()))

In [14]:
def clean_text(x):
    x = str(x)
    x = x.replace("’", "'")
    x = rewrite_re.sub(lambda m: rewrite_dict[m.group(0)], x)
    x = punct_covered.sub(lambda m: ' {} '.format(m.group(2)), x)
    x = punct_unknown.sub(' ^ ', x)
    x = punct_numbers.sub(lambda m: '#' * len(m.group(1)), x)
    return x

In [15]:
cnt = 0
for k, v in sorted(oov.items(), key=lambda x: -x[1]):
    if re.fullmatch(r'[a-zA-Z]+[.,?]?', k):
        continue
    print('{} => {}'.format(k, v))
    cnt += 1
    if cnt > 20:
        break

10 => 4591
2017? => 4050
2018? => 3594
2017 => 3170
2018 => 2768
- => 2559
I’m => 2506
12 => 2448
"The => 2239
don’t => 2089
12th => 1926
20 => 1760
What’s => 1688
15 => 1569
Trump's => 1560
100 => 1490
? => 1384
/ => 1340
30 => 1286
(or => 1129
11 => 1127


In [16]:
cnt = 0
for k, v in sorted(oov.items(), key=lambda x: -x[1]):
    if re.fullmatch(r'[a-zA-Z]+[.,?]?', k):
        continue
    print('{} => {}'.format(k, v))
    cnt += 1
    if cnt > 20:
        break

10 => 4591
2017? => 4050
2018? => 3594
2017 => 3170
2018 => 2768
- => 2559
I’m => 2506
12 => 2448
"The => 2239
don’t => 2089
12th => 1926
20 => 1760
What’s => 1688
15 => 1569
Trump's => 1560
100 => 1490
? => 1384
/ => 1340
30 => 1286
(or => 1129
11 => 1127


In [17]:
oov.most_common(10)

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [18]:
gc.collect()
time.sleep(2)

In [23]:
super_frequent_words = ['a', 'of', 'to', 'and']
def split2(text):
    return (word for word in clean_text(text).split()
            if word not in super_frequent_words)

In [24]:
sentences2 = train_all.question_text.progress_apply(split2).values
vocab2 = build_vocab(sentences2)

100%|██████████| 1306122/1306122 [00:24<00:00, 53514.93it/s]
100%|██████████| 1306122/1306122 [00:08<00:00, 160764.16it/s]


In [25]:
oov2 = check_coverage(vocab2, embeddings_index)

100%|██████████| 270138/270138 [00:00<00:00, 976039.14it/s] 

Found embeddings for 53.80% of vocab
Found embeddings for  98.78% of all words





In [26]:
oov2.most_common(20)

[('bitcoin', 984),
 ('Quorans', 853),
 ('cryptocurrency', 817),
 ('programr', 687),
 ('####)?', 568),
 ('etc.?', 531),
 ('Brexit', 490),
 ('btech', 481),
 ('cryptocurrencies', 481),
 ('blockchain', 479),
 ('(I', 479),
 ('upvotes', 429),
 ('C++', 417),
 ('etc.)', 410),
 ('Redmi', 379),
 ('realise', 371),
 ('defence', 363),
 ('S.?', 362),
 ('KVPY', 349),
 ('Paytm', 334)]