https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings

In [31]:
from collections import Counter
import gc
from pathlib import Path
import re

import numpy as np
import pandas as pd
import gensim

from tqdm import tqdm
tqdm.pandas()

In [5]:
# Location of the Quora competition files, resolved under the current user's home directory.
data_dir = Path.home().joinpath('Desktop/kaggle/data/quora')

# Both splits are read straight from the zipped CSV files.
train_all = pd.read_csv(data_dir.joinpath('train.csv.zip'))
test_all = pd.read_csv(data_dir.joinpath('test.csv.zip'))

# Report (rows, columns) for each split as a quick sanity check.
for label, frame in (("Train shape : ", train_all), ("Test shape : ", test_all)):
    print(label, frame.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


In [6]:
def build_vocab(sentences, verbose =  True):
    """
    Count how often every word occurs across a collection of tokenised sentences.

    :param sentences: list of list of words
    :param verbose: when true, a tqdm progress bar is shown while iterating the sentences
    :return: Counter mapping each word to its number of occurrences
    """
    progress = tqdm(sentences, disable=not verbose)
    # Flatten sentence -> word lazily so the progress bar still advances once per sentence.
    word_counts = Counter(word for tokens in progress for word in tokens)
    return word_counts

In [7]:
# Tokenise every question on whitespace only (no lower-casing, no punctuation handling),
# then count the resulting raw tokens.
sentences = (
    train_all["question_text"]
    .progress_apply(lambda text: text.split())
    .values
)
vocab = build_vocab(sentences)

100%|██████████| 1306122/1306122 [00:03<00:00, 369153.23it/s]
100%|██████████| 1306122/1306122 [00:05<00:00, 237625.42it/s]


In [8]:
# Peek at the five most frequent raw tokens and their counts.
vocab.most_common(5)

[('the', 653948),
 ('What', 417802),
 ('to', 403183),
 ('a', 402682),
 ('in', 363131)]

In [9]:
# Pre-trained GoogleNews word2vec vectors, stored in the binary word2vec format.
news_path = data_dir.joinpath('GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin')
embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(
    news_path,
    binary=True,
)

In [17]:
def check_coverage(vocab, embeddings_index):
    """
    Report how much of a vocabulary is covered by an embedding index.

    Prints two percentages: the share of distinct vocabulary entries that have
    an embedding, and the share of total occurrences (sum of counts) that they
    account for. An empty vocabulary, or one whose counts sum to zero, is
    reported as 0.00% instead of raising ZeroDivisionError.

    :param vocab: mapping of word -> occurrence count
    :param embeddings_index: any container supporting ``word in embeddings_index``
    :return: Counter of the words that have no embedding, with their counts
    """
    oov = Counter()
    covered_words = 0   # distinct vocabulary entries found in the embeddings
    covered_count = 0   # occurrences belonging to those entries
    all_count = 0       # occurrences over the whole vocabulary
    for word, count in tqdm(vocab.items()):
        all_count += count
        if word in embeddings_index:
            covered_words += 1
            covered_count += count
        else:
            oov[word] = count

    # Guard both ratios: dividing by len(vocab) or all_count fails when they are zero.
    vocab_ratio = covered_words / len(vocab) if len(vocab) else 0.0
    word_ratio = covered_count / all_count if all_count else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_ratio))
    print('Found embeddings for  {:.2%} of all words'.format(word_ratio))
    return oov

In [18]:
# Measure embedding coverage of the raw whitespace-split vocabulary and keep the
# uncovered words (with their counts) for inspection in the cells that follow.
oov = check_coverage(vocab, embeddings_index)

100%|██████████| 508823/508823 [00:00<00:00, 915609.84it/s]

Found embeddings for 24.31% of vocab
Found embeddings for  78.75% of all text





In [51]:
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty string counts as ASCII.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

# List the single-character, ASCII, non-alphanumeric tokens (i.e. punctuation and
# symbols) that have their own vector in the embedding.
#
# gensim 4.0 removed KeyedVectors.vocab (accessing it raises AttributeError) in
# favour of key_to_index, so prefer the new attribute and fall back to .vocab
# only on gensim 3.x, where key_to_index does not exist.
embedding_words = getattr(embeddings_index, 'key_to_index', None)
if embedding_words is None:
    embedding_words = embeddings_index.vocab

cnt = 0
for word in embedding_words:
    if len(word) != 1:
        continue
    if not is_ascii(word):
        continue
    if not re.fullmatch(r'[^a-zA-Z0-9]', word):
        continue
    print(word)
    cnt += 1
    # Safety cap on the amount of output; cnt is tested after the increment.
    if cnt > 100:
        break

#
_
@
`
>
~
*
^
$
+
%
&
=


In [32]:
# Show the most frequent out-of-vocabulary tokens that are NOT plain words:
# a token made only of letters, optionally followed by one of . , ? is skipped.
cnt = 0
for k, v in sorted(oov.items(), key=lambda item: item[1], reverse=True):
    if re.fullmatch(r'[a-zA-Z]+[.,?]?', k) is None:
        print('{} => {}'.format(k, v))
        cnt += 1
        # cnt is tested after the increment, so at most 21 rows are printed.
        if cnt > 20:
            break

In [37]:
# Walk the out-of-vocabulary tokens from most to least frequent and print the
# ones that do not look like an ordinary word (letters, optionally followed by
# one of . , ?). The saved output is dominated by numbers and years,
# curly-apostrophe contractions and tokens with punctuation attached.
cnt = 0
for k, v in sorted(oov.items(), key=lambda pair: -pair[1]):
    looks_like_word = re.fullmatch(r'[a-zA-Z]+[.,?]?', k)
    if looks_like_word:
        continue
    print('{} => {}'.format(k, v))
    cnt += 1
    if cnt >= 21:
        # Stop once 21 rows have been shown.
        break

10 => 4591
2017? => 4050
2018? => 3594
2017 => 3170
2018 => 2768
- => 2559
I’m => 2506
12 => 2448
"The => 2239
don’t => 2089
12th => 1926
20 => 1760
What’s => 1688
15 => 1569
Trump's => 1560
100 => 1490
? => 1384
/ => 1340
30 => 1286
(or => 1129
11 => 1127


In [19]:
# Ten most frequent out-of-vocabulary tokens overall. In the saved output the
# list is led by 'to', 'a', 'of' and 'and', followed by words with a trailing '?'.
oov.most_common(10)

[('to', 403183),
 ('a', 402682),
 ('of', 330825),
 ('and', 251973),
 ('India?', 16384),
 ('it?', 12900),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202)]

In [20]:
# Force a garbage-collection pass; the integer displayed as the cell output is
# the value returned by gc.collect().
gc.collect()

1405

In [None]:
def split2(text):
    """Split ``text`` on runs of whitespace and return the resulting list of tokens."""
    tokens = text.split()
    return tokens

In [None]:
# Re-tokenise the questions with the named splitter and rebuild the vocabulary,
# replacing the earlier `sentences` and `vocab`.
sentences = (
    train_all["question_text"]
    .progress_apply(split2)
    .values
)
vocab = build_vocab(sentences)