In [1]:
from collections import Counter
import gc
import itertools
from pathlib import Path
import re
import time

import numpy as np
import pandas as pd
import gensim

from tqdm import tqdm
tqdm.pandas()

In [2]:
data_dir = Path.home() / 'Desktop/kaggle/data/quora'
train_all = pd.read_csv(data_dir / 'train.csv.zip')
print("Train shape:", train_all.shape)

Train shape: (1306122, 3)


In [3]:
def build_vocab(sentences, verbose =  True):
    vocab = Counter()
    for sentence in tqdm(sentences, disable=(not verbose)):
        for word in sentence:
            vocab[word] += 1
    return vocab

In [4]:
sentences1 = train_all.question_text.progress_apply(lambda x: x.split()).values
vocab1 = build_vocab(sentences1)

100%|██████████| 1306122/1306122 [00:03<00:00, 327255.95it/s]
100%|██████████| 1306122/1306122 [00:06<00:00, 216808.04it/s]


In [5]:
vocab1.most_common(10)

[('the', 653948),
 ('What', 417802),
 ('to', 403183),
 ('a', 402682),
 ('in', 363131),
 ('is', 331647),
 ('of', 330825),
 ('I', 306261),
 ('How', 261930),
 ('and', 251973)]

In [6]:
GLOVE_PATH = data_dir / 'glove.840B.300d/glove.840B.300d.txt'
GLOVE_NUM_LINES = 2196017

def key_value_pair(word, *vec):
    return word, np.asarray(vec, dtype='float32')

with GLOVE_PATH.open() as f:
    embeddings_index = dict(key_value_pair(*line.split(' '))
                            for line in tqdm(f, total=GLOVE_NUM_LINES))

100%|██████████| 2196017/2196017 [01:38<00:00, 22317.93it/s]


In [7]:
list(itertools.islice(embeddings_index.keys(), 10))

['Crats',
 'Asafoetida',
 'napping',
 '920-481',
 'Madelyn',
 'Tsimshatsui',
 '25By',
 'augmenté',
 'tazed',
 'beltre']

In [8]:
def check_coverage(vocab, embeddings_index):
    covered_vocab = set()
    oov = Counter()
    covered_count = 0
    all_count = 0
    for word, count in tqdm(vocab.items()):
        all_count += count
        if word in embeddings_index:
            covered_vocab.add(word)
            covered_count += count
        else:
            oov[word] = count

    print('Found embeddings for {:.2%} of vocab'.format(len(covered_vocab) / len(vocab)))
    print('Found embeddings for  {:.2%} of all words'.format(covered_count / all_count))
    return oov

In [9]:
oov1 = check_coverage(vocab1, embeddings_index)

100%|██████████| 508823/508823 [00:00<00:00, 1096265.60it/s]

Found embeddings for 33.16% of vocab
Found embeddings for  88.16% of all words





In [10]:
oov1.most_common(10)

[('India?', 16384),
 ('it?', 12900),
 ("What's", 12425),
 ('do?', 8753),
 ('life?', 7753),
 ('you?', 6295),
 ('me?', 6202),
 ('them?', 6140),
 ('time?', 5716),
 ('world?', 5386)]

In [11]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

cnt = 0
for word in embeddings_index.keys():
    if len(word) != 1:
        continue
    if not is_ascii(word):
        continue
    if not re.fullmatch(r'[^a-zA-Z0-9]+', word):
        continue
    print(word, end=' ')
    cnt += 1
    if cnt > 1000:
        break        

< + * . { ; ) [ " } = > ( - % | : ! / @ & ' , $ ^ ~ ] ` ? _ # \ 

In [12]:

for s in ['a+b', 'aa+bb+c', 'a++b', 'aa?', ' (I']:
    print(punct_covered.sub(lambda m: ' {} '.format(m.group(2)), s))

NameError: name 'punct_covered' is not defined

In [None]:
punct_beginning = re.compile(r'(^|\b|\s)([("])')
punct_ending = re.compile(r'([.,?)])($|\b|\s)')
punct_covered = re.compile(r'(^|\b|\s)([&./!<}+?)_(;~@*#:={$`|>^",%\[\]\'\\-])($|\b|\s|\?$)')

def clean_text(x):
    x = str(x)
    x = x.replace("’", "'")
    x = x.replace("“", '"')
    x = punct_beginning.sub(lambda m: ' {} '.format(m.group(2)), x)
    x = punct_ending.sub(lambda m: ' {} '.format(m.group(1)), x)
    x = punct_covered.sub(lambda m: ' {} '.format(m.group(2)), x)
    return x

In [None]:
def split2(text):
    return clean_text(text).split()

In [None]:
sentences2 = train_all.question_text.progress_apply(split2).values
vocab2 = build_vocab(sentences2)
oov2 = check_coverage(vocab2, embeddings_index)

In [None]:
oov2.most_common(30)

In [None]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

cnt = 0
for word in embeddings_index.vocab:
    if len(word) not in [1]:
        continue
    if not is_ascii(word):
        continue
    if not re.fullmatch(r'[^a-zA-Z0-9]+', word):
        continue
    print(word, end=' ')
    cnt += 1
    if cnt > 100:
        break        