In [1]:
import collections
import itertools
from pathlib import Path
import re

import numpy as np
import pandas as pd

In [2]:
data_dir = Path.home() / 'Desktop/kaggle/quora'

[str(p) for p in data_dir.iterdir()]

['/usr/local/google/home/maekawa/Desktop/kaggle/quora/.ipynb_checkpoints',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/embeddings.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_length.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_submission.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_resubmissions.csv',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_length.pickle']

In [3]:
train_all = pd.read_csv(data_dir / 'train.csv.zip')

In [4]:
samples = train_all[np.random.rand(len(train_all)) < 0.2]

## word counter

In [5]:
questions_split = train_all.question_text.str.lower().str.split()

In [6]:
counter_unigram_all = collections.Counter()
counter_unigram_pos = collections.Counter()
counter_bigram_all = collections.Counter()
counter_bigram_pos = collections.Counter()
counter_trigram_all = collections.Counter()
counter_trigram_pos = collections.Counter()

for words, target in zip(questions_split, train_all.target):
    for i in range(len(words)):
        w = words[i]
        w = re.sub(r'^[^a-z]+', '', w)
        w = re.sub(r'[^a-z]+$', '', w)
        words[i] = w

        counter_unigram_all[w] += 1
        if target == 1:
            counter_unigram_pos[w] += 1
        
        if i > 0:
            bg = (words[i-1], w)
            counter_bigram_all[bg] += 1
            if target == 1:
                counter_bigram_pos[bg] += 1
        if i > 1:
            tg = (words[i-2], words[i-1], w)
            counter_trigram_all[tg] += 1
            if target == 1:
                counter_trigram_pos[tg] += 1

In [21]:
MIN_QUESTION_COUNT = 400
MIN_POSITIVE_RATIO = 0.2
NUM_KEYWORDS = 100
NUM_TRIGRAMS = 0
NUM_BIGRAMS = 0

trigram_pos_ratios = []

for w, c in counter_trigram_all.items():
    if c < MIN_QUESTION_COUNT:
        continue
    ratio = counter_trigram_pos[w] / c
    if ratio < MIN_POSITIVE_RATIO:
        continue
    trigram_pos_ratios.append((ratio, c, w))
trigram_pos_ratios.sort(reverse=True)
print(len(trigram_pos_ratios[:NUM_TRIGRAMS]))
trigram_pos_ratios = trigram_pos_ratios[:NUM_TRIGRAMS]
trigram_pos_ratios

0


[]

In [22]:
unigram_set = set()
bigram_set = set()
for _, _, (w1, w2, w3) in trigram_pos_ratios[:NUM_TRIGRAMS]:
    unigram_set.add(w1)
    unigram_set.add(w2)
    unigram_set.add(w3)
    bigram_set.add((w1, w2))
    bigram_set.add((w2, w3))

bigram_pos_ratios = []

for w, c in counter_bigram_all.items():
    if c < MIN_QUESTION_COUNT:
        continue
    if w in bigram_set:
        continue
    ratio = counter_bigram_pos[w] / c
    if ratio < MIN_POSITIVE_RATIO:
        continue
    bigram_pos_ratios.append((ratio, c, w))

print(len(bigram_pos_ratios[:NUM_BIGRAMS]))
bigram_pos_ratios.sort(reverse=True)
bigram_pos_ratios = bigram_pos_ratios[:NUM_BIGRAMS]
bigram_pos_ratios

0


[]

In [23]:
unigram_set = set()
for _, _, (w1, w2) in bigram_pos_ratios[:NUM_BIGRAMS]:
    unigram_set.add(w1)
    unigram_set.add(w2)

unigram_pos_ratios = []

for w, c in counter_unigram_all.items():
    if c < MIN_QUESTION_COUNT:
        continue
    if w in unigram_set:
        continue
    ratio = counter_unigram_pos[w] / c
    if ratio < MIN_POSITIVE_RATIO:
        continue
    unigram_pos_ratios.append((ratio, c, (w,)))

unigram_pos_ratios.sort(reverse=True)
num_unigrams = NUM_KEYWORDS - len(trigram_pos_ratios) - len(bigram_pos_ratios)
print(num_unigrams)
unigram_pos_ratios = unigram_pos_ratios[:num_unigrams]
unigram_pos_ratios

100


[(0.7913279132791328, 738, ('blacks',)),
 (0.7618702428416093, 2759, ('liberals',)),
 (0.7456647398843931, 519, ('whites',)),
 (0.7375366568914956, 682, ('feminists',)),
 (0.7138157894736842, 608, ('fuck',)),
 (0.7131825703254274, 1813, ('democrats',)),
 (0.6930398572278406, 5043, ('muslims',)),
 (0.6754161331626121, 1562, ('hindus',)),
 (0.652452025586354, 469, ('ignorant',)),
 (0.6360544217687075, 588, ('palestinians',)),
 (0.6311000827129859, 1209, ('hillary',)),
 (0.6238938053097345, 452, ('shit',)),
 (0.6196707471506965, 2369, ('jews',)),
 (0.6187245590230664, 737, ('asians',)),
 (0.6049069373942471, 1182, ('supporters',)),
 (0.6040372670807453, 1288, ('atheists',)),
 (0.6029143897996357, 549, ('pakistanis',)),
 (0.5952380952380952, 504, ('holocaust',)),
 (0.5951787198669992, 1203, ('conservatives',)),
 (0.5925414364640884, 724, ('terrorists',)),
 (0.5914893617021276, 470, ('raped',)),
 (0.5907194994786236, 1918, ('christians',)),
 (0.5870936438622029, 2061, ('racist',)),
 (0.5800

In [24]:
keywords = ['_'.join(words) for _, _, words in itertools.chain(
    unigram_pos_ratios, bigram_pos_ratios, trigram_pos_ratios)]
keyword_set = set(keywords)

In [25]:
def process(df):
    questions_split = (
        df.question_text
        .str.lower()
        .str.split()
    )
    def to_counter(words):
        counter = collections.Counter()
        counter['__question_len__'] += len(words)
        for i in range(len(words)):
            w = words[i]
            w = re.sub(r'^[^a-z]+', '', w)
            w = re.sub(r'[^a-z]+$', '', w)
            words[i] = w
            if w in keyword_set:
                counter[w] += 1
            if i > 0:
                bg = '{}_{}'.format(words[i-1], w)
                if bg in keyword_set:
                    counter[bg] += 1
            if i > 1:
                tg = '{}_{}_{}'.format(words[i-2], words[i-1], w)
                if tg in keyword_set:
                    counter[tg] += 1
        return counter
    counters = questions_split.map(to_counter)

    for keyword in keywords:
        df.loc[:, 'kw_' + keyword] = counters.map(
            lambda ctr: ctr[keyword] / ctr['__question_len__'])
    df = df.drop(columns=['question_text'])
    return df

In [26]:
#process(samples.copy()).head().T

In [27]:
%%time

train_processed = process(train_all.copy())

CPU times: user 3min 39s, sys: 17.4 s, total: 3min 57s
Wall time: 1min 39s


In [28]:
%%time

nonzeros = (
    train_processed
    .drop(columns=['qid', 'target'])
    .apply(lambda row: row.sum() > 0.0, axis=1)
)
print('{} questions have at least one keywords'.format(nonzeros.sum()))
print('{} coverage'.format(nonzeros.sum() / len(nonzeros)))

110625 questions have at least one keywords
0.08469729473969506 coverage
CPU times: user 53.2 s, sys: 1.3 s, total: 54.5 s
Wall time: 53.3 s


In [29]:
train_processed.to_pickle(data_dir / 'train_with_keywords.pickle')