In [1]:
import collections
from pathlib import Path
import re

import numpy as np
import pandas as pd

In [2]:
# Folder holding the data files, resolved relative to the current user's home directory.
data_dir = Path.home().joinpath('Desktop/kaggle/quora')

# Show every entry of the data folder as a plain string path.
list(map(str, data_dir.iterdir()))

['/usr/local/google/home/maekawa/Desktop/kaggle/quora/.ipynb_checkpoints',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/embeddings.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_length.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_submission.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_resubmissions.csv',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_length.pickle']

In [3]:
# Load the training table straight from the zipped CSV inside the data folder.
train_all = pd.read_csv(data_dir.joinpath('train.csv.zip'))

In [4]:
# Keep each row independently with probability 0.2 (roughly a 20% subsample).
# A fixed seed makes the subsample identical on every Restart & Run All;
# the unseeded global RNG produced a different preview on each run.
samples = train_all[np.random.default_rng(seed=0).random(len(train_all)) < 0.2]

## word counter

In [6]:
# Lower-case every question and break it into whitespace-separated tokens
# (one list of tokens per row).
questions_split = (
    train_all['question_text']
    .str.lower()
    .str.split()
)

In [8]:
# Token frequencies over all questions, and split by label (target == 1 vs. the rest).
counter_all = collections.Counter()
counter_pos = collections.Counter()
counter_neg = collections.Counter()

# Compiled once instead of re-specifying the patterns for every single token.
_leading_non_letters = re.compile(r'^[^a-z]+')
_trailing_non_letters = re.compile(r'[^a-z]+$')

for words, target in zip(questions_split, train_all.target):
    # Trim anything that is not a-z from both ends of each token; characters
    # in the middle of a token (hyphens, apostrophes, ...) are kept.
    tokens = [
        _trailing_non_letters.sub('', _leading_non_letters.sub('', w))
        for w in words
    ]
    # Tokens made only of digits/punctuation collapse to '' after trimming;
    # drop them so the empty string is not counted as a word.
    tokens = [t for t in tokens if t]

    counter_all.update(tokens)
    (counter_pos if target == 1 else counter_neg).update(tokens)

In [9]:
# For every word seen at least 10 times, record
# (share of its occurrences that fall in target == 1 questions, total count, word),
# ordered from the highest share downwards (ties broken by count, then word).
pos_neg_ratios = sorted(
    (
        (counter_pos[word] / total, total, word)
        for word, total in counter_all.items()
        if total >= 10
    ),
    reverse=True,
)

In [10]:
# Peek at the 30 top-ranked (ratio, count, word) entries.
pos_neg_ratios[:30]

[(1.0, 12, 'hillary’s'),
 (1.0, 11, 'scumbags'),
 (1.0, 10, 'soetoro'),
 (1.0, 10, 'hypocritically'),
 (1.0, 10, 'fuckers'),
 (0.9523809523809523, 21, 'cunts'),
 (0.9411764705882353, 17, 'ra-apist'),
 (0.9411764705882353, 17, 'drumpf'),
 (0.9375, 16, 'nonwhites'),
 (0.9333333333333333, 15, 'spermatic'),
 (0.9333333333333333, 15, 'ra-aping'),
 (0.9333333333333333, 15, 'asinine'),
 (0.9230769230769231, 13, 'scumbag'),
 (0.9090909090909091, 11, 'peace-loving'),
 (0.9, 10, 'lesbianism'),
 (0.9, 10, 'kaffirs'),
 (0.9, 10, 'jihadist'),
 (0.9, 10, 'israels'),
 (0.9, 10, 'hindi-speaking'),
 (0.9, 10, 'grabbers'),
 (0.9, 10, 'anti-israel'),
 (0.8888888888888888, 27, 'massacred'),
 (0.8823529411764706, 17, 'inferiors'),
 (0.8817204301075269, 186, 'castrate'),
 (0.875, 64, 'moron'),
 (0.875, 16, 'cocks'),
 (0.8571428571428571, 14, 'butthurt'),
 (0.8540145985401459, 274, 'castrated'),
 (0.8333333333333334, 36, 'whores'),
 (0.8333333333333334, 24, 'mohajirs')]

In [11]:
# The 100 top-ranked words, kept in rank order (the word is the third tuple field).
keywords = [entry[2] for entry in pos_neg_ratios[:100]]
# Same words as a set, for constant-time membership checks.
keyword_set = {*keywords}

In [12]:
def process(df, keyword_list=None):
    """Replace ``question_text`` with one occurrence-count column per keyword.

    Each question is lower-cased and split on whitespace; every token then has
    any run of non ``a-z`` characters trimmed from its start and end before it
    is compared against the keywords.

    Args:
        df: DataFrame with a ``question_text`` column. It is NOT modified;
            a new frame is returned.
        keyword_list: Ordered iterable of keywords to count. When omitted,
            the notebook-level ``keywords`` list is used.

    Returns:
        A new DataFrame with ``question_text`` dropped and, appended in keyword
        order, one ``kw_<keyword>`` column holding how many times that keyword
        occurs in each question. Rows with missing text get all-zero counts.
    """
    if keyword_list is None:
        keyword_list = keywords
    # De-duplicate while preserving order so each keyword yields exactly one column.
    keyword_list = list(dict.fromkeys(keyword_list))
    wanted = set(keyword_list)

    def to_counter(words):
        # Count only the tokens that are keywords, after trimming non-letters
        # from both ends of each token.
        counter = collections.Counter()
        for w in words:
            w = re.sub(r'^[^a-z]+', '', w)
            w = re.sub(r'[^a-z]+$', '', w)
            if w in wanted:
                counter[w] += 1
        return counter

    # Missing text becomes '' (-> no tokens) instead of a NaN that cannot be iterated.
    questions_split = (
        df['question_text']
        .fillna('')
        .str.lower()
        .str.split()
    )
    counters = [to_counter(words) for words in questions_split]

    # Build all keyword columns in one go and attach them with a single concat:
    # this leaves the caller's frame untouched and avoids the fragmentation
    # caused by inserting columns one at a time.
    kw_columns = pd.DataFrame(
        {'kw_' + kw: [ctr[kw] for ctr in counters] for kw in keyword_list},
        index=df.index,
    )
    return pd.concat([df.drop(columns=['question_text']), kw_columns], axis=1)

In [13]:
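# Disabled alternative implementation of process(), kept for reference only.
# Unlike the active version it compares raw whitespace tokens to the keywords
# (no trimming of surrounding punctuation) and names the columns
# 'keyword_<word>' instead of 'kw_<word>'.
# def process(df):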
#     questions_split = (
#         df.question_text
#         .str.lower()
#         .str.split()
#     )
#     questions_with_keyword = {}
#     for i, keyword in enumerate(keywords):
#         questions_with_keyword['keyword_' + keyword] = (
#             questions_split
#             .map(lambda words: sum(w == keyword for w in words))
#         )
#     df = df.assign(**questions_with_keyword)
#     df = df.drop(columns=['question_text'])
#     return df


In [14]:
# Run the keyword featurisation on a private copy of the subsample, then show
# the first rows transposed so the many kw_* columns read as rows.
samples = samples.copy().pipe(process)
samples.head().transpose()

Unnamed: 0,54,58,61,64,72
qid,000256f123aa2d179519,00029d76717deaff60f6,0002b9d73d24af32a648,0002c3835360dcb57597,00032cb9dab5b4033cc4
target,0,0,0,0,0
kw_hillary’s,0,0,0,0,0
kw_scumbags,0,0,0,0,0
kw_soetoro,0,0,0,0,0
kw_hypocritically,0,0,0,0,0
kw_fuckers,0,0,0,0,0
kw_cunts,0,0,0,0,0
kw_ra-apist,0,0,0,0,0
kw_drumpf,0,0,0,0,0


In [15]:
# Featurise the full training table; the name is rebound to the processed frame.
train_all = train_all.pipe(process)

In [16]:
# Persist the keyword-featurised training table next to the raw data.
train_all.to_pickle(data_dir.joinpath('train_with_keywords.pickle'))