In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Root folder holding the Kaggle Quora competition files.
data_dir = Path.home().joinpath('Desktop', 'kaggle', 'quora')

# Show what is available in the data folder.
list(map(str, data_dir.iterdir()))

['/usr/local/google/home/maekawa/Desktop/kaggle/quora/.ipynb_checkpoints',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/embeddings.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_length.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/train_with_keywords.pickle',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_submission.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test.csv.zip',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/sample_resubmissions.csv',
 '/usr/local/google/home/maekawa/Desktop/kaggle/quora/test_with_length.pickle']

In [3]:
# pandas reads the zipped CSVs directly (compression is inferred from the
# file extension).
train_all = pd.read_csv(data_dir.joinpath('train.csv.zip'))
test_all = pd.read_csv(data_dir.joinpath('test.csv.zip'))

In [4]:
# Keep each training row independently with probability 0.2 to get a small
# exploration sample. The mask is drawn from a dedicated, seeded generator so
# that Restart & Run All reproduces the same subset; the unseeded global
# np.random.rand produced a different sample on every run.
samples = train_all[np.random.RandomState(42).rand(len(train_all)) < 0.2]

In [5]:
# Peek at the first rows of the exploration sample.
samples.head(n=5)

Unnamed: 0,qid,question_text,target
20,0000dd973dfd35508c16,How I know whether a girl had done sex before ...,0
22,0000e91571b60c2fb487,Has the United States become the largest dicta...,1
23,000101ac65db6e4a1c13,"What is the strangest phenomenon you know of, ...",0
25,00010a2e064c3e8f152a,Can you make Amazon Alexa trigger events in th...,0
37,00019e6de5c31a235308,What does great wit mean?,0


In [6]:
# Lower-case tokens whose presence in a question is turned into a boolean
# feature column by process(). Order determines the output column order.
bad_keywords = [
    # violence
    'kill',
    'killed',
    'killing',
    # profanity / anatomy
    'fuck',
    'fucking',
    'penis',
    # race-related
    'skin',
    'races',
    'racism',
    'racist',
]

In [7]:
def process(df, keywords=None):
    """Replace ``question_text`` with one boolean column per keyword.

    Each question is lower-cased and split on whitespace; the column
    ``keyword_<kw>`` is True when ``<kw>`` is exactly equal to one of the
    resulting tokens. Tokens keep any attached punctuation, so a token such
    as ``'kill?'`` does not match the keyword ``'kill'``.

    Args:
        df: DataFrame with a ``question_text`` column. It is not modified.
        keywords: Iterable of lower-case keywords to flag. Defaults to the
            module-level ``bad_keywords`` list.

    Returns:
        A new DataFrame without ``question_text`` and with one added
        ``keyword_<kw>`` column per keyword, in keyword order.
    """
    if keywords is None:
        keywords = bad_keywords

    # Missing questions become '' so they tokenise to an empty set; without
    # this the membership test below raises TypeError on NaN.
    # Tokens are stored as sets so each keyword lookup is O(1) per question.
    question_tokens = (
        df.question_text
        .fillna('')
        .str.lower()
        .str.split()
        .map(set)
    )
    keyword_columns = {
        'keyword_' + keyword: question_tokens.map(
            lambda tokens, keyword=keyword: keyword in tokens
        )
        for keyword in keywords
    }
    return df.assign(**keyword_columns).drop(columns=['question_text'])

In [8]:
# Preview the tokenisation used for keyword matching: lower-case each
# question, then split it on whitespace (punctuation stays attached).
questions_split = samples['question_text'].str.lower().str.split()
questions_split.head(n=5)

20    [how, i, know, whether, a, girl, had, done, se...
22    [has, the, united, states, become, the, larges...
23    [what, is, the, strangest, phenomenon, you, kn...
25    [can, you, make, amazon, alexa, trigger, event...
37                      [what, does, great, wit, mean?]
Name: question_text, dtype: object

In [9]:
# Swap the raw text for keyword indicator columns on the exploration sample.
# NOTE: `samples` is rebound here, so this cell cannot be re-run without
# first re-creating `samples` (process() needs the question_text column).
samples = samples.pipe(process)
samples.head(n=5)

Unnamed: 0,qid,target,keyword_kill,keyword_killed,keyword_killing,keyword_fuck,keyword_fucking,keyword_penis,keyword_skin,keyword_races,keyword_racism,keyword_racist
20,0000dd973dfd35508c16,0,False,False,False,False,False,False,False,False,False,False
22,0000e91571b60c2fb487,1,False,False,False,False,False,False,False,False,False,False
23,000101ac65db6e4a1c13,0,False,False,False,False,False,False,False,False,False,False
25,00010a2e064c3e8f152a,0,False,False,False,False,False,False,False,False,False,False
37,00019e6de5c31a235308,0,False,False,False,False,False,False,False,False,False,False


In [10]:
# Build the keyword features for the full train and test sets.
# NOTE: both names are rebound, so re-running this cell requires reloading
# the raw CSVs first.
train_all = train_all.pipe(process)
test_all = test_all.pipe(process)

In [11]:
# Persist the keyword feature frames next to the raw data for later notebooks.
train_all.to_pickle(data_dir.joinpath('train_with_keywords.pickle'))
test_all.to_pickle(data_dir.joinpath('test_with_keywords.pickle'))