In [3]:
%matplotlib inline
import pandas as pd

# Load the raw train/test CSVs, then report each frame's shape and column index.
raw_train_df, raw_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
for message, value in (
    ('train csv shape: {}', raw_train_df.shape),
    ('train columns: {}', raw_train_df.columns),
    ('test csv shape: {}', raw_test_df.shape),
    ('test columns: {}', raw_test_df.columns),
):
    print(message.format(value))


train csv shape: (159571, 8)
train columns: Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')
test csv shape: (153164, 2)
test columns: Index(['id', 'comment_text'], dtype='object')


In [4]:
# Name of the free-text column and of the six binary label columns in train.csv.
comment_col = 'comment_text'
target_cols = ['toxic',
               'severe_toxic',
               'obscene',
               'threat',
               'insult',
               'identity_hate']

# Confirm every label value is 0 or 1.
# isin(...).all().all() reduces the whole frame to a single bool. Calling the
# builtin all() on a DataFrame only iterates its column labels (truthy
# strings), so such a check would pass regardless of the data.
assert raw_train_df[target_cols].isin([0, 1]).all().all(), \
    'target columns must contain only 0/1 values'

# Fraction of positive rows per label, most frequent first.
print(raw_train_df[target_cols].mean().sort_values(ascending=False).round(3))



toxic            0.096
obscene          0.053
insult           0.049
severe_toxic     0.010
identity_hate    0.009
threat           0.003
dtype: float64


In [36]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
# Fit an uncapped tokenizer on the train and test comments together to measure
# the full vocabulary size.
tokenizer = Tokenizer()
# pd.concat(..., ignore_index=True) is the supported equivalent of
# Series.append(...).reset_index(drop=True); Series.append was deprecated in
# pandas 1.4 and removed in pandas 2.0.
train_test_comment_text = pd.concat(
    [raw_train_df[comment_col], raw_test_df[comment_col]],
    ignore_index=True,
)
tokenizer.fit_on_texts(train_test_comment_text)
print('vocab size: {}'.format(len(tokenizer.word_index)))


vocab size: 394787


In [37]:
# Quantiles of the values in tokenizer.word_counts (presumably occurrences per
# word), used to judge how much of the vocabulary is rare.
word_count_quantiles = pd.Series(tokenizer.word_counts).quantile(
    [0.5, .75, 0.8, 0.85, .9, .95, .99]
)
print(word_count_quantiles)


0.50      1.0
0.75      3.0
0.80      4.0
0.85      6.0
0.90     11.0
0.95     32.0
0.99    377.0
dtype: float64


In [39]:
# Re-fit with num_words=100000 (see the Keras Tokenizer docs for how num_words
# limits the vocabulary), then convert each split's comments to sequences.
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(train_test_comment_text)

train_sequences = tokenizer.texts_to_sequences(raw_train_df['comment_text'])
test_sequences = tokenizer.texts_to_sequences(raw_test_df['comment_text'])

# Sequence-length quantiles: train first, then test.
for split_sequences in (train_sequences, test_sequences):
    sequence_lengths = pd.Series([len(seq) for seq in split_sequences])
    print(sequence_lengths.quantile([0.5, .75, 0.8, 0.85, .9, .95, .99]))




0.50     36.0
0.75     75.0
0.80     91.0
0.85    114.0
0.90    152.0
0.95    230.0
0.99    572.0
dtype: float64
0.50     30.0
0.75     65.0
0.80     79.0
0.85    100.0
0.90    133.0
0.95    209.0
0.99    561.0
dtype: float64
