In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
base_dir = "./input"
print(os.listdir(base_dir))

# Any results you write to the current directory are saved as output.

['embeddings', 'train.csv', 'test.csv', 'sample_submission.csv']


In [2]:
## some config values 
embed_size = 300 # how big is each word vector
max_features = 95000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use

**Load packages and data**

In [3]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, GRU, LSTM
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras.callbacks import Callback
from keras import initializers, regularizers, constraints, optimizers, layers

import tensorflow as tf
import keras

Using TensorFlow backend.


In [4]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [5]:
# Don't hog GPU
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [6]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, " " + punct + " ")
    return x

In [7]:
def gen_metadata(df):
    # Pure statistical features
    df['length'] = df['question_text'].progress_apply(lambda x : len(x))
    df['capitals'] = df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.progress_apply(lambda row: float(row['capitals'])/float(row['length']),axis=1)
    df['num_math'] = df['question_text'].progress_apply(lambda comment: sum(comment.count(w) for w in ('+','-','*','/','%',')','(','^',')','=','<','>')))
    df['num_exclamation_marks'] = df['question_text'].progress_apply(lambda comment: comment.count('!'))
    df['num_question_marks'] = df['question_text'].progress_apply(lambda comment: comment.count('?'))
    df['num_punctuation'] = df['question_text'].progress_apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    df['num_symbols'] = df['question_text'].progress_apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
    df['num_words'] = df['question_text'].progress_apply(lambda comment: len(comment.split()))
    df['num_unique_words'] = df['question_text'].progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    df['num_smilies'] = df['question_text'].progress_apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    df['num_sad'] = df['question_text'].progress_apply(lambda comment: sum(comment.count(w) for w in (':-<', ':()', ';-()', ';(')))
    df['num_chars'] =    df['question_text'].progress_apply(len)

    # More Handy Features
    df["count_words_title"] = df["question_text"].progress_apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df["mean_word_len"] = df["question_text"].progress_apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    df['punct_percent']= df['num_punctuation']*100/df['num_words']


In [8]:
def load_and_prec():
    train_df = pd.read_csv(base_dir + "/train.csv")
    test_df = pd.read_csv(base_dir + "/test.csv")
    
    # Generate metadata
    gen_metadata(test_df)
    gen_metadata(train_df)
        
    # Concatenate all meta-features into one list of features
    metadata_keys = ['length', 'capitals', 'caps_vs_length', 'num_math',
                     'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
                     'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
                     'num_smilies', 'num_sad', 'num_chars', 'count_words_title',
                     'mean_word_len', 'punct_percent']
    test_meta  = create_metadata_feature(test_df, metadata_keys)
    train_meta = create_metadata_feature(train_df, metadata_keys)
    
    print("Train shape : ",train_meta.shape)
    print("Test shape : ",test_meta.shape)

    ## Get the target values
    train_y = train_df['target'].values
    
    return train_meta, test_meta, train_y

In [9]:
def create_metadata_feature(df, keys):
    data = []
    for key in tqdm(keys):
        data.append(df[key].values)
    data = np.array(data).T
    return data

**Load data**

In [10]:
import time
start_time = time.time()
train_X, test_X, train_y = load_and_prec()
print("Time taken to process data :", time.time() - start_time)

100%|██████████| 56370/56370 [00:00<00:00, 358040.19it/s]
100%|██████████| 56370/56370 [00:00<00:00, 91037.17it/s]
100%|██████████| 56370/56370 [00:01<00:00, 31328.05it/s]
100%|██████████| 56370/56370 [00:00<00:00, 136740.01it/s]
100%|██████████| 56370/56370 [00:00<00:00, 380065.90it/s]
100%|██████████| 56370/56370 [00:00<00:00, 431785.74it/s]
100%|██████████| 56370/56370 [00:00<00:00, 182834.87it/s]
100%|██████████| 56370/56370 [00:00<00:00, 215323.07it/s]
100%|██████████| 56370/56370 [00:00<00:00, 238330.50it/s]
100%|██████████| 56370/56370 [00:00<00:00, 147945.80it/s]
100%|██████████| 56370/56370 [00:00<00:00, 243183.45it/s]
100%|██████████| 56370/56370 [00:00<00:00, 296506.65it/s]
100%|██████████| 56370/56370 [00:00<00:00, 742174.09it/s]
100%|██████████| 56370/56370 [00:00<00:00, 261795.68it/s]
100%|██████████| 56370/56370 [00:00<00:00, 60668.07it/s]
100%|██████████| 1306122/1306122 [00:02<00:00, 526663.61it/s]
100%|██████████| 1306122/1306122 [00:13<00:00, 99915.57it/s] 
100%|████

Train shape :  (1306122, 17)
Test shape :  (56370, 17)
Time taken to process data : 156.06918740272522


In [11]:
# Calculate F-1 score
def f1_score(y_true, y_pred):

    # Count positive samples.
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))

    # If there are no true samples, fix the F1 score at 0.
    if c3 == 0:
        return 0

    # How many selected items are relevant?
    precision = c1 / c2

    # How many relevant items are selected?
    recall = c1 / c3

    # Calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [12]:
print(np.sum(train_y==1))
print(np.sum(train_y==0))

80810
1225312


In [13]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# clf = SVC(gamma='auto', probability=True)
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(train_X, train_y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [14]:
from sklearn.metrics import f1_score
f1s = []
for threshold in tqdm(range(int(1e2))):
    f1s.append(f1_score(train_y, (clf.predict_proba(train_X)[:,1] < threshold * 1e-2).astype(int)))
print(f1s)

  'precision', 'predicted', average, warn_for)
100%|██████████| 100/100 [00:33<00:00,  2.96it/s]

[0.0, 0.0, 0.0, 0.04333943562652594, 0.043532329833120956, 0.058881513011283915, 0.06618023043972161, 0.06865876270840207, 0.07947090237264433, 0.08226520237133385, 0.08436348728227262, 0.08522481830682388, 0.08538553417171467, 0.08598296022848236, 0.0954586974393326, 0.0954586974393326, 0.096052732452182, 0.096052732452182, 0.096052732452182, 0.09788632084399372, 0.09788632084399372, 0.09788632084399372, 0.10526851218419003, 0.10526851218419003, 0.10526851218419003, 0.1091482350536983, 0.1091482350536983, 0.1091482350536983, 0.1091482350536983, 0.1091482350536983, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.1111796633768846, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.11644067246644338, 0.1164406724664433




In [15]:
pred_test_y = clf.predict_proba(test_X)[:,1]
print(np.sum(preds > 0.33))

NameError: name 'preds' is not defined

In [None]:
best_threshold = 0.3
pred_test_y = (pred_test_y > best_threshold).astype(int)
test_df = pd.read_csv(base_dir +"/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)