In [60]:
import numpy as np 
import pandas as pd 
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
train = pd.read_csv("../data/training.csv")

In [43]:
def generate_word_matrix(train_ling):
    word_list = [str(line).split() for line in train_ling]
    refined_list = []
    max_elem = -1
    for str_list in word_list:
        int_list = [int(elem) for elem in str_list if elem != "nan"]
        for i in int_list:
            if i > max_elem:
                max_elem = i
        refined_list += [int_list]
    refined_list = np.array(refined_list)

    word_np = np.zeros((refined_list.shape[0], max_elem))
    for i, int_list in enumerate(refined_list):
        for num in int_list:
            word_np[i][num-1] = 1
    return word_np

In [47]:
name_np = generate_word_matrix(train.name)
description_np = generate_word_matrix(train.description)
caption_np = generate_word_matrix(train.caption)
print name_np.shape
print description_np.shape
print caption_np.shape

(40262, 2151)
(40262, 2151)
(40262, 2151)


In [59]:
X_no_ling = train[["latitude", "longitude", "width", "height", "size"]]
y = train.good
X_full = np.concatenate((X_no_ling.values, name_np, description_np, caption_np), axis=1)

In [66]:
from sklearn.ensemble import RandomForestClassifier

def benchmark(X, y, max_depth):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    rf = RandomForestClassifier(n_estimators=50, max_depth=max_depth, n_jobs=-1)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_test)
    return 1 - accuracy_score(y_test, pred)

In [67]:
print benchmark(X_full, y, 50)

0.235342816287
None


In [114]:
import xgboost as xgb

def xgb_benchmark(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    evallist  = [(dtest,'eval'), (dtrain,'train')]
    param = {'bst:max_depth':20, 'bst:eta':0.05, 'silent':1, 
             'objective':'binary:logistic', 'early_stopping_rounds':10,
             'subsample': 0.7, 'colsample_bytree': 0.1, 'min_child_weight': 1.05
            }
    param['nthread'] = 4
    param['eval_metric'] = 'auc'
    num_round = 500
    bst = xgb.train(param, dtrain, num_round, evallist )
    bst.save_model("../data/xgb.model")
    pred = bst.predict(dtest)
    pred[pred < 0.5] = 0
    pred[pred >= 0.5] = 1
    return (1 - accuracy_score(y_test, pred)), bst

In [115]:
acc, bst = xgb_benchmark(X_full, y)
print acc

[0]	eval-auc:0.683320	train-auc:0.728685
[1]	eval-auc:0.711972	train-auc:0.759746
[2]	eval-auc:0.720245	train-auc:0.767467
[3]	eval-auc:0.728608	train-auc:0.781279
[4]	eval-auc:0.737777	train-auc:0.787784
[5]	eval-auc:0.740045	train-auc:0.788618
[6]	eval-auc:0.747737	train-auc:0.797262
[7]	eval-auc:0.749427	train-auc:0.799234
[8]	eval-auc:0.752636	train-auc:0.802097
[9]	eval-auc:0.754801	train-auc:0.804191
[10]	eval-auc:0.759667	train-auc:0.812099
[11]	eval-auc:0.762034	train-auc:0.816251
[12]	eval-auc:0.763134	train-auc:0.816563
[13]	eval-auc:0.765471	train-auc:0.819354
[14]	eval-auc:0.767418	train-auc:0.821195
[15]	eval-auc:0.769504	train-auc:0.823186
[16]	eval-auc:0.771805	train-auc:0.825391
[17]	eval-auc:0.773066	train-auc:0.826592
[18]	eval-auc:0.774523	train-auc:0.828606
[19]	eval-auc:0.774844	train-auc:0.829087
[20]	eval-auc:0.777955	train-auc:0.830310
[21]	eval-auc:0.778268	train-auc:0.830774
[22]	eval-auc:0.780592	train-auc:0.831969
[23]	eval-auc:0.782407	train-auc:0.835147
[2

0.202152479868


[498]	eval-auc:0.824480	train-auc:0.947253
[499]	eval-auc:0.824482	train-auc:0.947307


In [121]:
test = pd.read_csv("../data/test.csv")
name_np = generate_word_matrix(test.name)
description_np = generate_word_matrix(test.description)
caption_np = generate_word_matrix(test.caption)
t_X_no_ling = test[["latitude", "longitude", "width", "height", "size"]]
t_X_full = np.concatenate((t_X_no_ling.values, name_np, description_np, caption_np), axis=1)

dtest = xgb.DMatrix(t_X_full)
pred = bst.predict(dtest)
pred[pred < 0.5] = 0
pred[pred >= 0.5] = 1
f = open("../data/submission.csv", 'w')
for i in pred:
    f.write(str(i))
    f.write("\n")