blogs:
http://blog.kaggle.com/2011/11/27/smile-alexander-dyakonov-on-placing-third-in-the-photo-quality-prediction-competition/
http://blog.kaggle.com/2011/11/23/picture-perfect-bo-yang-on-winning-the-photo-quality-prediction-competition/

In [49]:
import numpy as np 
import pandas as pd 
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

In [50]:
train = pd.read_csv("../data/training.csv")
max_elem = 2151

In [51]:
def generate_word_list(train_ling):
    word_list = [str(line).split() for line in train_ling]
    refined_list = []
    for str_list in word_list:
        int_list = [int(elem) for elem in str_list if elem != "nan"]
        refined_list += [int_list]
    refined_list = np.array(refined_list)
    return refined_list

In [76]:
name_np = generate_word_list(train.name)
description_np = generate_word_list(train.description)
caption_np = generate_word_list(train.caption)

In [77]:
def get_average_score_by_word(word_list, score):
    agg_matrix = np.zeros((max_elem,))
    count_matrix = np.zeros((max_elem,))
    for i, int_list in enumerate(word_list):
        for num in int_list:
            agg_matrix[num-1] += score[i]
            count_matrix[num-1] += 1
    return agg_matrix / count_matrix

In [78]:
name_word_score = get_average_score_by_word(name_np, train.good)
description_word_score = get_average_score_by_word(description_np, train.good)
caption_word_score = get_average_score_by_word(caption_np, train.good)

In [79]:
# <2151 words> + <sum> + <avg>
def build_word_matrix(word_list, word_score_matrix):
    word_np = np.zeros((word_list.shape[0], max_elem))
    for i, int_list in enumerate(word_list):
        for num in int_list:
            word_np[i][num-1] = word_score_matrix[num-1]
    sum_matrix = np.zeros((word_np.shape[0],))
    count_matrix = np.zeros((word_np.shape[0],))
    for i in range(word_np.shape[0]):
        sum_matrix[i] = np.sum(word_np[i])
        count_matrix[i] = np.sum(np.count_nonzero(word_np[i]))
    avg_matrix = sum_matrix / count_matrix
    word_np = np.concatenate((word_np, sum_matrix.reshape(sum_matrix.shape[0], 1), 
                              avg_matrix.reshape(avg_matrix.shape[0], 1)), axis=1)
    return word_np

In [80]:
name_matrix = build_word_matrix(name_np, name_word_score)
description_matrix = build_word_matrix(description_np, description_word_score)
caption_matrix = build_word_matrix(caption_np, caption_word_score)

In [84]:
X_no_ling = train[["latitude", "longitude", "width", "height", "size"]]
y = train.good
X_full = np.concatenate((X_no_ling.values, name_matrix, description_matrix, caption_matrix), axis=1)
X_full = np.nan_to_num(X_full)

In [101]:
import xgboost as xgb

def xgb_benchmark(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    evallist  = [(dtest,'eval'), (dtrain,'train')]
    param = {'bst:max_depth':20, 'bst:eta':0.05, 'silent':1, 
             'objective':'binary:logistic', 'early_stopping_rounds':10,
             'subsample': 0.5, 'colsample_bytree': 0.1, 'min_child_weight': 1.05
            }
    param['nthread'] = 4
    param['eval_metric'] = 'auc'
    num_round = 500
    bst = xgb.train(param, dtrain, num_round, evallist )
    bst.save_model("../data/xgb.model")
    pred = bst.predict(dtest)
    pred[pred < 0.5] = 0
    pred[pred >= 0.5] = 1
    return (1 - accuracy_score(y_test, pred)), bst

In [102]:
acc, bst = xgb_benchmark(X_full, y)
print acc

[0]	eval-auc:0.614218	train-auc:0.635787
[1]	eval-auc:0.637034	train-auc:0.659805
[2]	eval-auc:0.711071	train-auc:0.754160
[3]	eval-auc:0.720541	train-auc:0.760729
[4]	eval-auc:0.771393	train-auc:0.806697
[5]	eval-auc:0.775296	train-auc:0.809672
[6]	eval-auc:0.789877	train-auc:0.831516
[7]	eval-auc:0.784927	train-auc:0.835379
[8]	eval-auc:0.786054	train-auc:0.834954
[9]	eval-auc:0.788586	train-auc:0.837047
[10]	eval-auc:0.797603	train-auc:0.848808
[11]	eval-auc:0.796848	train-auc:0.847623
[12]	eval-auc:0.801983	train-auc:0.851707
[13]	eval-auc:0.804926	train-auc:0.856205
[14]	eval-auc:0.806658	train-auc:0.858570
[15]	eval-auc:0.807521	train-auc:0.864716
[16]	eval-auc:0.808064	train-auc:0.864608
[17]	eval-auc:0.807799	train-auc:0.864183
[18]	eval-auc:0.813749	train-auc:0.870604
[19]	eval-auc:0.814224	train-auc:0.870128
[20]	eval-auc:0.814518	train-auc:0.870180
[21]	eval-auc:0.814453	train-auc:0.870411
[22]	eval-auc:0.815314	train-auc:0.870336
[23]	eval-auc:0.819937	train-auc:0.879028
[2

0.193798449612


[499]	eval-auc:0.839899	train-auc:0.982410


loollll this would actually be a top 20 finish.....