In [12]:
import os
import sys
import operator
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble, linear_model, grid_search
from sklearn.metrics import log_loss
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import math

In [2]:
pd.options.display.max_columns = 999

In [3]:
train_df = pd.read_csv('./input/Working_Traning.csv')
test_df = pd.read_csv('./input/Working_Test.csv')
y = train_df['interest_level']

In [4]:
train_df

Unnamed: 0,listing_id,bathrooms,bedrooms,created,latitude,longitude,price,interest_level,month,photos_num,features_num,desc_chars,hour,dayOfMonth,dayOfWeek,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust,mean_g,sd_r,mean_b,mean_r,sd_g,sd_b,features_high,features_low,features_med
0,6811957,1.0,1,2016-04-01 22:12:41,40.7302,-73.9924,3195,high,April,5,3,94,22,1,5,0,0,0,0,0,0,1,0,0,0,177.642274,38.385079,174.315374,181.162631,43.713107,49.814755,0.071210,0.758886,0.169904
1,6811965,1.0,0,2016-04-01 22:56:00,40.7576,-73.9677,2000,medium,April,2,4,357,22,1,5,0,0,0,0,0,0,1,0,0,0,85.647753,36.857728,71.968565,94.369002,40.335422,46.010138,0.071210,0.758886,0.169904
2,6811966,2.0,3,2016-04-01 22:57:15,40.7388,-73.9851,5850,high,April,7,9,411,22,1,5,0,1,0,0,1,0,4,0,1,2,131.500834,72.843366,115.019069,144.347497,74.783355,74.760640,0.020727,0.886146,0.093127
3,6811973,1.0,1,2016-04-01 23:26:07,40.7939,-73.9738,2745,medium,April,5,7,816,23,1,5,0,1,0,0,2,0,7,0,0,3,162.906757,30.895741,138.270958,192.543785,43.396042,57.456460,0.071210,0.758886,0.169904
4,6811975,1.0,1,2016-04-02 00:48:13,40.7784,-73.9491,2400,medium,April,7,7,535,0,2,6,0,1,0,0,2,0,6,0,3,2,177.342636,46.739983,167.007626,180.674098,50.358546,57.212208,0.129667,0.621077,0.249256
5,6812000,1.0,1,2016-04-02 01:11:13,40.7091,-74.0182,3650,low,April,5,5,808,1,2,6,0,4,0,0,5,0,14,0,3,7,168.186104,52.737984,167.781396,168.713285,52.957839,54.388567,0.085448,0.734817,0.179735
6,6812002,1.0,2,2016-04-02 01:11:30,40.7366,-73.9826,3150,low,April,0,3,93,1,2,6,0,1,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.085448,0.734817,0.179735
7,6812004,1.0,1,2016-04-02 01:11:31,40.7318,-73.9822,3325,low,April,5,5,431,1,2,6,1,0,0,0,0,1,4,0,0,0,173.575192,50.532590,171.785069,176.535110,53.786579,57.819771,0.032274,0.812530,0.155196
8,6812005,1.0,1,2016-04-02 01:11:34,40.7314,-73.9864,2795,low,April,0,3,383,1,2,6,1,0,1,1,1,1,1,1,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.017890,0.846949,0.135161
9,6812009,1.0,2,2016-04-02 01:11:40,40.8689,-73.9225,2400,low,April,6,1,188,1,2,6,0,0,0,0,0,0,2,0,0,0,114.255182,50.995419,94.576728,136.358520,57.587438,62.163423,0.085448,0.734817,0.179735


In [5]:
dropfeatures = ['listing_id','created']

In [6]:
categorical = ["month"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))

In [7]:
train_x = train_df.drop(dropfeatures+['interest_level'],axis=1)
test_x = test_df.drop(dropfeatures,axis=1)

In [8]:
target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))

In [286]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1500):
    param = {}
    param['objective'] = 'multi:softmax'
    param['eta'] = 0.05
    param['gamma'] = 0
    param['max_depth'] = 4
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.79
    param['colsample_bytree'] = 0.8
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [287]:
cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=100)
for dev_index, val_index in kf.split(range(train_x.shape[0])):
        dev_X, val_X = train_x.iloc[dev_index,:], train_x.iloc[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.07096	test-mlogloss:1.07102
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 50 rounds.
[1]	train-mlogloss:1.04596	test-mlogloss:1.04617
[2]	train-mlogloss:1.02405	test-mlogloss:1.02431
[3]	train-mlogloss:1.00182	test-mlogloss:1.00214
[4]	train-mlogloss:0.981065	test-mlogloss:0.981516
[5]	train-mlogloss:0.961912	test-mlogloss:0.962359
[6]	train-mlogloss:0.944096	test-mlogloss:0.94451
[7]	train-mlogloss:0.927621	test-mlogloss:0.927988
[8]	train-mlogloss:0.912579	test-mlogloss:0.912916
[9]	train-mlogloss:0.898311	test-mlogloss:0.898723
[10]	train-mlogloss:0.884576	test-mlogloss:0.88495
[11]	train-mlogloss:0.871929	test-mlogloss:0.872318
[12]	train-mlogloss:0.859984	test-mlogloss:0.860298
[13]	train-mlogloss:0.849077	test-mlogloss:0.849422
[14]	train-mlogloss:0.839406	test-mlogloss:0.839839
[15]	train-mlogloss:0.829474	test-mlogloss:0.829897
[16]	train-mlogloss:0.821082	test-mlo

ValueError: y_true and y_pred contain different number of classes 3, 2. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [0 1 2]

In [288]:
X_over, y_over = oversample.fit_sample(train_x, train_y)
X_under, y_under = undersample.fit_sample(train_x, train_y)

In [289]:
X_over = pd.DataFrame(X_over,columns=train_x.columns)
X_under = pd.DataFrame(X_under,columns=train_x.columns)

In [290]:
preds1, model = runXGB(X_over, y_over, test_x, num_rounds=2000)
preds2, model = runXGB(X_under, y_under, test_x, num_rounds=2000)

KeyboardInterrupt: 

In [276]:
probs1 = pd.DataFrame(preds1, columns = ['high','medium','low'])
probs2 = pd.DataFrame(preds2, columns = ['high','medium','low'])
probs1['listing_id'] = test_df.listing_id.values
probs2['listing_id'] = test_df.listing_id.values
probsmixed = pd.DataFrame()
probsmixed['high'] = probs1['high'].apply(lambda x: x * .5) + probs2['high'].apply(lambda x: x * .5)
probsmixed['medium'] = probs1['medium'].apply(lambda x: x * .5) + probs2['medium'].apply(lambda x: x * .5)
probsmixed['low'] = probs1['low'].apply(lambda x: x * .5) + probs2['low'].apply(lambda x: x * .5)
probsmixed['listing_id'] = test_df.listing_id.values

In [277]:
probs1.to_csv("xgbover.csv", index=False)
probs2.to_csv("xgbunder.csv", index=False)

In [11]:
preds, model = runXGB(train_x, train_y, test_x, num_rounds=2000)

In [14]:
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb.csv", index=False)

In [237]:
pd.concat([out_df.iloc[:,0:-1].apply(lambda x: x *.4),out_df['listing_id']],axis=1).to_csv("xgb.csv", index=False)

In [234]:
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)
featimport = pd.DataFrame(pd.DataFrame([[i for i,j in importance],[j for i,j in importance]],index = ['feature','importance']).T)
featimport

Unnamed: 0,feature,importance
0,latitude,7667
1,price,7590
2,longitude,6681
3,desc_chars,5217
4,features_high,4546
5,features_med,3754
6,sd_r,3540
7,features_low,3536
8,mean_b,3524
9,mean_r,3488


In [42]:
X_train, X_val, y_train, y_val = model_selection.train_test_split(train_x, train_y, test_size = .2)

In [145]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [291]:
oversample = RandomOverSampler()
X_oversampled, y_oversampled = oversample.fit_sample(stacktrain_x, stacktrain_y)

In [292]:
undersample = RandomUnderSampler()
X_undersampled, y_undersampled = undersample.fit_sample(stacktrain_x, stacktrain_y)

In [293]:
X_undersampled = pd.DataFrame(X_undersampled,columns=stacktrain_x.columns)

In [294]:
predsundersample, modelundersample = runXGB(X_undersampled, y_undersampled, stackval_x)

In [295]:
X_oversampled = pd.DataFrame(X_oversampled,columns=stacktrain_x.columns)

In [296]:
predsoversample, modeloversample = runXGB(X_oversampled, y_oversampled, stackval_x)

In [40]:
from sklearn.metrics import confusion_matrix

In [94]:
# val_index = [ind for ind, val in enumerate(train_df['listing_id']) if val in ValIDs.tolist()]
# stacktrain_index = [ind for ind, val in enumerate(train_df['listing_id']) if val not in ValIDs.tolist()]

In [112]:
# stacktrain_x = train_x.iloc[stacktrain_index]
# stackval_x = train_x.iloc[val_index]
# stacktrain_y = train_y[stacktrain_index]
# stackval_y = train_y[val_index]

In [299]:
predsval, modeltrain = runXGB(stacktrain_x, stacktrain_y, stackval_x)

In [None]:
# ValIDs.values

In [221]:
# over_df = pd.DataFrame(predsoversample)
# over_df.columns = ["high", "medium", "low"]
# over_df["listing_id"] = ValIDs.values
# over_df.to_csv("xgbover.csv", index=False)

In [222]:
# under_df = pd.DataFrame(predsundersample)
# under_df.columns = ["high", "medium", "low"]
# under_df["listing_id"] = ValIDs.values
# under_df.to_csv("xgbunder.csv", index=False)

In [143]:
# val_df = pd.DataFrame(predsval)
# val_df.columns = ["high", "medium", "low"]
# val_df["listing_id"] = ValIDs.values
# val_df.to_csv("valxgbpreds.csv", index=False)

In [303]:
pd.read_csv('tuning log.csv')

Unnamed: 0,﻿TRAIN,TEST,KAGGLE,ETA,GAMMA,Depth,Trees
0,train-mlogloss:0.477492,test-mlogloss:0.589609,0.59838,0.1,0.0,4,1500
1,train-mlogloss:0.459133,test-mlogloss:0.588973,0.59512,0.05,0.0,4,1500
2,train-mlogloss:0.481442,test-mlogloss:0.589257,,0.05,0.5,4,1500
3,train-mlogloss:0.480891,test-mlogloss:0.588903,,0.05,0.75,4,1500
4,train-mlogloss:0.384976,test-mlogloss:0.587733,0.60339,0.05,0.75,6,1500
5,train-mlogloss:0.446731,test-mlogloss:0.587637,,0.05,0.75,5,1500
6,train-mlogloss:0.51061,test-mlogloss:0.591108,0.59758,0.05,0.75,3,2000


In [300]:
pd.DataFrame(confusion_matrix(stackval_y, predsval),columns=['high','medium','low'],index=['high','medium','low'])

Unnamed: 0,high,medium,low
high,191,306,270
medium,104,722,1419
low,33,439,6384


In [297]:
pd.DataFrame(confusion_matrix(stackval_y, predsoversample),columns=['high','medium','low'],index=['high','medium','low'])

Unnamed: 0,high,medium,low
high,409,304,54
medium,525,1120,600
low,304,1470,5082


In [298]:
pd.DataFrame(confusion_matrix(stackval_y, predsundersample),columns=['high','medium','low'],index=['high','medium','low'])

Unnamed: 0,high,medium,low
high,503,209,55
medium,799,855,591
low,670,1430,4756
