## Libraries

In [55]:
!pip install xgboost-0.7-cp36-cp36m-win_amd64.whl

Processing c:\users\user\google drive\data science competitions\predicting poverty - world bank\xgboost-0.7-cp36-cp36m-win_amd64.whl
Installing collected packages: xgboost
Successfully installed xgboost-0.7


In [1]:

%matplotlib inline

import os
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# data directory
#DATA_DIR = os.path.join('C:\\Users\\User\\Google Drive\\Data Science Competitions\\Predicting Poverty - World Bank', 'Data', 'processed')

## Functions

In [12]:
def standardize(df, numeric_only=True):
    
        numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
        df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
        return df

def standardize_Test(df):
        df = df.apply(LabelEncoder().fit_transform) 
    
        numeric = df.select_dtypes(include=['int64', 'float64'])
    
    # subtracy mean and divide by std
        df[numeric.columns] = (numeric - numeric.mean()) / numeric.std()
    
        return df
    

def pre_process_data(df, enforce_cols=None):
    
        print("Input shape:\t{}".format(df.shape))
        df = df.apply(LabelEncoder().fit_transform)  
        enforce_cols = enforce_cols.apply(LabelEncoder().fit_transform)   
        #df = pd.get_dummies(df)
        print("After converting categoricals:\t{}".format(df.shape))
        df = standardize(df)
        enforce_cols = standardize(enforce_cols)
        print("After standardization {}".format(df.shape))
        
        # create dummy variables for categoricals
    
    

    # match test set and training set columns
        if enforce_cols is not None:
            to_drop = np.setdiff1d(df.columns, enforce_cols.columns)
            to_add = np.setdiff1d(enforce_cols.columns, df.columns)
            
            df.drop(to_drop, axis=1, inplace=True)
            df = df.assign(**{c: 0 for c in to_add})
    
            df.fillna(0, inplace=True)
    
            return df

## Imports

In [13]:
a_train_h = pd.read_csv('~Data\A_hhold_train.csv')
b_train_h = pd.read_csv('~\Data\B\B_hhold_train.csv')
c_train_h = pd.read_csv('~\Data\C_hhold_train.csv')

a_train_i = pd.read_csv('~\Data\A_indiv_train.csv')
b_train_i = pd.read_csv('~\Data\B_indiv_train.csv')
c_train_i = pd.read_csv('~\Data\C_indiv_train.csv')

a_test_h = pd.read_csv('~\Data\A_hhold_test.csv')
b_test_h = pd.read_csv('~\Data\B_hhold_test.csv')
c_test_h = pd.read_csv('~\Data\C_hhold_test.csv')

a_test_i = pd.read_csv('~\Data\A_indiv_test.csv')
b_test_i = pd.read_csv('~\Data\B_indiv_test.csv')
c_test_i = pd.read_csv('~\Data\C_indiv_test.csv')

## Pre-process

In [14]:
A_Train = pd.merge(a_train_h, a_train_i,on='id', how='outer')
B_Train = pd.merge(b_train_h, b_train_i,on='id', how='outer')
C_Train = pd.merge(c_train_h, c_train_i,on='id', how='outer')

A_Train = A_Train.set_index('id')
B_Train = B_Train.set_index('id')
C_Train = C_Train.set_index('id')

A_Test = pd.merge(a_test_h, a_test_i,on='id', how='outer')
B_Test = pd.merge(b_test_h, b_test_i,on='id', how='outer')
C_Test = pd.merge(c_test_h, c_test_i,on='id', how='outer')

A_Test = A_Test.set_index('id')
B_Test = B_Test.set_index('id')
C_Test = C_Test.set_index('id')

A_Test = standardize_Test(A_Test)
B_Test = standardize_Test(B_Test)
C_Test = standardize_Test(C_Test)

print("Country A")
AX_Train = pre_process_data(A_Train.drop('poor_x', axis=1), enforce_cols = A_Test)
 
print("\nCountry B")
BX_Train = pre_process_data(B_Train.drop('poor_x', axis=1), enforce_cols = B_Test) 

print("\nCountry C")
CX_Train = pre_process_data(C_Train.drop('poor_x', axis=1), enforce_cols = C_Test)

Country A
Input shape:	(37560, 387)
After converting categoricals:	(37560, 387)
After standardization (37560, 387)

Country B
Input shape:	(20252, 667)
After converting categoricals:	(20252, 667)
After standardization (20252, 667)

Country C
Input shape:	(29913, 206)
After converting categoricals:	(29913, 206)
After standardization (29913, 206)


In [15]:
A_Train = A_Train.apply(LabelEncoder().fit_transform)
B_Train = B_Train.apply(LabelEncoder().fit_transform)
C_Train = C_Train.apply(LabelEncoder().fit_transform)

A_Test = A_Test.reset_index().drop_duplicates(subset='id', keep='last').set_index('id')
B_Test = B_Test.reset_index().drop_duplicates(subset='id', keep='last').set_index('id')
C_Test = C_Test.reset_index().drop_duplicates(subset='id', keep='last').set_index('id')

ay_train = A_Train['poor_x']
by_train = B_Train['poor_x']
cy_train = C_Train['poor_x']

In [16]:
AX_Train = AX_Train.drop(['country_x', 'country_y', 'nKoaotpH'], axis=1)
BXX_Train = BX_Train.drop(['HFgaiygl', 'VMvwrYds', 'country_x', 'country_y'], axis=1)
CX_Train = CX_Train.drop(['country_x', 'country_y'], axis=1)

A_Test = A_Test.drop(['country_x', 'country_y', 'nKoaotpH'], axis=1)
B_Test = B_Test.drop(['HFgaiygl', 'VMvwrYds', 'country_x', 'country_y'], axis=1)
C_Test = C_Test.drop(['country_x', 'country_y'], axis=1)

## Model

In [18]:
from numpy import sort
def modelfit(alg, dtrain, target, dtest, useTrainCV=True, cv_folds=10, early_stopping_rounds=3):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        dtraint = xgb.DMatrix(dtrain, label=target)
        
        cvresult = xgb.cv(xgb_param, dtraint, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
        metrics='logloss', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

         #Fit the algorithm on the data
        alg.fit(dtrain, target, eval_metric='logloss')
        
        dtrain_predictions = alg.predict(dtest)
        dtrain_predprob = alg.predict_proba(dtest)
        
        log_lss = log_loss(dtrain_predictions, dtrain_predprob[:,1])
        
        #feat_imp =alg.feature_importances_ 
        feat_imp = pd.Series(alg._Booster.get_score()).sort_values(ascending=False)

    
        #feat_imp.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(10, 100))
        #plt.ylabel('Feature Importance Score')
        return feat_imp, dtrain_predprob, log_lss

In [26]:
import warnings 
warnings.filterwarnings('ignore')
from sklearn.grid_search import GridSearchCV

param_test1 = {
 'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
 'n_estimators': list(range(100,500,100)), 
 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9]
 #'min_child_weight' : list(range(30,40,1))
# 'colsample_bytree':[i/100.0 for i in range(2,22)]
 #'reg_lambda':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
 #'gamma':[i/100.0 for i in range(11,11)]
 #'subsample':[1, 2, 3, 4, 5]
 #'max_delta_step' : [0, 1,2, 3, 4, 5, 6, 7, 8, 9, 10]
  #'reg_alpha': [i/100.0 for i in range(1,10)]
}
gsearch1 = GridSearchCV(estimator = XGBClassifier(nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=10, verbose=10)
#print(gsearch1)
#n_estimators=600, reg_alpha=0.2, gamma=1.9, colsample_bytree=0.3, min_child_weight=27,


In [None]:
gsearch1.fit(AX_Train,ay_train)

Fitting 10 folds for each of 360 candidates, totalling 3600 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   32.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   51.5s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  3.0min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  5.2min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  6.8min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  8.6min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 11.8min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed: 17.4min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed: 20.1min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 26.6min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 35.0min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 40.6min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 51.9min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed: 61.4min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 76.0min
[Parallel(n_jobs=4)]: Do

In [None]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [139]:
## import warnings 
warnings.filterwarnings('ignore')
from sklearn.grid_search import GridSearchCV

param_test2 = {
#'learning_rate': [i/100 for i in range(0,10)],
# 'n_estimators': list(range(1000,1500,100)), 
 #'max_depth': list(range(1,9,1)),
 'min_child_weight' : list(range(1,10,1)),
 #'colsample_bylevel':[0.5, 1]
 #'colsample_bytree':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
 'gamma': [i/100.0 for i in range(10,20)],
 #'subsample':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
 #'max_delta_step': [0, 1, 2, 4, 5, 6, 7, 8, 9, 10],
 'reg_alpha': [i/1000.0 for i in range(20,100, 10)]  
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(gamma=0.1, reg_alpha=0.02, nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=10, verbose=10)
#print(gsearch1)


In [None]:
gsearch2.fit(BXX_Train,by_train)

In [None]:
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

In [208]:
#alg1 = XGBClassifier(n_estimators=600, max_depth=3, learning_rate=0.1, reg_alpha = 0.2, colsample_bytree =0.2, gamma=1.9, min_child_weight=27)
#alg23 = XGBClassifier(n_estimators=100, reg_alpha=0.08, min_child_weight=18, gamma=0.1, max_delta_step=2, colsample_bytree = 0.4)
#alg24 = XGBClassifier()
#alg23 = XGBClassifier(gamma=0.1, reg_alpha=0.02, min_child_weight=15)

In [209]:
#pd.set_option('display.max_rows', None)
feat_a, p_a, log_loss_a = modelfit(alg1, AX_Train, ay_train, A_Test)

In [130]:
feat_b, p_b, log_loss_b = modelfit(alg23, BXX_Train, by_train, B_Test)

In [15]:
feat_c, p_c, log_loss_c = modelfit(alg24, CX_Train, cy_train, C_Test)

In [12]:
import statistics
 
print(statistics.mean([log_loss_a, log_loss_b, log_loss_c]))

0.0981484258898


In [177]:
def make_country_sub(preds, test_feat, country):
    # make sure we code the country correctly
    country_codes = ['A', 'B', 'C']
    
    # get just the poor probabilities
    country_sub = pd.DataFrame(data=preds[:, 1],  # proba p=1
                               columns=['poor'], 
                               index=test_feat.index)

    
    # add the country code for joining later
    country_sub["country"] = country
    return country_sub[["country", "poor"]]

In [17]:
a_sub = make_country_sub(p_a, A_Test, 'A')
b_sub = make_country_sub(p_b, B_Test, 'B')
c_sub = make_country_sub(p_c, C_Test, 'C')

In [211]:
a_sub.head()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
418,A,0.9628
41249,A,0.000832
16205,A,0.795557
97501,A,0.00057
67756,A,0.978803


In [132]:
b_sub.head()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
9135,B,0.022004
117,B,0.029178
29085,B,0.019645
55442,B,0.031494
29281,B,0.013645


In [None]:
submission = pd.concat([a_sub, b_sub, c_sub])

In [None]:
submission.to_csv("submission.csv")