#### Dataset from Kaggle Project - Airbnb
#### https://www.kaggle.com/c/airbnb-recruiting-new-user-bookings/data

In [1]:
# Name: Jianlei(John) Sun

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from datetime import datetime, date

from sklearn.model_selection import train_test_split

% matplotlib inline
plt.style.use('ggplot')

# Data Exploration

In [2]:
# read in the datasets
train_users = pd.read_csv('./data_HW5/train_users_2.csv.zip')
age_gender_bkts = pd.read_csv('./data_HW5/age_gender_bkts.csv.zip')
countries = pd.read_csv('./data_HW5/countries.csv.zip')
sessions = pd.read_csv('./data_HW5/sessions.csv.zip')
test_users = pd.read_csv('./data_HW5/test_users.csv.zip')
sample_submission_NDF = pd.read_csv('./data_HW5/sample_submission_NDF.csv.zip')

In [25]:
train_users.head(2)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF


## Feature Engineering

In [4]:
Y_train = train_users['country_destination'].values
train_data = train_users.drop(['country_destination'], axis = 1)

data = pd.concat((train_data, test_users), axis = 0, ignore_index = True)
data_all = data.fillna(-1)
data_all = data_all.replace('-unknown-', -1)
data_all = data_all.drop(['date_first_booking'], axis = 1)

# check number of null valus in each record
data_all['n_null'] = np.array([sum(r == -1) for r in data_all.values]) 

# 1) data_account_created 
dac = np.vstack(data_all['date_account_created'].astype(str)
                .apply(lambda x: list(map(int, x.split('-')))).values)
data_all['dac_year'] = dac[:,0]
data_all['dac_month'] = dac[:,1]
data_all['dac_day'] = dac[:,2]

dac_dates = [datetime(x[0],x[1],x[2]) for x in dac]
data_all['dac_wn'] = np.array([d.isocalendar()[1] for d in dac_dates])
data_all['dac_w'] = np.array([d.weekday() for d in dac_dates])
t = pd.get_dummies(data_all.dac_w, prefix='dac_w')
data_all = pd.concat((data_all, t), axis=1)
data_all = data_all.drop(['date_account_created', 'dac_w'], axis=1)

# 2) timestamp_first_active
tfa = np.vstack(data_all['timestamp_first_active'].astype(str)
                .apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
data_all['tfa_year'] = tfa[:,0]
data_all['tfa_month'] = tfa[:,1]
data_all['tfa_day'] = tfa[:,2]
data_all['tfa_hour'] = tfa[:,3]

tfa_dates = [datetime(x[0],x[1],x[2],x[3],x[4],x[5]) for x in tfa]
data_all['tfa_wn'] = np.array([d.isocalendar()[1] for d in tfa_dates])
data_all['tfa_w'] = np.array([d.weekday() for d in tfa_dates])
t = pd.get_dummies(data_all.tfa_w, prefix='tfa_w')
data_all = pd.concat((data_all, t), axis=1)
data_all = data_all.drop(['timestamp_first_active', 'tfa_w'], axis=1)

# 3) timespans between 'dac' and 'tfa'
data_all['dac_tfa_secs'] = np.array([np.log(1+abs((dac_dates[i]-tfa_dates[i]).total_seconds())) 
                                  for i in range(len(dac_dates))])
data_all['sig_dac_tfa'] = np.array([np.sign((dac_dates[i]-tfa_dates[i]).total_seconds()) 
                                 for i in range(len(dac_dates))])

# 4) seaons for 'dac' and 'tfa'
Y = 2000 # dummy year
seasons = [(0, (date(Y,  1,  1),  date(Y,  3, 20))),  #'winter'
           (1, (date(Y,  3, 21),  date(Y,  6, 20))),  #'spring'
           (2, (date(Y,  6, 21),  date(Y,  9, 22))),  #'summer'
           (3, (date(Y,  9, 23),  date(Y, 12, 20))),  #'autumn'
           (0, (date(Y, 12, 21),  date(Y, 12, 31)))]  #'winter'
def get_season(dt):
    dt = dt.date()
    dt = dt.replace(year=Y)
    return next(season for season, (start, end) in seasons
                if start <= dt <= end)
data_all['season_dac'] = np.array([get_season(dt) for dt in dac_dates])
data_all['season_tfa'] = np.array([get_season(dt) for dt in tfa_dates])

# 5) group 'age'

# OK for'ages' in 14 < age < 99, otherwise group 'ages'
av = data_all['age'].values
av = np.where(np.logical_and(av<2000, av>1900), 2014-av, av) # birthdays 
av = np.where(np.logical_and(av<14, av>0), 4, av)
av = np.where(np.logical_and(av<2016, av>2010), 9, av) # 9 for current years
av = np.where(av > 99, 110, av)  # 110 for age values above 99
data_all['age'] = av

# create ranges for 'age' and One-hot encoding 
interv =  [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]
def get_interv_value(age):
    iv = 20
    for i in range(len(interv)):
        if age < interv[i]:
            iv = i 
            break
    return iv
data_all['age_interv'] = data_all.age.apply(lambda x: get_interv_value(x))
t = pd.get_dummies(data_all.age_interv, prefix='age_interv')
data_all = pd.concat((data_all, t), axis=1)

data_all = data_all.drop(['age_interv'], axis=1)

# 6) One-hot-encoding other categorical features
ohe_features = ['gender', 'signup_method', 'signup_flow', 'language', 
             'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 
             'signup_app', 'first_device_type', 'first_browser']

for f in ohe_features:
    data_all_dummies = pd.get_dummies(data_all[f], prefix=f)
    data_all = data_all.drop([f], axis=1)
    data_all = pd.concat((data_all, data_all_dummies), axis=1)

In [5]:
# 7) session
sessions['id'] = sessions['user_id']
sessions = sessions.drop(['user_id'], axis = 1)

# fill missing values with 'NAN'
sessions['action'] = sessions.action.fillna('NAN')
sessions['action_type'] = sessions.action_type.fillna('NAN')
sessions['action_detail'] = sessions.action_detail.fillna('NAN')
sessions['device_type'] = sessions.device_type.fillna('NAN')

# assign "OTHER" to the low frequency "action" iterms
actFreq = 100 # JIANLEI_DEFINED: frequency threhold 
actDict = dict(zip(*np.unique(sessions.action, return_counts=True)))
sessions['action'] = sessions.action.apply(lambda x: 'OTHER' if actDict[x] < actFreq else x)

f_act = sessions.action.value_counts().argsort()
f_act_detail = sessions.action_detail.value_counts().argsort()
f_act_type = sessions.action_type.value_counts().argsort()
f_dev_type = sessions.device_type.value_counts().argsort()

# group session by 'id'
dgr_sess = sessions.groupby(['id'])
# Loop on dgr_sess to create all the features
samples = []
cont = 0
ln = len(dgr_sess)
for g in dgr_sess:
    if cont%10000 == 0:
        print("%s from %s" %(cont, ln))
    gr = g[1]
    l = []
    
    # add the 'id'
    l.append(g[0])
    
    # add the number of values
    l.append(len(gr))
    
    sev = gr.secs_elapsed.fillna(0).values 
    
    # add 'action' features
    c_act = [0] * len(f_act)
    for i,v in enumerate(gr.action.values):
        c_act[f_act[v]] += 1
    _, c_act_uqc = np.unique(gr.action.values, return_counts=True)
    c_act += [len(c_act_uqc), np.mean(c_act_uqc), np.std(c_act_uqc)]
    l = l + c_act
    
    # add 'action_detail' features
    c_act_detail = [0] * len(f_act_detail)
    for i,v in enumerate(gr.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1 
    _, c_act_det_uqc = np.unique(gr.action_detail.values, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc), np.std(c_act_det_uqc)]
    l = l + c_act_detail
    
    # add 'action_type' features
    l_act_type = [0] * len(f_act_type)
    c_act_type = [0] * len(f_act_type)
    for i,v in enumerate(gr.action_type.values):
        l_act_type[f_act_type[v]] += sev[i]   
        c_act_type[f_act_type[v]] += 1  
    l_act_type = np.log(1 + np.array(l_act_type)).tolist()
    _, c_act_type_uqc = np.unique(gr.action_type.values, return_counts=True)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc), np.std(c_act_type_uqc)]
    l = l + c_act_type + l_act_type    
    
    # add 'device_type' features
    c_dev_type  = [0] * len(f_dev_type)
    for i,v in enumerate(gr.device_type .values):
        c_dev_type[f_dev_type[v]] += 1 
    _, c_dev_type_uqc = np.unique(gr.device_type.values, return_counts=True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc), np.std(c_dev_type_uqc)]        
    l = l + c_dev_type    
    
    # add 'secs_elapsed' features        
    l_secs = [0] * 5 # five simple statistics
    l_log = [0] * 15 # group the values into 15 intervals
    if len(sev) > 0:
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev)) 
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1])
        
        log_sev = np.log(1 + sev).astype(int)
        l_log = np.bincount(log_sev, minlength=15).tolist()                      
    l = l + l_secs + l_log
    
    # add one sample's all features
    samples.append(l)
    cont += 1

# Create a dataframe with the computed features    
col_names = []    
for i in range(len(samples[0])-1):
    col_names.append('c_' + str(i)) 
    
samples = np.array(samples)
samp_ar = samples[:, 1:].astype(np.float16)
samp_id = samples[:, 0]  
      
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id


0 from 135483
10000 from 135483
20000 from 135483
30000 from 135483
40000 from 135483
50000 from 135483
60000 from 135483
70000 from 135483
80000 from 135483
90000 from 135483
100000 from 135483
110000 from 135483
120000 from 135483
130000 from 135483


In [6]:
# 8) comine 'train, test' with 'sessions'
data_all = pd.merge(data_all, df_agg_sess, how='left')
data_all = data_all.drop(['id'], axis=1)
data_all = data_all.fillna(-2)  # Missing features for 'samples' without sesssion data.

data_all['all_null'] = np.array([sum(r<0) for r in data_all.values]) # add all null types

In [7]:
# 9) split the "data_all" back to "train_data" and "test_data"
X_train = data_all.values[:train_users.shape[0]]
X_test = data_all.values[train_users.shape[0]:]
LE = LabelEncoder()
y = LE.fit_transform(Y_train)  

# Modeling 

In [8]:
# split 'X_train' data into sub_training_data and validation_data

np.random.seed(2016)
x_train, x_val, y_train, y_val = train_test_split(X_train, y, test_size = 0.2)

In [9]:
a = pd.DataFrame(y_train, columns=['Y'])
a.Y.value_counts()

7     99613
10    49952
11     8083
4      3995
6      2241
5      1842
3      1804
1      1144
2       859
8       625
0       423
9       179
Name: Y, dtype: int64

In [10]:
# 1) build the baseline prediction model - "Random Forest Tree Model"
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

RF = RandomForestClassifier(random_state = 2016)
RF = RF.fit(x_train, y_train)
y_pred = RF.predict(x_val)

print metrics.classification_report(y_val, y_pred)

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       116
          1       0.00      0.00      0.00       284
          2       0.00      0.00      0.00       202
          3       0.02      0.00      0.00       445
          4       0.04      0.01      0.01      1028
          5       0.02      0.00      0.00       482
          6       0.05      0.01      0.01       594
          7       0.68      0.83      0.75     24930
          8       0.00      0.00      0.00       137
          9       0.00      0.00      0.00        38
         10       0.47      0.44      0.45     12424
         11       0.06      0.01      0.01      2011

avg / total       0.54      0.61      0.57     42691



  'precision', 'predicted', average, warn_for)


In [11]:
# 2) optimize the hyperparameters of the baseline - "Random Forest Tree Model"

# define a function call to "GridSearchCV"; to search for the best model parameters
from sklearn.model_selection import GridSearchCV
np.random.seed(2016)

model = GridSearchCV(estimator  = RandomForestClassifier(),
                     param_grid = {'n_estimators':[11, 21, 41, 101], 'criterion':['gini', 'entropy']},
                     verbose = 10, n_jobs = -1, iid  = True, refit = False)

model.fit(x_train, y_train)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:", model.best_params_)
print("Scores:", model.grid_scores_)    


Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] n_estimators=11, criterion=gini .................................
[CV] n_estimators=11, criterion=gini .................................
[CV] n_estimators=11, criterion=gini .................................
[CV] n_estimators=21, criterion=gini .................................
[CV] n_estimators=21, criterion=gini .................................
[CV] n_estimators=21, criterion=gini .................................
[CV] n_estimators=41, criterion=gini .................................
[CV] n_estimators=41, criterion=gini .................................
[CV] .. n_estimators=11, criterion=gini, score=0.616408, total=  27.4s
[CV] n_estimators=41, criterion=gini .................................
[CV] .. n_estimators=11, criterion=gini, score=0.614259, total=  26.8s


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   34.0s


[CV] n_estimators=101, criterion=gini ................................
[CV] .. n_estimators=11, criterion=gini, score=0.610602, total=  30.8s
[CV] n_estimators=101, criterion=gini ................................
[CV] .. n_estimators=21, criterion=gini, score=0.626034, total=  51.1s
[CV] n_estimators=101, criterion=gini ................................
[CV] .. n_estimators=21, criterion=gini, score=0.624607, total=  50.8s
[CV] .. n_estimators=21, criterion=gini, score=0.623269, total=  50.4s
[CV] n_estimators=11, criterion=entropy ..............................
[CV] n_estimators=11, criterion=entropy ..............................
[CV]  n_estimators=11, criterion=entropy, score=0.613544, total=  24.0s
[CV] n_estimators=11, criterion=entropy ..............................
[CV]  n_estimators=11, criterion=entropy, score=0.612888, total=  24.6s
[CV] n_estimators=21, criterion=entropy ..............................
[CV] .. n_estimators=41, criterion=gini, score=0.630738, total= 1.4min


[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.8min


[CV] n_estimators=21, criterion=entropy ..............................
[CV] .. n_estimators=41, criterion=gini, score=0.631023, total= 1.5min
[CV] n_estimators=21, criterion=entropy ..............................
[CV] .. n_estimators=41, criterion=gini, score=0.630473, total= 1.4min
[CV] n_estimators=41, criterion=entropy ..............................
[CV]  n_estimators=11, criterion=entropy, score=0.611533, total=  25.6s


[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:  2.0min remaining:  2.0min


[CV] n_estimators=41, criterion=entropy ..............................
[CV]  n_estimators=21, criterion=entropy, score=0.625015, total=  43.2s
[CV] n_estimators=41, criterion=entropy ..............................
[CV]  n_estimators=21, criterion=entropy, score=0.623816, total=  43.4s
[CV] n_estimators=101, criterion=entropy .............................
[CV]  n_estimators=21, criterion=entropy, score=0.624288, total=  44.6s


[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:  2.6min remaining:  1.6min


[CV] n_estimators=101, criterion=entropy .............................
[CV]  n_estimators=41, criterion=entropy, score=0.631199, total= 1.3min
[CV] n_estimators=101, criterion=entropy .............................
[CV]  n_estimators=41, criterion=entropy, score=0.631652, total= 1.4min
[CV]  n_estimators=41, criterion=entropy, score=0.630315, total= 1.3min


[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:  3.8min remaining:  1.3min


[CV] . n_estimators=101, criterion=gini, score=0.634695, total= 3.2min
[CV] . n_estimators=101, criterion=gini, score=0.635394, total= 3.2min
[CV] . n_estimators=101, criterion=gini, score=0.634145, total= 3.0min


[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:  4.2min remaining:   36.2s


[CV]  n_estimators=101, criterion=entropy, score=0.634080, total= 2.2min
[CV]  n_estimators=101, criterion=entropy, score=0.634937, total= 2.2min
[CV]  n_estimators=101, criterion=entropy, score=0.633477, total= 1.7min


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.3min finished


Best score: 0.635
('Best parameters set:', {'n_estimators': 101, 'criterion': 'gini'})
('Scores:', [mean: 0.61376, std: 0.00240, params: {'n_estimators': 11, 'criterion': 'gini'}, mean: 0.62464, std: 0.00113, params: {'n_estimators': 21, 'criterion': 'gini'}, mean: 0.63074, std: 0.00022, params: {'n_estimators': 41, 'criterion': 'gini'}, mean: 0.63474, std: 0.00051, params: {'n_estimators': 101, 'criterion': 'gini'}, mean: 0.61266, std: 0.00084, params: {'n_estimators': 11, 'criterion': 'entropy'}, mean: 0.62437, std: 0.00049, params: {'n_estimators': 21, 'criterion': 'entropy'}, mean: 0.63106, std: 0.00056, params: {'n_estimators': 41, 'criterion': 'entropy'}, mean: 0.63416, std: 0.00060, params: {'n_estimators': 101, 'criterion': 'entropy'}])




In [12]:
# 3) optimize the hyperparameters of the another model - "Extra Tree Model"
from sklearn.ensemble import ExtraTreesClassifier

np.random.seed(2016)
model = GridSearchCV(estimator  = ExtraTreesClassifier(),
                     param_grid = {'n_estimators':[21, 41, 101], 'criterion':['gini', 'entropy']},
                     verbose = 10, n_jobs = -1, iid  = True, refit = False)

model.fit(x_train, y_train)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:", model.best_params_)
print("Scores:", model.grid_scores_)  


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=21, criterion=gini .................................
[CV] n_estimators=21, criterion=gini .................................
[CV] n_estimators=21, criterion=gini .................................
[CV] n_estimators=41, criterion=gini .................................
[CV] n_estimators=41, criterion=gini .................................
[CV] n_estimators=41, criterion=gini .................................
[CV] n_estimators=101, criterion=gini ................................
[CV] n_estimators=101, criterion=gini ................................
[CV] .. n_estimators=21, criterion=gini, score=0.615968, total=  52.1s
[CV] n_estimators=101, criterion=gini ................................
[CV] .. n_estimators=21, criterion=gini, score=0.615208, total=  55.2s


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.1min


[CV] n_estimators=21, criterion=entropy ..............................
[CV] .. n_estimators=21, criterion=gini, score=0.615047, total=  56.2s
[CV] n_estimators=21, criterion=entropy ..............................
[CV]  n_estimators=21, criterion=entropy, score=0.615881, total=  57.2s
[CV] n_estimators=21, criterion=entropy ..............................
[CV] .. n_estimators=41, criterion=gini, score=0.621045, total= 1.8min


[Parallel(n_jobs=-1)]: Done   5 out of  18 | elapsed:  2.1min remaining:  5.5min


[CV]  n_estimators=21, criterion=entropy, score=0.614610, total=  57.2s
[CV] n_estimators=41, criterion=entropy ..............................
[CV] n_estimators=41, criterion=entropy ..............................
[CV] .. n_estimators=41, criterion=gini, score=0.619509, total= 1.9min
[CV] .. n_estimators=41, criterion=gini, score=0.621269, total= 1.9min


[Parallel(n_jobs=-1)]: Done   7 out of  18 | elapsed:  2.2min remaining:  3.5min


[CV] n_estimators=41, criterion=entropy ..............................
[CV] n_estimators=101, criterion=entropy .............................
[CV]  n_estimators=21, criterion=entropy, score=0.611779, total=  57.6s


[Parallel(n_jobs=-1)]: Done   9 out of  18 | elapsed:  3.1min remaining:  3.1min


[CV] n_estimators=101, criterion=entropy .............................
[CV]  n_estimators=41, criterion=entropy, score=0.619499, total= 1.8min
[CV] n_estimators=101, criterion=entropy .............................
[CV]  n_estimators=41, criterion=entropy, score=0.620109, total= 1.8min


[Parallel(n_jobs=-1)]: Done  11 out of  18 | elapsed:  4.1min remaining:  2.6min


[CV]  n_estimators=41, criterion=entropy, score=0.618508, total= 1.8min
[CV] . n_estimators=101, criterion=gini, score=0.625173, total= 4.5min


[Parallel(n_jobs=-1)]: Done  13 out of  18 | elapsed:  5.0min remaining:  1.9min


[CV] . n_estimators=101, criterion=gini, score=0.624308, total= 4.6min
[CV] . n_estimators=101, criterion=gini, score=0.624693, total= 4.3min


[Parallel(n_jobs=-1)]: Done  15 out of  18 | elapsed:  5.5min remaining:  1.1min


[CV]  n_estimators=101, criterion=entropy, score=0.623188, total= 3.4min
[CV]  n_estimators=101, criterion=entropy, score=0.623641, total= 2.9min
[CV]  n_estimators=101, criterion=entropy, score=0.621987, total= 2.4min


[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  6.6min finished


Best score: 0.625
('Best parameters set:', {'n_estimators': 101, 'criterion': 'gini'})
('Scores:', [mean: 0.61541, std: 0.00040, params: {'n_estimators': 21, 'criterion': 'gini'}, mean: 0.62061, std: 0.00078, params: {'n_estimators': 41, 'criterion': 'gini'}, mean: 0.62472, std: 0.00035, params: {'n_estimators': 101, 'criterion': 'gini'}, mean: 0.61409, std: 0.00171, params: {'n_estimators': 21, 'criterion': 'entropy'}, mean: 0.61937, std: 0.00066, params: {'n_estimators': 41, 'criterion': 'entropy'}, mean: 0.62294, std: 0.00070, params: {'n_estimators': 101, 'criterion': 'entropy'}])




In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

RF = RandomForestClassifier(n_estimators=101, criterion='gini')
ET = ExtraTreesClassifier(n_estimators=101, criterion='gini')

finalModel = VotingClassifier(estimators=[('Random Forest',RF), ('Extra Tree',ET)], voting='hard')

In [14]:
for clf, label in zip([RF, ET, finalModel], ['Random Forest', 'Extra Eree', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y, cv=5)
    print "Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label)

Accuracy: 0.29 (+/- 0.15) [Random Forest]
Accuracy: 0.37 (+/- 0.12) [Extra Eree]
Accuracy: 0.30 (+/- 0.14) [Ensemble]


# Submission

In [15]:
finalModel.fit(X_train, y)

VotingClassifier(estimators=[('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
   ...imators=101, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False))],
         n_jobs=1, voting='hard', weights=None)

In [16]:
# submmit the results

t = pd.DataFrame(finalModel.predict(X_test), columns=['country'])
predict_data = pd.concat([sample_submission_NDF['id'], t], axis =1)
predict_data.to_csv('./data_HW5/Submmission.csv', index=False)