In [1]:
from datetime import datetime
import timeit

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Load the files
train_A = pd.read_csv('data/A_hhold_train.csv', index_col='id')
test_A = pd.read_csv('data/A_hhold_test.csv', index_col='id')

train_B = pd.read_csv('data/B_hhold_train.csv', index_col='id')
test_B = pd.read_csv('data/B_hhold_test.csv', index_col='id')

train_C = pd.read_csv('data/C_hhold_train.csv', index_col='id')
test_C = pd.read_csv('data/C_hhold_test.csv', index_col='id')

# Separate the labels from the values
labels_A = train_A.poor
train_A = train_A.drop(['poor','country'], axis='columns')
test_A = test_A.drop(['country'], axis='columns')

labels_B = train_B.poor
train_B = train_B.drop(['poor','country'], axis='columns')
test_B = test_B.drop(['country'], axis='columns')

labels_C = train_C.poor
train_C = train_C.drop(['poor','country'], axis='columns')
test_C = test_C.drop(['country'], axis='columns')

***
# Function to prepare the data
I will refrain from using a pipeline since the data sets are different for each country.

### <font color=gray>Function to combine all text rows
I will employ a bag-of-words technique. To do this I will first combine all object columns to one column called 'text', and use the sklearn CountVectorizer to get dummies.

In [2]:
def combine_text_columns(data_frame):
    """ converts all text in each row of data_frame to single vector """
   
    # Join all text items in a row that have a space in between
    return data_frame.apply(lambda x: " ".join(x), axis=1)

In [3]:
def prepare_data(train, test):
    """Prepare data ready to fit the model"""
    
    # Split each dataframes by numeric and object
    train_num = train.loc[:,train.dtypes != 'object']
    test_num = test.loc[:,test.dtypes != 'object']
    train_obj = train.loc[:,train.dtypes == 'object']
    test_obj = test.loc[:,test.dtypes == 'object']
    
    # Note the numeric column names for use later to prevent a colision on joining
    num_cols = train_num.columns
    
    ########################
    # Impute the numeric data using the median
    impute = Imputer(strategy='median')
    train_num = impute.fit_transform(train_num)
    test_num = impute.transform(test_num)
    
    # Scale the numberic data
    scale = StandardScaler()
    train_num = impute.fit_transform(train_num)
    test_num = impute.transform(test_num)
    
    ########################
    # Create text column of combine text
    train_obj['text'] = combine_text_columns(train_obj)
    test_obj['text'] = combine_text_columns(test_obj)
    
    # Countvectorizer on 'text'
    vectorizer = CountVectorizer(token_pattern='[A-Za-z0-9]+(?=\\s+)')
    train_obj = vectorizer.fit_transform(train_obj['text'])
    test_obj = vectorizer.transform(test_obj['text'])
    
    # Return the datasets to dataframes
    train_num = pd.DataFrame(train_num, index=train.index, columns=num_cols)
    test_num = pd.DataFrame(test_num, index=test.index, columns=num_cols)
    train_obj = pd.DataFrame(train_obj.todense(), index=train.index)
    test_obj = pd.DataFrame(test_obj.todense(), index=test.index)
    
    # Join the to dataframes
    train = train_num.join(train_obj)
    test = test_num.join(test_obj)
    
    return train, test

### <font color=gray>Apply the prepare_data function to the datasets of each country

In [4]:
# Transform each country data set
train_a_trans, test_a_trans = prepare_data(train_A, test_A)
train_b_trans, test_b_trans = prepare_data(train_B, test_B)
train_c_trans, test_c_trans = prepare_data(train_C, test_C)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


***
# GridSearchCV Using RandomForest
### <font color=gray>Define a function to make searching easier

In [5]:
def grid_clf_log_loss(X, y, param, verbose=2):
    """grid search using RandomForestClassifier"""
    
    #print('Process start time:',datetime.now(),'\n')
    tic = timeit.default_timer()

    clf = RandomForestClassifier(random_state=2049, n_jobs=-1, class_weight= "balanced")
    gridCV = GridSearchCV(clf, param, cv=3, verbose=verbose,  scoring='neg_log_loss', 
                          return_train_score=True, n_jobs=1)
    gridCV.fit(X, y)
    
    
    # Make and print results dataframe
    cv_result = pd.DataFrame(gridCV.cv_results_)
    param_columns = ['param_'+key for key in list(param.keys())]
    cv_columns = ['mean_test_score', 'std_test_score', 'rank_test_score'] + param_columns

    toc = timeit.default_timer()
    print('Total process time (min):',(toc-tic)/60)

    return clf, cv_result[cv_columns].sort_values('rank_test_score')

### <font color=gray>GridSearch on country A

In [6]:
param = {'n_estimators':[100],
         'max_features':['auto', None],
         'min_samples_leaf':[1,2,3],
         'max_depth':range(20,100,20)}

model, cv_results = grid_clf_log_loss(train_a_trans, labels_A, param, verbose=0)
cv_results.head()

Total process time (min): 8.535966905249127


Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_max_depth,param_n_estimators,param_min_samples_leaf,param_max_features
11,-0.349245,0.008646,1,40,100,3,
17,-0.349245,0.008646,1,60,100,3,
23,-0.349245,0.008646,1,80,100,3,
5,-0.349829,0.008579,4,20,100,3,
10,-0.349923,0.009137,5,40,100,2,


### <font color=gray>GridSearch on country B

In [7]:
param = {'n_estimators':[100],
         'max_features':['auto', None],
         'min_samples_leaf':[2,3,4],
         'max_depth':range(20,100,20)}

model, cv_results = grid_clf_log_loss(train_b_trans, labels_B, param, verbose=0)
cv_results.head()

Total process time (min): 3.385880766933163


Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_max_depth,param_n_estimators,param_min_samples_leaf,param_max_features
18,-0.231556,0.005738,1,80,100,2,auto
6,-0.231556,0.005738,1,40,100,2,auto
12,-0.231556,0.005738,1,60,100,2,auto
0,-0.23167,0.004348,4,20,100,2,auto
16,-0.235748,0.01131,5,60,100,3,


### <font color=gray>GridSearch on country C

In [8]:
param = {'n_estimators':[100],
         'max_features':['auto', None],
         'min_samples_leaf':[1,2,3],
         'max_depth':range(20,100,20)}

model, cv_results = grid_clf_log_loss(train_c_trans, labels_C, param, verbose=0)
cv_results.head()

Total process time (min): 2.5792096895340366


Unnamed: 0,mean_test_score,std_test_score,rank_test_score,param_max_depth,param_n_estimators,param_min_samples_leaf,param_max_features
21,-0.046905,0.022878,1,80,100,1,
3,-0.046905,0.022878,1,20,100,1,
15,-0.046905,0.022878,1,60,100,1,
9,-0.046905,0.022878,1,40,100,1,
11,-0.04854,0.013988,5,40,100,3,


### <font color=gray>Estimated mean log loss score across all 3 countries
This is an estimate of score received from DriveData.org using these three optimal models.

In [9]:
score_A = -0.349245
score_B = -0.231556
score_C = -0.046905
n_observations_A = test_A.shape[0]
n_observations_B = test_B.shape[0]
n_observations_C = test_C.shape[0]
total_observations = n_observations_A + n_observations_B + n_observations_C

estimated_score = -(score_A*n_observations_A + score_B*n_observations_B + score_C*n_observations_C)/total_observations

print('Estimated log-loss score on the test data: {0:.5f}'.format(estimated_score))

Estimated log-loss score on the test data: 0.21877


***
# Prepare final submission

In [10]:
def predict(train, test, labels, n_estimators, max_features, max_depth, min_samples_leaf, country):
    clf = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features,
                                 max_depth=max_depth, min_samples_leaf=min_samples_leaf, 
                                 random_state=2049, n_jobs=-1, class_weight= "balanced")
    clf.fit(train,labels)
    prediction = clf.predict(test)
    probability = clf.predict_proba(test)
    neg_probablity, pos_probability = zip(*probability)
    
    # Create dataframe with correct index and a broadcast country column
    result_dataframe = pd.DataFrame(columns=['country'],
                                   index=test.index,
                                   data=country)
    # Add the results column
    result_dataframe['poor'] = pos_probability
    
    
    return result_dataframe

### <font color=gray>Apply the predict function with the each countries optimal hyperparameters

In [11]:
result_df_A = predict(train_a_trans, test_a_trans, labels_A, n_estimators=100,
                      max_features=None, max_depth=40, min_samples_leaf=3, country='A')

result_df_B = predict(train_b_trans, test_b_trans, labels_B, n_estimators=100,
                      max_features='auto', max_depth=80, min_samples_leaf=2, country='B')

result_df_C = predict(train_c_trans, test_c_trans, labels_C, n_estimators=100,
                      max_features=None, max_depth=80, min_samples_leaf=1, country='C')

### <font color=gray>Concatenate the results to a single dataframe ready for submission

In [12]:
frames = [result_df_A, result_df_B, result_df_C]
submission_df = pd.concat(frames)
submission_df.head()

Unnamed: 0_level_0,country,poor
id,Unnamed: 1_level_1,Unnamed: 2_level_1
418,A,0.608189
41249,A,0.074432
16205,A,0.608053
97501,A,0.089529
67756,A,0.934198


### <font color=gray>Export the dataframe to a csv file

In [13]:
submission_df.to_csv('submission_file.csv')