# 06 Explore Feature selection
Used the cleaned, scaled, and normalised data to train a model.

**Disclaimer.** I used AzureML for this step. My trail expired one week before the end of the competition so I started using SciKit Learn. I submitted results from here to the competition but never bettered the score I got via AzureML even though the RMSE scores were comparable. I can only assume that I was overtraining here!

## Initialise the styles for the workbooks

In [1]:
# Initialise styles and packages we need
from IPython.core.display import HTML
def css_styling():
    styles = open("styles/custom.css", "r").read()
    return HTML(styles)
css_styling()

## Imports and classes used

In [2]:
# All the imports used
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

print("Pandas version:       {}".format(pd.__version__))
#print("Scikit learn version: {}".format(sklearn.__version__))

Pandas version:       0.23.4


## Import cleaned, scaled and normalised data we created in 03 Data Scaling and Normalising

In [3]:
final_scaled_normalised_training_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Normal_Training_values.csv'
training_labels_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Training_labels.csv'
final_scaled_normalised_test_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Normal_Test_values.csv'

training_values = pd.read_csv(final_scaled_normalised_training_values_filename)
training_labels = pd.read_csv(training_labels_filename)
test_values = pd.read_csv(final_scaled_normalised_test_values_filename)

# Makes sure country_code and year are treated as categorical!
training_values['country_code'] = training_values['country_code'].astype('category')
training_values['year'] = training_values['year'].astype('category')
test_values['country_code'] = test_values['country_code'].astype('category')
test_values['year'] = test_values['year'].astype('category')

print("Training values: {}".format(training_values.shape))
print("Training label: {}".format(training_labels.shape))
print("Test values:     {}".format(test_values.shape))
print(training_values.head())
#print(training_values.dtypes)

Training values: (1311, 19)
Training label: (1401, 2)
Test values:     (616, 19)
   row_id country_code  year  agricultural_land_area  forest_area  \
0       0      889f053  2002                0.644849     0.326591   
1       1      9e614ab  2012                0.393423     0.598121   
2       2      100c476  2000                0.013088     0.099657   
3       3      4609682  2013                0.545174     0.371419   
4       4      be2a7f5  2008                0.059169     0.177130   

   total_land_area  population_growth  avg_value_of_food_production  \
0         0.522760           0.555582                      0.221955   
1         0.435434           0.463276                      0.583676   
2         0.017267           0.515693                      0.360597   
3         0.396593           0.455697                      0.647380   
4         0.039847           0.379837                      0.672717   

   food_imports_as_share_of_merch_exports  \
0                               

## All the steps below is one example of trying to improve the model
That is, this is one run of tuning I tried as an example.

In this specific case I limted the feature set to only:
* access_to_improved_water_sources
* access_to_electricity
* gross_domestic_product_per_capita_ppp
* avg_supply_of_protein_of_animal_origin
* caloric_energy_from_cereals_roots_tubers

In [4]:
print(training_values.columns)
features_to_use = ['row_id',
                   'access_to_improved_water_sources',
                   'access_to_electricity',
                   'gross_domestic_product_per_capita_ppp',
                   'avg_supply_of_protein_of_animal_origin',
                   'caloric_energy_from_cereals_roots_tubers']

TFVals = training_values[features_to_use]
TestVals = test_values[features_to_use]
print(TFVals.shape)
print(TestVals.shape)

print(TFVals.head())

Index(['row_id', 'country_code', 'year', 'agricultural_land_area',
       'forest_area', 'total_land_area', 'population_growth',
       'avg_value_of_food_production',
       'food_imports_as_share_of_merch_exports',
       'gross_domestic_product_per_capita_ppp',
       'per_capita_food_supply_variability',
       'avg_supply_of_protein_of_animal_origin',
       'caloric_energy_from_cereals_roots_tubers',
       'access_to_improved_sanitation', 'access_to_improved_water_sources',
       'obesity_prevalence', 'access_to_electricity', 'co2_emissions',
       'ratio_urban_population_total'],
      dtype='object')
(1311, 6)
(616, 6)
   row_id  access_to_improved_water_sources  access_to_electricity  \
0       0                          0.381366               0.514125   
1       1                          0.566625               0.762074   
2       2                          0.938416               0.893837   
3       3                          0.910661               0.971112   
4       4   

## Add the label values to the training features

In [5]:
tempDF = pd.merge(TFVals, training_labels, on='row_id', how='inner')
print(tempDF.shape)

(1311, 7)


## Prepare the training feature matrix and training label vector
I didn't actually time this, but using .values converts the DataFrame to an numpy Array and apparently improves the performace of SciKit Learn. Long way of saying that even though SciKit Learn handles DataFrames it should be more efficient to convert before calling SciKit Learn modules.

In [6]:
X = pd.get_dummies(TFVals.iloc[:,1:len(TFVals)]).values
y = tempDF['prevalence_of_undernourishment'].values
print(X)
print(y)

[[0.38136639 0.51412482 0.34322496 0.24725323 0.67218215]
 [0.5666249  0.7620736  0.3679343  0.31000858 0.73943552]
 [0.93841623 0.89383726 0.64441976 0.72922355 0.08933629]
 ...
 [0.6404953  0.90820707 0.41870264 0.65127939 0.31108026]
 [0.91747038 0.94717855 0.66192119 0.80430425 0.09206421]
 [0.70355405 0.22182681 0.26482206 0.26398007 0.90993425]]
[31.26071279 18.29823274 39.51339713 ... 12.08848436 26.43666106
 13.71256945]


## Spilt data, train model, predict, cross validate
Check result of this reduced feature set. Not actually tuning, more like feature selection.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [8]:
nn = MLPRegressor(hidden_layer_sizes=300, max_iter=500, verbose=False, activation='identity', solver='adam')
print(nn)

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


In [9]:
nn.fit(X_train, y_train)

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [10]:
y_pred = nn.predict(X_test)
print((metrics.mean_squared_error(y_test,y_pred))**0.5)

7.650445849531831


In [11]:
# Basically search across 10 fold to find best "Model"!?
scores = cross_val_score(nn, X, y, cv=10, scoring='neg_mean_squared_error')
print("RMSE: %0.2f (+/- %0.2f)" % ((abs(scores.mean())**0.5), (abs(scores.std())**0.5) * 2))

RMSE: 7.67 (+/- 6.91)


In [12]:
print(nn)

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


## Train on full dataset and predict PoU of test set.

In [13]:
nn.fit(X, y)

scores = cross_val_score(nn, X, y, cv=10, scoring='neg_mean_squared_error')
print("RMSE: %0.2f (+/- %0.2f)" % ((abs(scores.mean())**0.5), (abs(scores.std())**0.5) * 2))

RMSE: 7.67 (+/- 6.89)


In [14]:
# Use trained model to predict outcome?
X_test = pd.get_dummies(TestVals.iloc[:,1:len(TestVals)]).values
y_pred = nn.predict(X_test)

In [15]:
print(y_pred)

[-2.34721932e+00  2.56692110e+00  2.89906933e+00  2.32339838e+01
  2.71924265e+01  1.77004153e+01  1.85244483e+01  2.85530530e+01
  2.98416975e+01  3.76393784e+01  2.59813130e+00  8.44050997e+00
  1.18646038e+01  1.23621640e+00  2.55368945e+01  2.35386380e+01
  3.51971789e+01  2.59530865e+00  7.29717566e+00  1.95005766e+01
  2.26635000e+01  2.21050864e+01  1.72578505e+00  2.65039768e+00
  3.76717585e+00  2.08394626e+01  2.62032466e+01  1.10093351e+01
 -4.43978233e-01  3.34026621e+00  4.48274765e+00  3.07837059e+00
  3.57910832e+00  9.24112480e+00  4.10313022e+00  3.06048238e+01
  3.35998414e+01  8.43858414e-01  7.93740865e+00  2.98824232e+01
  2.30447046e+01  3.19295688e+01  6.08758108e+00  4.57154904e+00
  7.09102976e+00  1.69778883e+01  2.49323058e+01  4.84546203e+00
  3.17849271e+01  2.68577583e+01  4.64774185e+00  5.98955530e+00
  4.52563621e+00  1.03909147e+01  2.12377354e+01  3.22840774e+00
  2.07077980e+00  2.70348770e+00  2.80357714e+01  5.37111084e+00
  2.15684533e+01  2.18132

## Shape into output format and export as CSV to disk

In [16]:
pred = pd.DataFrame(pd.Series(y_pred), columns=['prevalence_of_undernourishment'])
row = pd.DataFrame(test_values['row_id'].copy(), columns=['row_id'])
res = pd.concat([row, pred], axis='columns')
print(type(pred))
print(type(row))
print(type(res))
print(res.head())


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
   row_id  prevalence_of_undernourishment
0       0                       -2.347219
1       1                        2.566921
2       2                        2.899069
3       3                       23.233984
4       4                       27.192426


In [17]:
res.to_csv('prediction-20181031-01.csv', index=False)