# 04 Train Model
Used the cleaned, scaled, and normalised data to train the model.

**Disclaimer.** I used AzureML for this step. My trail expired one week before the end of the competition so I started using SciKit Learn. I submitted results from here to the competition but never bettered the score I got via AzureML even though the RMSE scores were comparable. I can only assume that I was overtraining here!

## Initialise the styles for the workbooks¶

In [1]:
# Initialise styles and packages we need
from IPython.core.display import HTML
def css_styling():
    styles = open("styles/custom.css", "r").read()
    return HTML(styles)
css_styling()

## Imports and classes used

In [2]:
# All the imports used
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

print("Pandas version:       {}".format(pd.__version__))
#print("Scikit learn version: {}".format(sklearn.__version__))

Pandas version:       0.23.4


## Import cleaned, scaled and normalised data we created in 03 Data Scaling and Normalising

In [45]:
final_scaled_normalised_training_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Normal_Training_values.csv'
training_labels_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Training_labels.csv'
final_scaled_normalised_test_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Normal_Test_values.csv'

training_values = pd.read_csv(final_scaled_normalised_training_values_filename)
training_labels = pd.read_csv(training_labels_filename)
test_values = pd.read_csv(final_scaled_normalised_test_values_filename)

# Makes sure country_code and year are treated as categorical!
training_values['country_code'] = training_values['country_code'].astype('category')
training_values['year'] = training_values['year'].astype('category')
test_values['country_code'] = test_values['country_code'].astype('category')
test_values['year'] = test_values['year'].astype('category')

print("Training values: {}".format(training_values.shape))
print("Training label: {}".format(training_labels.shape))
print("Test values:     {}".format(test_values.shape))
print(training_values.head())
#print(training_values.dtypes)

Training values: (1311, 19)
Training label: (1401, 2)
Test values:     (616, 19)
   row_id country_code  year  agricultural_land_area  forest_area  \
0       0      889f053  2002                0.644849     0.326591   
1       1      9e614ab  2012                0.393423     0.598121   
2       2      100c476  2000                0.013088     0.099657   
3       3      4609682  2013                0.545174     0.371419   
4       4      be2a7f5  2008                0.059169     0.177130   

   total_land_area  population_growth  avg_value_of_food_production  \
0         0.522760           0.555582                      0.221955   
1         0.435434           0.463276                      0.583676   
2         0.017267           0.515693                      0.360597   
3         0.396593           0.455697                      0.647380   
4         0.039847           0.379837                      0.672717   

   food_imports_as_share_of_merch_exports  \
0                               

## Join training features and label into test dataset

In [46]:
tempDF = pd.merge(training_values, training_labels, on='row_id', how='inner')
print(tempDF.shape)

(1311, 20)


## Create the test feature matrix and test label vector.
Keep in mind that at this stage features are numerical and get_dummies doesn't actually do anything. I used it during development and saw no reason to remove it later.

In [50]:
# Start at 2nd column, i.e. exclude country_code
X = pd.get_dummies(training_values.iloc[:,2:len(training_values)])
y = tempDF['prevalence_of_undernourishment'].values
print(X.shape)
#print(X.dtypes)
print(y)

(1311, 32)
agricultural_land_area                      float64
forest_area                                 float64
total_land_area                             float64
population_growth                           float64
avg_value_of_food_production                float64
food_imports_as_share_of_merch_exports      float64
gross_domestic_product_per_capita_ppp       float64
per_capita_food_supply_variability          float64
avg_supply_of_protein_of_animal_origin      float64
caloric_energy_from_cereals_roots_tubers    float64
access_to_improved_sanitation               float64
access_to_improved_water_sources            float64
obesity_prevalence                          float64
access_to_electricity                       float64
co2_emissions                               float64
ratio_urban_population_total                float64
year_2000                                     uint8
year_2001                                     uint8
year_2002                                     uint8
y

## Use train/test split with different random_state values

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=31)

(983, 100)


## Create regressor using LBFGS (small training set)

In [7]:
nn = MLPRegressor(activation='identity',
                  hidden_layer_sizes=300,
                  max_iter=500,
                  verbose=False,
                  solver='lbfgs')
print(nn)

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


## Train model and then use it to predict PoU

In [8]:
nn.fit(X_train, y_train)
y_pred = nn.predict(X_test)
print((metrics.mean_squared_error(y_test,y_pred))**0.5)

2.787673633282141


## Check model sensitivy by running a 10 fold cross validation to get a sense of the variance.

In [9]:
scores = cross_val_score(nn, X, y, cv=10, scoring='neg_mean_squared_error')
print("RMSE: %0.2f (+/- %0.2f)" % ((abs(scores.mean())**0.5), (abs(scores.std())**0.5) * 2))

RMSE: 2.76 (+/- 2.43)


In [10]:
print(nn)

MLPRegressor(activation='identity', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=300, learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


# Try to use a grid search to find "best" parameters
# define the parameter values that should be searched
hidden_layer_sizes_range = [300, 400, 500] # had 100, 200, 400, 500
solver_options = [ 'lbfgs'] #, 'lbfgs', 'sgd',]
activation_options = ['identity']#, 'logistic', 'tanh', 'relu']
alpha_range = [0.0001]
max_iter_range = [400, 500] # 200, 300, 500..
#beta_1_range = [0.3, 0.5, 0.7, 0.9, 0.99]
#beta_2_range = [0.5, 0.999]

# create a parameter grid: map the parameter names to the values that should be searched
param_grid = dict(hidden_layer_sizes=hidden_layer_sizes_range,
                  solver=solver_options,
                  alpha=alpha_range,
                  activation=activation_options,
                  max_iter=max_iter_range)
#                  beta_1=beta_1_range,
#                  beta_2=beta_2_range)
#                  learning_rate_init=learning_rate_init_range,

print(param_grid)

# instantiate and fit the grid
grid = GridSearchCV(nn, param_grid, cv=10, 
                    scoring='neg_mean_squared_error', 
                    return_train_score=False,
                    n_jobs = -1)
grid.fit(X, y)

#grid = GridSearchCV(nn, param_grid, cv=10, 
#                    scoring='neg_mean_squared_error', 
#                    return_train_score=False,
#                    n_jobs = -1)

# Best using lbfgs:
#{'activation': 'identity', 'alpha': 0.0001, 'hidden_layer_sizes': 300, 'max_iter': 400, 'solver': 'lbfgs'}
# Best using adam:
#{'activation': 'identity', 'alpha': 0.0001, 'beta_1': 0.9, 'beta_2': 0.999, 'hidden_layer_sizes': 400, 'max_iter': 400, 'solver': 'adam'}

# Choose to use lbfgs!

print(pd.DataFrame(grid.cv_results_)[['mean_test_score', 'std_test_score', 'params']])
print(grid.best_score_)
print(grid.best_params_)

access_to_improved_water_sources
access_to_electricity
obesity_prevalence
access_to_improved_sanitation
avg_supply_of_protein_of_animal_origin
life_expectancy
adult_literacy_rate
avg_value_of_food_production

droughts_floods_extreme_temps
population_growth
anemia_prevalence
caloric_energy_from_cereals_roots_tubers
net_oda_received_percent_gni
open_defecation
fertility_rate
country_code

# All the testing below implies this is the BEST model, so use it to predict and check result?
nn = MLPRegressor(activation='identity',
                  hidden_layer_sizes=300,
                  max_iter=500,
                  verbose=False,
                  solver='lbfgs')
print(nn)

In [11]:
# Use the whole dataset to train the model
nn.fit(X, y)
y_pred = nn.predict(X)
print((metrics.mean_squared_error(y,y_pred))**0.5)
#print(y_pred)

2.5256444240817757


In [33]:
#print(test_values.shape)
#print(test_values.head())
print(test_values.dtypes)
#X_test = pd.get_dummies(test_values).iloc[:,2:-1].values
#print(X)
#print(X_test)
#X_pred = pd.get_dummies(test_values).iloc[:,4:-1] #.values
#X_pred = test_values.iloc[:,2:-1] #.values
#print(X_pred.shape)
#print(X_pred)
#y_pred = nn.predict(X_test)

row_id                                        int64
country_code                                 object
year                                          int64
agricultural_land_area                      float64
forest_area                                 float64
total_land_area                             float64
population_growth                           float64
avg_value_of_food_production                float64
food_imports_as_share_of_merch_exports      float64
gross_domestic_product_per_capita_ppp       float64
per_capita_food_supply_variability          float64
avg_supply_of_protein_of_animal_origin      float64
caloric_energy_from_cereals_roots_tubers    float64
access_to_improved_sanitation               float64
access_to_improved_water_sources            float64
obesity_prevalence                          float64
access_to_electricity                       float64
co2_emissions                               float64
ratio_urban_population_total                float64
dtype: objec

In [None]:
print(y_pred)

In [None]:
pred = pd.DataFrame(pd.Series(y_pred), columns=['prevalence_of_undernourishment'])
row = pd.DataFrame(test_values['row_id'].copy(), columns=['row_id'])
res = pd.concat([row, pred], axis='columns')
print(type(pred))
print(type(row))
print(type(res))
print(res.head())


In [None]:
res.to_csv('data/prediction-20181030-03.csv', index=False)