# 03 Data Scaling and Normalising
Although not all models need scaling and/or normalising, we want to use a neural network, so it is best to make the transformations up front.

## Initialise the styles for the workbooks

In [1]:
# Initialise styles and packages we need
from IPython.core.display import HTML
def css_styling():
    """Load the notebook stylesheet and wrap it in an HTML display object.

    Reads ``styles/custom.css`` (relative to the current working directory)
    and returns its contents wrapped in ``HTML`` so the notebook can render it.

    Returns:
        The ``HTML`` object built from the stylesheet text.

    Raises:
        OSError: if the stylesheet cannot be opened or read.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open("styles/custom.css", "r") as stylesheet:
        styles = stylesheet.read()
    return HTML(styles)
# Apply the custom stylesheet: as the last expression of the cell, the
# returned HTML object is rendered by the notebook.
css_styling()

## Imports and classes used

In [2]:
# All the imports used
import pandas as pd
from sklearn import preprocessing

# Report the library versions this notebook was run with.
pandas_version = pd.__version__
print("Pandas version:       {}".format(pandas_version))
# Disabled: only `preprocessing` is imported from sklearn above, so the name
# `sklearn` is not bound in this notebook.
#print("Scikit learn version: {}".format(sklearn.__version__))

Pandas version:       0.23.4


## Import cleaned data we created in 02 Data Cleansing

In [3]:
# Locations of the cleaned CSV files.
training_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Training_values.csv'
test_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Test_values.csv'

# Read both files into DataFrames (training first, then test).
training_values, test_values = map(
    pd.read_csv, (training_values_filename, test_values_filename)
)

# Quick look at what was loaded: shapes, column names and the first rows.
print(
    "Training values: {}".format(training_values.shape),
    "Test values:     {}".format(test_values.shape),
    sep="\n",
)
print(training_values.columns, training_values.head(), sep="\n")

Training values: (1311, 19)
Test values:     (616, 19)
Index(['row_id', 'country_code', 'year', 'agricultural_land_area',
       'forest_area', 'total_land_area', 'population_growth',
       'avg_value_of_food_production',
       'food_imports_as_share_of_merch_exports',
       'gross_domestic_product_per_capita_ppp',
       'per_capita_food_supply_variability',
       'avg_supply_of_protein_of_animal_origin',
       'caloric_energy_from_cereals_roots_tubers',
       'access_to_improved_sanitation', 'access_to_improved_water_sources',
       'obesity_prevalence', 'access_to_electricity', 'co2_emissions',
       'ratio_urban_population_total'],
      dtype='object')
   row_id country_code  year  agricultural_land_area    forest_area  \
0       0      889f053  2002           235077.658128    5397.737544   
1       1      9e614ab  2012            23000.640738  178336.529912   
2       2      100c476  2000               90.954867     101.780566   
3       3      4609682  2013           100

## Make country_code and year categorical columns

In [4]:
# Columns that are identifiers/labels rather than measurements.
category_features = ['country_code', 'year']

# Convert each of those columns to the pandas 'category' dtype in both frames.
for frame in (training_values, test_values):
    for column in category_features:
        frame[column] = frame[column].astype('category')

## Apply a power transform (scikit-learn `PowerTransformer`) to all the left-skewed features.

In [5]:
# Yeo-Johnson is the default
power_features = ['agricultural_land_area', 'forest_area', 'total_land_area',
                  'population_growth', 'avg_value_of_food_production',
                  'food_imports_as_share_of_merch_exports',
                  'gross_domestic_product_per_capita_ppp',
                  'per_capita_food_supply_variability',
                  'avg_supply_of_protein_of_animal_origin',
                  'obesity_prevalence', 'co2_emissions']

# standardize=True also rescales the transformed output to zero mean / unit variance.
pt = preprocessing.PowerTransformer(standardize=True)
for f in power_features:
    # Learn the transform parameters from the TRAINING data only, then apply
    # that same fitted transform to the test data. Re-fitting on the test set
    # would put the two sets on different scales, so a model trained on one
    # would see inconsistently transformed inputs on the other.
    training_values[f] = pt.fit_transform(training_values[[f]]).ravel()
    test_values[f] = pt.transform(test_values[[f]]).ravel()

## Apply min-max scaling (`MinMaxScaler`) to the numerical columns

In [6]:
min_max_features = ['caloric_energy_from_cereals_roots_tubers',
                    'access_to_improved_sanitation',
                    'access_to_improved_water_sources',
                    'access_to_electricity']

# ALL the columns should be done
numerical_features = power_features + min_max_features

mms = preprocessing.MinMaxScaler(copy=True, feature_range=(0, 1))
for f in numerical_features:
    # Fit the min/max on the TRAINING data only and reuse that fitted scaler
    # for the test data, so both sets are mapped with the same parameters.
    # Test values outside the training range may therefore land slightly
    # outside [0, 1]; that is expected and keeps the two sets comparable.
    training_values[f] = mms.fit_transform(training_values[[f]]).ravel()
    test_values[f] = mms.transform(test_values[[f]]).ravel()

In [7]:
# Curves look okay, but all overlaid on top of each other.
# Draws one KDE curve per column, for the columns from position 4 up to (but
# not including) the last column of the training frame.
# NOTE(review): this slice skips the column at position 3 and the final
# column -- confirm whether that is intended.
for column_name in training_values.columns[4:-1]:
    training_values[column_name].plot.kde()
#    plt.show()
#    plt.close()

## Sanity check result and export to CSV for next steps

In [8]:
# Print the first rows of the transformed training data as a plain-text sanity check.
print(training_values.head())

   row_id country_code  year  agricultural_land_area  forest_area  \
0       0      889f053  2002                0.644849     0.326591   
1       1      9e614ab  2012                0.393423     0.598121   
2       2      100c476  2000                0.013088     0.099657   
3       3      4609682  2013                0.545174     0.371419   
4       4      be2a7f5  2008                0.059169     0.177130   

   total_land_area  population_growth  avg_value_of_food_production  \
0         0.522760           0.555582                      0.221955   
1         0.435434           0.463276                      0.583676   
2         0.017267           0.515693                      0.360597   
3         0.396593           0.455697                      0.647380   
4         0.039847           0.379837                      0.672717   

   food_imports_as_share_of_merch_exports  \
0                                0.412590   
1                                0.270031   
2                      

In [9]:
# Output locations for the scaled and normalised data sets.
final_scaled_normalised_training_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Normal_Training_values.csv'
final_scaled_normalised_test_values_filename = 'data/DAT102x_Predicting_Chronic_Hunger_-_Clean_Normal_Test_values.csv'

# Write each frame to its CSV file without the DataFrame index column.
for frame, destination in (
    (training_values, final_scaled_normalised_training_values_filename),
    (test_values, final_scaled_normalised_test_values_filename),
):
    frame.to_csv(destination, index=False)

## Next Steps
We are ready to train our model now. "04 Train Model" covers this.