# DengAI Analysis Part 6 - TPOT

By: Chengyi (Jeff) Chen, under guidance of CSCI499: AI for Social Good Teaching Assistant - Aaron Ferber

---
## Content

In this notebook, we will explore using an external API called [tpot](https://github.com/EpistasisLab/tpot) that automatically optimizes machine learning pipelines using genetic programming to help us find the best regressor for our dengue prediction.

<a id="imports"></a>

---
## Library Imports

In [1]:
# Library Imports
import pandas as pd
import numpy as np
import subprocess
import os
from collections import Counter
from sklearn import model_selection, kernel_ridge, linear_model, metrics, feature_selection, preprocessing
from os import listdir
from os.path import isfile, join, isdir
import warnings
warnings.filterwarnings('ignore')

# plotting libraries
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('seaborn-paper')
import seaborn as sns
sns.set(style="ticks")
from pylab import rcParams
%matplotlib inline

---

## TPOT

__Let's run tpot on both City sj and iq data to generate the best possible regressors to be used for our prediction.__

In [2]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

def preprocess_data(data_path, labels_path=None):
    """
    Fills all NaNs with the most recent value
    
    Parameters: 
    -----------
    data_path: (str) Path to location of the DengAI training set features
    labels_path: (str) Path to location of the DengAI training set labels
    
    Returns:
    --------
    The pandas dataframes of City SJ and IQ features with the labels at the last column
    """
    # load data and set index to city, year, weekofyear
    df = pd.read_csv(data_path, index_col=[0, 1, 2])
    
    # fill missing values
    df.fillna(method='ffill', inplace=True)

    # add labels to dataframe
    if labels_path:
        labels = pd.read_csv(labels_path, index_col=[0, 1, 2])
        df = df.join(labels)
    
    # separate san juan and iquitos
    sj = df.loc['sj']
    iq = df.loc['iq']
    
    return sj, iq

def get_tpot_best(X_train, X_test, y_train, y_test, city):
    """
    Uses TPOT's Regressor optimizer to find the best regressor given the data provided
    
    Parameters: 
    -----------
    train_test_split numpy arrays and specification of which city's data this belongs to
    
    Returns:
    --------
    Nothing. It saves the optimum regressor into a python script
    """
    tpot = TPOTRegressor(generations=10, population_size=100,
                         offspring_size=None, mutation_rate=0.9,
                         crossover_rate=0.1,
                         scoring='neg_mean_absolute_error', cv=5,
                         subsample=1.0, n_jobs=-1,
                         max_time_mins=None, max_eval_time_mins=5,
                         random_state=None, config_dict=None,
                         template="RandomTree",
                         warm_start=False,
                         memory=None,
                         use_dask=False,
                         periodic_checkpoint_folder=None,
                         early_stop=None,
                         verbosity=0,
                         disable_update_check=False)
    tpot.fit(X_train, y_train)
    print(tpot.score(X_test, y_test))
    tpot.export('{}_tpot_dengai_pipeline.py'.format(city))



In [3]:
# Get City SJ and IQ data
sj_train, iq_train = preprocess_data(data_path='./data/dengai/features/dengue_features_train.csv', 
                                     labels_path='./data/dengai/labels/dengue_labels_train.csv')

In [4]:
sj_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
year,weekofyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,...,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,...,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6


In [5]:
iq_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
year,weekofyear,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2000,26,2000-07-01,0.192886,0.132257,0.340886,0.2472,25.41,296.74,298.45,295.184286,307.3,...,92.418571,25.41,16.651429,8.928571,26.4,10.775,32.5,20.7,3.0,0
2000,27,2000-07-08,0.216833,0.2761,0.289457,0.241657,60.61,296.634286,298.428571,295.358571,306.6,...,93.581429,60.61,16.862857,10.314286,26.9,11.566667,34.0,20.8,55.6,0
2000,28,2000-07-15,0.176757,0.173129,0.204114,0.128014,55.52,296.415714,297.392857,295.622857,304.5,...,95.848571,55.52,17.12,7.385714,26.8,11.466667,33.0,20.7,38.1,0
2000,29,2000-07-22,0.227729,0.145429,0.2542,0.200314,5.6,295.357143,296.228571,292.797143,303.6,...,87.234286,5.6,14.431429,9.114286,25.766667,10.533333,31.5,14.7,30.0,0
2000,30,2000-07-29,0.328643,0.322129,0.254371,0.361043,62.76,296.432857,297.635714,293.957143,307.0,...,88.161429,62.76,15.444286,9.5,26.6,11.48,33.3,19.1,4.0,0


### Optimize Regressor for City SJ data

In [6]:
sj_X_train, sj_X_test, sj_y_train, sj_y_test = train_test_split(sj_train.drop(['week_start_date', 'total_cases'], axis=1).astype(float), 
                                                                sj_train['total_cases'].astype(float), 
                                                                train_size=0.75, 
                                                                test_size=0.25)

get_tpot_best(sj_X_train, sj_X_test, sj_y_train, sj_y_test, city='sj')

-23.16131926561329


### Optimize Regressor for City IQ data

In [7]:
iq_X_train, iq_X_test, iq_y_train, iq_y_test = train_test_split(iq_train.drop(['week_start_date', 'total_cases'], axis=1).astype(float), 
                                                                iq_train['total_cases'].astype(float), 
                                                                train_size=0.75, 
                                                                test_size=0.25)

get_tpot_best(iq_X_train, iq_X_test, iq_y_train, iq_y_test, city='iq')

-5.45120052924523


---

## Optimized Models

In [8]:
from sklearn.pipeline import make_pipeline, make_union
from sklearn.metrics import mean_absolute_error

__City San Juan Regressor__

In [9]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# Average CV score on the training set was:-20.202304763339182
sj_exported_pipeline = make_pipeline(
    StackingEstimator(estimator=AdaBoostRegressor(learning_rate=0.01, loss="exponential", n_estimators=100)),
    AdaBoostRegressor(learning_rate=0.001, loss="linear", n_estimators=100)
)

sj_exported_pipeline.fit(sj_X_train, sj_y_train)
sj_y_pred = sj_exported_pipeline.predict(sj_X_test)
mean_absolute_error(sj_y_test, sj_y_pred)

23.165315854645822

__City Iquitos Regressor__

In [10]:
from sklearn.feature_selection import SelectPercentile, f_regression
from xgboost import XGBRegressor
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# Average CV score on the training set was:-5.872964338767224
iq_exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        FunctionTransformer(copy)
    ),
    SelectPercentile(score_func=f_regression, percentile=24),
    XGBRegressor(learning_rate=0.01, max_depth=5, min_child_weight=15, n_estimators=100, nthread=1, subsample=0.1)
)

iq_exported_pipeline.fit(iq_X_train, iq_y_train)
iq_y_pred = iq_exported_pipeline.predict(iq_X_test)
mean_absolute_error(iq_y_test, iq_y_pred)

5.45120052924523

__The MAE scores don't really look much better than the vanilla models we used previously though, I think we will have to choose a time series analysis model that captures the time aspect of the data instead.__

---

## Competition Predictions

In [13]:
sj_test, iq_test = preprocess_data('./data/dengai/test_features/dengue_features_test.csv')

sj_predictions = sj_exported_pipeline.predict(sj_test.drop(['week_start_date'], axis=1).astype(float)).astype(int)
iq_predictions = iq_exported_pipeline.predict(iq_test.drop(['week_start_date'], axis=1).astype(float)).astype(int)

submission = pd.read_csv('./data/dengai/submission_format.csv',
                         index_col=[0, 1, 2])

submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
submission.to_csv('./data/dengai/tpot_submission.csv')