In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score, confusion_matrix
from math import sqrt

In [2]:
df_train = pd.read_csv("dengue_features_train.csv")
df_lables = pd.read_csv("dengue_labels_train.csv")
# merging data
data = pd.merge(df_train, df_lables, left_on= ['city','year','weekofyear'], 
                right_on=['city','year','weekofyear'])
print(data.shape)
data.head()

(1456, 25)


Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6


In [3]:
# fill missing values
data = (data.fillna(method='ffill'))
print(data.isnull().sum().sum())

0


In [4]:
# rename column names
data.columns = data.columns.str.replace("station", "stn")
data.columns = data.columns.str.replace("reanalysis", "re_an")
data.columns = data.columns.str.replace("humidity","hd")
data.columns = data.columns.str.replace("precipitation","prec")

In [5]:
data.city = data.city.astype("category")
data.year = data.year.astype("category")

In [6]:
dummies = pd.get_dummies(data[['city','year']])
dummies.columns

Index(['city_iq', 'city_sj', 'year_1990', 'year_1991', 'year_1992',
       'year_1993', 'year_1994', 'year_1995', 'year_1996', 'year_1997',
       'year_1998', 'year_1999', 'year_2000', 'year_2001', 'year_2002',
       'year_2003', 'year_2004', 'year_2005', 'year_2006', 'year_2007',
       'year_2008', 'year_2009', 'year_2010'],
      dtype='object')

For neural networks it is important that all features be on the same scale. So I have standardaised numerical features.

In [7]:
df_train = data.drop(columns = ['city','year','week_start_date','total_cases'])
df_test = data[['total_cases']]

In [8]:
scaler = StandardScaler()
df_train_std = scaler.fit_transform(df_train)

In [9]:
train = np.concatenate((df_train_std, dummies), axis =1)

In [10]:
train_x, test_x, train_y, test_y = train_test_split(train, df_test, test_size=0.2)
train_y.shape, train_x.shape

((1164, 1), (1164, 44))

### Neural Network Multi-Layer Perceptron Regressor model 

In [11]:
mlp_reg = MLPRegressor(hidden_layer_sizes=(6,9), activation='relu',max_iter=1000)
mlp_reg.fit(train_x, train_y.values.ravel()) # 



MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(6, 9), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=1000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=False,
             warm_start=False)

In [12]:
pred_reg =  mlp_reg.predict(test_x)
print(mlp_reg.loss_)

154.19278624469015


In [13]:
# Metrics
print('Mean Squared Error :',mean_squared_error(test_y, pred_reg))
print('Mean Aboslute Error :',mean_absolute_error(test_y, pred_reg))
print('Root Mean Squared Error ',sqrt(mean_squared_error(test_y, pred_reg)))

Mean Squared Error : 339.205232560615
Mean Aboslute Error : 10.223614931595362
Root Mean Squared Error  18.417525147549412


In [14]:
dengue_features = pd.read_csv("dengue_features_test.csv")
dengue_features.columns = dengue_features.columns.str.replace(r"reanalysis", "re_an")
dengue_features.columns = dengue_features.columns.str.replace(r"humidity", "hd")
dengue_features.columns = dengue_features.columns.str.replace(r"precipitation","prec")
dengue_features = dengue_features.fillna(method='ffill')

In [15]:
dengue_features['city'] = dengue_features['city'].astype('category')
dengue_features['year'] = dengue_features['year'].astype('category')
df_dummies = pd.get_dummies(dengue_features[['city','year']])
df_dummies = dengue_features.reindex(columns = dummies.columns, fill_value=0)

In [16]:
st_pred_data = scaler.transform(dengue_features.drop(columns =['city', 'year','week_start_date']))
test_data = np.concatenate((st_pred_data, df_dummies), axis =1)

In [17]:
predictions = mlp_reg.predict(test_data)
predictions = predictions.astype("int64")
submission = dengue_features[['city','year','weekofyear']]
submission['total_cases'] = predictions
submission.to_csv("submission_MLP.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


#### Compare the models SGD, SVC and MLP 
<Br>
The SGD Regressor has error of 15.6
<br>
The SVC model has error of 38.7
<br>
The MLP Regressor has an error of 23.8

#### Adding a new column called 'above_average' with value 1 or 0. 1 if the total_cases > median of total_cases

In [18]:
data['above_average'] = np.where(data['total_cases']>=12, '1', '0')
data.above_average.value_counts()

1    737
0    719
Name: above_average, dtype: int64

In [19]:
df_train = data.drop(columns = ['city','year','week_start_date','total_cases','above_average'])
df_test = data['above_average']
df_train_std = scaler.fit_transform(df_train)
train = np.concatenate((df_train_std, dummies), axis =1)

#### Neural Network MLP Classifier on the 'above_average' column with 80/20 train/test split

In [20]:
train_x, test_x, train_y, test_y = train_test_split(train, df_test, test_size=0.2)

#### Build a Neural Network MLP Classifier on the 'above_average' column with 80/20 train/test split

In [21]:
# MLP Classifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000)
mlp_clf.fit(train_x, train_y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [22]:
test_y_pred = mlp_clf.predict(test_x)

In [23]:
# Test Accuracy
print(accuracy_score(test_y, test_y_pred))

0.8732876712328768


In [24]:
# Confusion Matrix
cnf_matrix = confusion_matrix(test_y, test_y_pred)
cnf_matrix

array([[129,  21],
       [ 16, 126]])

__Precision__ The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.
The precision is the ratio TP / (TP + FP) 
<br>
__Recall__ The recall is  the ability of the classifier to find all the positive samples. The recall is the ratio TP / (TP + FN) 
<br>
__F1-Score__ The F-mesaure is the harmonic mean of the precision and recall. F1 = 2 * (precision * recall) / (precision + recall)
<br>
We use these metrics for the evaluate a classification problem as these metrics ifdentify how precisely model can classify.accuracy is not a good metric for classification, model can be wrong though it has high accuracy. Accuracy favours the major class of the classification data. The metrics recall and precision favours the minory class of data which is more useful.

In [25]:
# precision 
precision = precision_score(test_y, test_y_pred,average = "macro")
print('Precision: %f' % precision)
# recall
recall = recall_score(test_y, test_y_pred, average = "macro")
print('Recall: %f' % recall)
# f1
f1 = f1_score(test_y, test_y_pred, average = "macro")
print('F1 score: %f' % f1)

Precision: 0.873399
Recall: 0.873662
F1 score: 0.873274


High precision relates the low false positive rate. Recall relates to the high true positive rate. The recall and precision for our model of suggest that model is good.