## Imports

In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
#from keras.wrappers.scikit_learn import KerasRegressor
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit


# Over-sampling for imbalance problem
from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter


## Pre-process data

In [None]:
# Load the neighbourhood data set, balance the classes with SMOTE, split into
# train/test sets and one-hot encode the labels for the Keras classifier.
# NOTE(review): the absolute path below ties this notebook to one machine.
os.chdir('/Users/hadi/Documents/Professional_development/DS/INSIGHT/Project/Data/Data_pro')
raw_data = pd.read_csv('Data_All_sorted_alpha_MVP_V7_2.csv')

#raw_data.index = raw_data.Neighbourhood

Y = pd.DataFrame(raw_data['Label'])

# Feature selection
# Option 1: all features
X = pd.DataFrame(raw_data[[
    'Home price', 'Change in housing pricing', 'Low income population',
    'Change in low income pop', 'Total Area', 'Total Population',
    'Pop  25 - 34 years', 'Recent Immigrants', 'TTC Stops', 'Health Providers', 'Businesses',
    'Social Housing Units', 'Rent Bank Applicants',
]])

# Option 2: high-correlation features only
# X = pd.DataFrame(raw_data[['Change in low income pop',\
#                          'Pop  25 - 34 years','Businesses',\
#                         'Social Housing Units']])

X_resampled, y_resampled = SMOTE().fit_resample(X, Y)
# Flatten before counting: iterating a DataFrame yields its column labels, so
# Counter(y_resampled) would count column names instead of class labels.
print(sorted(Counter(np.ravel(y_resampled)).items()))

# Split test/train data
# Method 1: not stratified
    # if original data:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 44)

    # if resampled data:
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, y_resampled, test_size = 0.15, random_state = 44)



# # Method 2: stratified
# split = StratifiedShuffleSplit(n_splits=1, test_size=.20, random_state=44) # split is to classify for stratify
# for train_index, test_index in split.split(raw_data, raw_data[['Label']]):  # column to use to stratify
#     X_train = X.loc[train_index]
#     X_test = X.loc[test_index]
#     Y_train = Y.loc[train_index]
#     Y_test = Y.loc[test_index]

# Encode class values as integers.
# The encoder is fitted ONCE on the full label column so that the train and
# test splits share the same label -> integer mapping; re-fitting on Y_test
# could assign different integers if the test split lacks a class.
encoder = LabelEncoder()
encoder.fit(np.ravel(Y))
n_classes = len(encoder.classes_)

encoded_Y_train = encoder.transform(np.ravel(Y_train))
# Convert integers to dummy variables (i.e. one-hot encoded). num_classes is
# pinned so both splits produce matrices of identical width.
dummy_Y_train = np_utils.to_categorical(encoded_Y_train, num_classes=n_classes)

encoded_Y_test = encoder.transform(np.ravel(Y_test))
# Convert integers to dummy variables (i.e. one-hot encoded)
dummy_Y_test = np_utils.to_categorical(encoded_Y_test, num_classes=n_classes)

#Standardize the Data
# X_train = pd.DataFrame(StandardScaler().fit_transform(X_train)) 
# X_test = pd.DataFrame(StandardScaler().fit_transform(X_test))



## Modeling

In [None]:
# NN Modelling 

# define baseline model
def baseline_model():
    """Build and compile the baseline feed-forward classifier.

    The network takes 13 input features, passes them through hidden layers of
    10, 8 and 6 ReLU units, and ends in a 4-unit softmax layer. It is compiled
    with categorical cross-entropy loss, the Adam optimizer and accuracy as
    the reported metric.

    Side effect: the freshly compiled model is pickled to
    'finalized_Cl_NNmodel.sav' in the current working directory each time this
    function is called.

    Returns:
        The compiled (untrained) Keras ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(10, input_dim=13, activation='relu'))
    model.add(Dense(8, kernel_initializer='normal', activation='relu'))
    model.add(Dense(6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(4, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # NOTE(review): the model is saved here BEFORE any training happens, so the
    # file on disk holds initial weights only. Anything that loads it must fit
    # it before using its predictions.
    predictive_Cl_model_NN = 'finalized_Cl_NNmodel.sav'
    # Use a context manager so the file handle is closed on every call; this
    # function runs once per cross-validation fold.
    with open(predictive_Cl_model_NN, 'wb') as model_file:
        pickle.dump(model, model_file)
    return model

# Cross-validate the baseline network on the training split (4 shuffled folds)
# and report the mean and standard deviation of the fold scores in percent.
estimator = KerasClassifier(
    build_fn=baseline_model,
    epochs=200,
    batch_size=5,
    verbose=0,
)
seed = 7
np.random.seed(seed)  # fix NumPy's global RNG before the folds are evaluated
kfold = KFold(n_splits=4, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train, dummy_Y_train, cv=kfold)
mean_score_pct = results.mean()*100
std_score_pct = results.std()*100
print("Baseline: %.2f%% (%.2f%%)" % (mean_score_pct, std_score_pct))

# # create model
# model = Sequential()
# #    model.add(Dropout (0.2))
# model.add(Dense(10, input_dim=13, kernel_initializer='normal', activation='relu'))
# model.add(Dense(8, kernel_initializer='normal', activation='relu'))
# model.add(Dense(6, kernel_initializer='normal', activation='relu'))
# model.add(Dense(1, kernel_initializer='normal', activation='relu'))
# # Compile model
# model.compile(loss='categorical_crossentropy',optimizer='sgd',metrics=['accuracy'])
# # save the model to disk
# predictive_model_NN = 'finalized_NNmodel_CL.sav'
# pickle.dump(model, open(predictive_model_NN, 'wb'))    



# # fix random seed for reproducibility
# seed = 7
# numpy.random.seed(seed)

# model.fit(X_train, Y_train, epochs=5, batch_size=20)
# loss_and_metrics = model.evaluate(X_test, Y_test, batch_size=20)

# classes = model.predict(X_test, Y_test, batch_size=20)

# print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))


# # Since the result was lower than for the original data, the following is commented out.
# # evaluate model with standardized dataset
# #numpy.random.seed(seed)
# # seed = 7
# # estimators = []
# # estimators.append(('standardize', StandardScaler()))
# # estimators.append(('mlp', KerasRegressor(build_fn=baseline_model, epochs=50, batch_size=5, verbose=0)))
# # pipeline = Pipeline(estimators)
# # kfold = KFold(n_splits=5, random_state=seed)
# # results = cross_val_score(pipeline, X_train, Y_train, cv=kfold)
# # print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))
    

## Predictions

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# load the model from disk
# NOTE(review): pickle.load can execute arbitrary code; only load files this
# notebook wrote itself.
with open('finalized_Cl_NNmodel.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# The file is written by baseline_model() right after compile(), i.e. before
# any training, and cross_val_score only trains throw-away clones. Fit the
# loaded model on the training split (same epochs/batch size as the
# cross-validated estimator) so the predictions below come from a trained net.
loaded_model.fit(X_train, dummy_Y_train, epochs=200, batch_size=5, verbose=0)

y_pred_NN = loaded_model.predict(X_test)
y_pred_train_NN = loaded_model.predict(X_train)

print(y_pred_NN)
print(Y_test)

# accuracy_score cannot compare one-hot targets with predicted probabilities
# directly, so both are reduced to class indices first.
y_pred_classes_NN = np.argmax(y_pred_NN, axis=1)
y_true_classes = np.argmax(dummy_Y_test, axis=1)
print("NN method accuracy:"+str(accuracy_score(y_true_classes, y_pred_classes_NN)))

# Mean absolute difference between one-hot targets and predicted probabilities.
Error = np.abs(dummy_Y_test - y_pred_NN)#/np.abs(Y_test)
print("Error : in "+str(np.mean(Error)))

# NOTE(review): R-squared is a regression metric; here it is computed on the
# one-hot targets versus the predicted probabilities, so read it as a rough
# calibration measure and rely on the accuracy above for classification quality.
rr_TRAIN = metrics.r2_score(dummy_Y_train, y_pred_train_NN)
rr_TRAIN = round(rr_TRAIN,2)
print("R-Squared-TRAIN ="+str(rr_TRAIN))

rr_TEST = metrics.r2_score(dummy_Y_test, y_pred_NN)
rr_TEST = round(rr_TEST,2)
print("R-Squared-TEST ="+str(rr_TEST))

