### Final submission to Queen City's Hackathon 2020 - Kaggle track
---
In this notebook we combine our best models with a voting regressor (`VotingRegressor`), which averages their predictions. We used three regressors:
1. KNeighborsRegressor
1. RandomForestRegressor
1. MLPRegressor

In [2]:
## IPython setup for the notebook session
## load the autoreload extension ...
%load_ext autoreload
## ... and use mode 2: re-import modules before each cell is executed
%autoreload 2
## render matplotlib figures inline, below the cell that produces them
%matplotlib inline

In [3]:
from pathlib import Path
import pandas as pd

import numpy as np

from joblib import dump, load
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import (RandomForestRegressor, RandomForestClassifier)
from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm
from sklearn.ensemble import VotingRegressor
from sklearn.feature_selection import VarianceThreshold

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [4]:
## Per the competition's judging criteria, the models are evaluated
## ... using a weighted MSE.
## wmse = the weighted mean squared error.
def wmse(actual, pred, weight):
    """Return the weighted mean squared error of ``pred`` against ``actual``.

    Computes ``sum(weight * (actual - pred) ** 2) / sum(weight)``.

    Parameters
    ----------
    actual : array-like
        Observed target values.
    pred : array-like
        Predicted values, same shape as ``actual``.
    weight : array-like
        Per-observation weights, same shape as ``actual``.

    Lists, tuples, numpy arrays and pandas Series are all accepted. Values
    are paired by position (a pandas index is ignored), which matches how
    model predictions line up with the rows they were made from.

    Returns
    -------
    float
        The weighted mean of the squared errors.

    Raises
    ------
    ValueError
        If the three inputs do not share one shape, or if the weights sum
        to zero (the weighted mean is undefined in that case).
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    weight = np.asarray(weight, dtype=float)

    # Refuse silent broadcasting, e.g. (n,) against (n, 1) would yield an
    # (n, n) error matrix and a meaningless score.
    if not (actual.shape == pred.shape == weight.shape):
        raise ValueError(
            "actual, pred and weight must have the same shape, got "
            "{}, {} and {}".format(actual.shape, pred.shape, weight.shape)
        )

    total_weight = weight.sum()
    if total_weight == 0:
        raise ValueError("weights sum to zero; weighted MSE is undefined")

    squared_error = (actual - pred) ** 2
    return float(np.sum(squared_error * weight) / total_weight)

array([0, 1])

In [5]:
## sanity check of wmse on a tiny hand-computed case:
## squared errors are (1, 0) and weights are (2, 1), so (2*1 + 1*0) / 3 = 2/3
wmse(
    actual=np.array([0, 1]),
    pred=np.array([1, 1]),
    weight=np.array([2, 1]),
)

0.6666666666666666

In [13]:
## global variables ##
## ---------------- ##
## all paths are relative to the notebook's working directory
PROJ = Path(".")
DATA = PROJ / "data"
RAW = DATA / "raw"

## Path.iterdir() yields entries in arbitrary order; sorting keeps the
## listing identical from one run (and one machine) to the next.
data_files_list = sorted(RAW.iterdir())
[str(f) for f in data_files_list]

['data/raw/.ipynb_checkpoints',
 'data/raw/data description.csv',
 'data/raw/example_code.py',
 'data/raw/testing.csv',
 'data/raw/training.csv']

---
### Reading the data sets

In [14]:

## load the competition's test set; the first CSV column becomes the row index
test_data = pd.read_csv(RAW/'testing.csv', index_col=0)
test_data.info()
## As the summary shows, test_data does not have the 'target' column:
## that is the value we have to predict.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45 entries, 0 to 44
Columns: 495 entries, Area_2015 to Transit_Ridership_Total_2013
dtypes: float64(352), int64(143)
memory usage: 174.4 KB


In [15]:
## load the training set (first CSV column becomes the row index);
## unlike test_data, it also carries the 'target' column
df = pd.read_csv(RAW/'training.csv', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 416 entries, 0 to 415
Columns: 496 entries, Area_2015 to target
dtypes: float64(353), int64(143)
memory usage: 1.6 MB


In [16]:
## selecting the weights for the wmse function:
## the competition stipulated that the errors will be weighted by
## ...the column 'Population _2018' (note the space before the underscore).
## NOTE(review): this column is only read here, so it also stays in df as a feature.
weights = df['Population _2018']

In [17]:
## number of missing values in each column
## (isna is the pandas alias of isnull, so the result is the same)
df.isna().sum()

Area_2015                         0
Area_2013                         0
Population_Density_2018           0
Population _2018                  0
Population_Density_2017           0
                               ... 
Transit_Ridership_2014          107
Transit_Ridership_Total_2014    107
Transit_Ridership_2013          106
Transit_Ridership_Total_2013    106
target                            0
Length: 496, dtype: int64

---
### Splitting the data

In [18]:
## Features are every column except the label 'target'; the label becomes y.
X = df.drop(columns='target').copy()
y = df.loc[:, 'target'].copy()

## `weights` is split together with X and y so that every row keeps its own
## weight; weights_train / weights_test are needed to report WMSE results.
(
    X_train, X_test,
    y_train, y_test,
    weights_train, weights_test,
) = train_test_split(X, y, weights, random_state=42)

In [19]:
## shapes of the four pieces produced by the split, in the order
## (X_train, X_test, y_train, y_test)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((312, 495), (104, 495), (312,), (104,))

In [20]:
## Preprocessing: replace missing values with the column mean, then standardize.
trans = Pipeline(steps=[
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])

## The imputer means and scaler statistics are learned from X_train only,
## and X_train is transformed in the same step. The fitted `trans` is then
## reused as-is on the hold-out split and on test_data below, so neither of
## them influences the preprocessing.
trans_Xtrain = trans.fit_transform(X_train)
trans_Xtest = trans.transform(X_test)

## The transform only replaces values (fills NaNs, rescales columns); in this
## run the displayed shape is the same as X_test.shape.
display('trans_Xtest.shape:', trans_Xtest.shape)

## same fitted preprocessing applied to the competition's test_data
trans_test = trans.transform(test_data)

## Ensemble of the three regressors, combined by a VotingRegressor.
## NOTE(review): per the scikit-learn docs n_iter_no_change applies to the
## 'sgd'/'adam' solvers; with solver='lbfgs' it looks to have no effect - confirm.
models3 = VotingRegressor(
    estimators=[
        ('knn', KNeighborsRegressor(n_neighbors=6)),
        ('rf', RandomForestRegressor(random_state=42)),
        ('mlp', MLPRegressor(
            solver='lbfgs',
            alpha=1e-5,
            hidden_layer_sizes=(100,),
            max_iter=200,
            random_state=42,
            n_iter_no_change=10,
        )),
    ]
)
models3.fit(trans_Xtrain, y_train)

'trans_Xtest.shape:'

(104, 495)

VotingRegressor(estimators=[('knn',
                             KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=None, n_neighbors=6,
                                                 p=2, weights='uniform')),
                            ('rf',
                             RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,

In [21]:
## quick check of what kind of object the preprocessing `trans` is
type(trans)

sklearn.pipeline.Pipeline

In [35]:
## making predictions from training set
predicts = models3.predict(trans_Xtest)
print("Test Score: {:.2f}".format(models3.score(trans_Xtest, y_test)))
print(metrics.mean_squared_error(y_test,predicts))
wmse(y_test, predicts, weights_test)

Test Score: 0.19
1.2738969793223929


0.37505134694783393

In [15]:
## making predictions for test_data set
predicts_test_data = models3.predict(trans_test)

## checking that the test data set has 45 rows.
predicts_test_data.shape

## in this case, we don't know how we did because we don't have the data. 
## only the judges know the answer, right?

(45,)