In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import torch.optim as optim
from numpy import vstack
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Linear
from torch.nn import ReLU
from torch.nn import Sigmoid
from torch.nn import Module
from torch.optim import SGD
from torch.nn import CrossEntropyLoss

# Import for Hyperparameter optimization using Ax
from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

# Acknowledgements


Changing normal datatypes to tensors: 
https://towardsdatascience.com/deep-learning-on-dataframes-with-pytorch-66b21be54ef6
https://stackoverflow.com/questions/44617871/how-to-convert-a-list-of-strings-into-a-tensor-in-pytorch

pytorch nn model
https://machinelearningmastery.com/pytorch-tutorial-develop-deep-learning-models/

In [130]:

#SG encoder is declared so that it can be used to inverse transform the predicted result
sg_encoder = LabelEncoder()
df = read_csv('clean.csv')

# Label encode all predictors for the training data
for col in df.columns:
    if df.dtypes[col] == "object" and col != 'status_group':
        df[col] = LabelEncoder().fit_transform(df[col])
    if col == 'status_group':
        df['status_group'] = sg_encoder.fit_transform(df['status_group'])

cols_at_end = ['status_group']
df = df[[c for c in df if c not in cols_at_end] 
        + [c for c in cols_at_end if c in df]]

#Store it into a temporary csv
pd.DataFrame(df).to_csv("clean-kt.csv")
print(df.shape)

print(df.status_group.unique())

(59400, 22)
[2 0 1]


In [131]:
df = read_csv('clean_test.csv')

# Label encode all predictors
for col in df.columns:
    if df.dtypes[col] == "object" and col != 'status_group':
        df[col] = LabelEncoder().fit_transform(df[col])

#Store it into a temporary csv
print(df.shape)
pd.DataFrame(df).to_csv("clean_test-kt.csv")

(14850, 22)


In [132]:
class CSVDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path)
        
        #Drop the unused columns. Unnamed: 0 is generated after saving the dataset.
        df = df.drop('Unnamed: 0', axis=1)
        df = df.drop('id', axis=1)
        
        #Assign x to all input values
        self.X = df.values[:, :-1]
        
        #Assign y to all target values
        self.y = df.values[:, -1]
        
        # ensure input data is floats
        self.X = self.X.astype('float32')        
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx]]
 
    # get indexes for train and test rows
    def get_splits(self, n_test=0.33):
        # determine sizes
        test_size = round(n_test * len(self.X))
        train_size = len(self.X) - test_size
        # calculate the split
        return random_split(self, [train_size, test_size])
 
# model definition
class MLP(Module):
    # define model elements
    def __init__(self, n_inputs):
        super(MLP, self).__init__()
        
        #Determine the input and output of each layer. Could also be passed as params for optimization
        layers = [300,200,100]
        total_layers = []
        input_size = n_inputs
        
        for i in layers:
            total_layers.append(nn.Linear(input_size, i))
            total_layers.append(nn.ReLU(inplace=True))
            total_layers.append(nn.BatchNorm1d(i))
            total_layers.append(nn.Dropout(0.2))
            input_size = i
        
        total_layers.append(nn.Linear(layers[-1], 3))

        self.layers = nn.Sequential(*total_layers)


 
    # forward propagate input
    def forward(self, X):
        X = self.layers(X)
        return X

In [133]:
class CSVTestDataset(Dataset):
    # load the dataset
    def __init__(self, path):
        # load the csv file as a dataframe
        df = read_csv(path)

        # Drop unused columns
        df = df.drop('id', axis=1)
        df = df.drop('Unnamed: 0', axis=1)
        if 'Unnamed: 0.1' in df.columns:
            df = df.drop('Unnamed: 0.1', axis=1)
        print('CSVTestDataset =', df.shape)
        print(df.columns)
        
        #Assign x all input values
        self.X = df.values[:, :]
        # ensure input data is floats
        self.X = self.X.astype('float32')
 
    # number of rows in the dataset
    def __len__(self):
        return len(self.X)
 
    # get a row at an index
    def __getitem__(self, idx):
        return [self.X[idx]]
 
    # returns all inputs from test dataset
    def get_test(self):
        return self.X;

In [134]:
# # prepare the dataset
def prepare_data(path):
    # load the dataset
    dataset = CSVDataset(path)
    # calculate split
    train, test = dataset.get_splits()
    # prepare data loaders
    train_dl = DataLoader(train, batch_size=32, shuffle=True)
    test_dl = DataLoader(test, batch_size=1024, shuffle=False)
    return train_dl, test_dl


In [135]:
path = 'clean-kt.csv'
train_dl, test_dl = prepare_data(path)

In [122]:
## helper train/fit function

def train(model, parameterization, train_dl):
    
    #Gradient descent optimizer
    optimizer = optim.SGD(model.parameters(), lr=parameterization["lr"], momentum=parameterization["momentum"])
    
    #Use cross entropy loss function
    criterion = CrossEntropyLoss()
    
    for epoch in range(2):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, (inputs, targets) in enumerate(train_dl, 0):

            # clear the gradients
            optimizer.zero_grad()
            
            # compute the model output
            yhat = model(inputs)
            
            # calculate loss
            loss = criterion(yhat, targets.long())

            # credit assignment
            loss.backward()
            
            # update model weights
            optimizer.step()
            
            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:    # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        
    return model

            
## helper function to evaluate the accuracy for the tested model
def evaluate(model, test_dl):
    predictions, actuals = list(), list()
    for i, (inputs, targets) in enumerate(test_dl):
        # evaluate the model on the test set
        yhat = model(inputs)
        _, predicted = torch.max(yhat, axis=1)

        # retrieve numpy array
        yhat = yhat.detach().numpy()
        actual = targets.numpy()

        # round to class values
        yhat = yhat.round()

        # store predictions
        predictions.append(yhat)
        
        # transform 1d data and store eg. [0,1,2] => [[1,0,0] [0,1,0] [0,0,1]]
        actual = actual.astype(int)
        act = np.zeros((actual.size, actual.max()+1))
        act[np.arange(actual.size),actual] = 1
        actuals.append(act)

    predictions, actuals = vstack(predictions), vstack(actuals)

    # transform result to a numpy array of results eg. [0,1,2]
    actuals = np.argmax(actuals, axis=1)
    predictions = np.argmax(predictions, axis=1)
    
    # Determine accuracy
    acc = np.sum(predictions == actuals) / actuals.shape[0]
    print('Accuracy of model:' , np.sum(predictions == actuals)/ actuals.shape[0])

    return acc

# make a class prediction for one row of data
def _predict(data, model):
    predictions = list()
    for i, inputs in enumerate(data):
        yhat = model(inputs)
        # retrieve numpy array
        yhat = yhat.detach().numpy()
        predictions.append(yhat)

    # Get prediction numpy arr with 3 columns
    prediction_list = vstack(predictions)
    
    # Get prediction results eg [0,1,2]
    results = np.argmax(prediction_list, axis=1)
    return results

        
## helper function train-evaluate to pass as the function to be optimized
def train_evaluate(parameterization):
    model = MLP(20)
    model = train(model, parameterization, train_dl)
    return evaluate(model, test_dl)
    

In [123]:
# The code below is to test if my model is working properly.
model = MLP(20)
print(model.parameters())
trained_model = train(model, {"lr": 0.01, "momentum": 0.5}, train_dl)
acc = evaluate(trained_model,test_dl)


<generator object Module.parameters at 0x7fd7cd01dac0>
Accuracy of model: 0.5489745944291399


In [124]:
# The code below finds the best parameters to run the model based on the params given into the function. 
# Eventually, the best_parameters will be used to generate the model that will predict the y values for the test 
# dataset.
best_parameters, values, experiment, model = optimize(
    parameters=[
        {"name": "lr", "type": "range", "bounds": [1e-6, 0.4], "log_scale": True},
        {"name": "momentum", "type": "range", "bounds": [0.0, 1.0]},
    ],
    evaluation_function=train_evaluate,
    objective_name='accuracy',
    total_trials=15
)

[INFO 11-10 22:01:47] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter lr. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 11-10 22:01:47] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter momentum. If that is not the expected value type, you can explicity specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 11-10 22:01:47] ax.modelbridge.dispatch_utils: Using Bayesian Optimization generation strategy: GenerationStrategy(name='Sobol+GPEI', steps=[Sobol for 5 trials, GPEI for subsequent trials]). Iterations after 5 will take longer to generate due to  model-fitting.
[INFO 11-10 22:01:47] ax.service.managed_loop: Started full optimization with 15 steps.
[INFO 11-10 22:01:47] ax.service.managed_loop: Running optimization trial 1...
[INFO 11-10 22:02:09] ax.service.managed_loop: Running optimizati

Accuracy of model: 0.5506070809101112


[INFO 11-10 22:02:35] ax.service.managed_loop: Running optimization trial 3...


Accuracy of model: 0.5168860320375472


[INFO 11-10 22:02:56] ax.service.managed_loop: Running optimization trial 4...


Accuracy of model: 0.4463830221405979


[INFO 11-10 22:03:19] ax.service.managed_loop: Running optimization trial 5...


Accuracy of model: 0.5560146923783288


[INFO 11-10 22:03:39] ax.service.managed_loop: Running optimization trial 6...


Accuracy of model: 0.5542291602897663



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:04:00] ax.service.managed_loop: Running optimization trial 7...


Accuracy of model: 0.5535659626568717



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:04:20] ax.service.managed_loop: Running optimization trial 8...


Accuracy of model: 0.5099479644934191



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:04:43] ax.service.managed_loop: Running optimization trial 9...


Accuracy of model: 0.5447913478216508



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:05:08] ax.service.managed_loop: Running optimization trial 10...


Accuracy of model: 0.5404040404040404



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:05:33] ax.service.managed_loop: Running optimization trial 11...


Accuracy of model: 0.5585144373023161



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:05:54] ax.service.managed_loop: Running optimization trial 12...


Accuracy of model: 0.4406183042546679



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:06:17] ax.service.managed_loop: Running optimization trial 13...


Accuracy of model: 0.5543311906948271



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:06:38] ax.service.managed_loop: Running optimization trial 14...


Accuracy of model: 0.5349454137332925



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).

[INFO 11-10 22:07:00] ax.service.managed_loop: Running optimization trial 15...


Accuracy of model: 0.5417304356698296



To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).



Accuracy of model: 0.55331088664422


In [100]:
best_parameters

{'lr': 0.3999999999999993, 'momentum': 0.31699246915975815}

In [101]:
values

({'accuracy': 0.5655545198696273},
 {'accuracy': {'accuracy': 2.9158741001568507e-10}})

In [136]:
# Use best params found to create model to determine the score.
model = MLP(20)
# test = train_evaluate(best_parameters)
# print('test =', test)
print(model.parameters())
trained_model = train(model, best_parameters, train_dl)
acc = evaluate(trained_model,test_dl)


<generator object Module.parameters at 0x7fd7ce88ea50>
Accuracy of model: 0.555045403530252


In [138]:
# Get test dataset from the csv file.
test_dataset = CSVTestDataset('clean_test-kt.csv')

# Get self.x of CSVTestDataset
test_df = test_dataset.get_test()

# Pass test_df into dataloader
test_df = DataLoader(test_df, batch_size=10, shuffle=False)

# Predict results
test_results = _predict(test_df, trained_model)

CSVTestDataset = (14850, 20)
Index(['amount_tsh', 'funder', 'gps_height', 'installer', 'longitude',
       'latitude', 'basin', 'region', 'district_code', 'lga', 'population',
       'scheme_management', 'extraction_type', 'management', 'payment_type',
       'water_quality', 'quantity', 'source', 'waterpoint_type',
       'operational_year'],
      dtype='object')


In [139]:
#Check data prediction
print(test_results)
np.unique(test_results)
results_numpy = sg_encoder.inverse_transform(test_results)
print(results_numpy)

[0 0 0 ... 0 0 0]
['functional' 'functional' 'functional' ... 'functional' 'functional'
 'functional']


In [141]:

# Get dataframe
test_data = pd.read_csv('clean_test-kt.csv')

# Create dataframe with predictions and id
submission_df = pd.DataFrame(results_numpy, columns=['status_group'])
submission_df['id'] = test_data.id
submission_df = submission_df[['id','status_group']]

# Create new csv
pd.DataFrame(submission_df).to_csv("nnmodel_results.csv", index=False)