# Part I - Dataset Analysis

1. Import the **training set ONLY** using Pandas (**HINT**: the dataset is in csv format). We have given you the `feature_names` already below.

2. Report the type of every feature (**HINT**: `DataFrame.dtypes` might be handy).

3. Report if the dataset is balanced.

4. Print the feature values of every feature. In the case of a numerical feature print the range and average (with 3 decimal places) instead.

5. Using plots of histograms, bar-graphs or heatmaps *briefly* comment on the following (don't forget to include the plots in your .ipynb notebook):
    - Does the `age` feature follow a normal distribution (just by eyeballing)?
    - Is the `poutcome` feature unimodal (just by eyeballing)?
    - Is the `education` feature unimodal (just by eyeballing)?
    - Taking into account **only** the numerical features do you notice any correlation between pairs of features?

In [None]:
import pandas as pd
import numpy as np

# Column names for the bank dataset. The last entry, "category", is the class
# label rather than a feature.
feature_names = [
    "age", "job", "marital", "education", "default", "balance", "housing",
    "loan", "contact", "day", "month", "duration", "campaign", "pdays",
    "previous", "poutcome", "category",
]

# Questions 1 & 2: load the training set and report the type of every feature.
# Column names are passed explicitly via names=, so pandas treats the first
# row of the file as data rather than as a header.
train = pd.read_csv('bank_train', index_col=False, sep=',', names=feature_names)

# Report every column except the trailing 'category' label.
for name in feature_names[:-1]:
    # Look the dtype up by column label (integer-position lookups on
    # train.dtypes are deprecated) and accept any numeric dtype, not only
    # int64. 'day' is stored as an integer but is treated as categorical.
    if name != 'day' and pd.api.types.is_numeric_dtype(train[name]):
        kind = 'numerical'
    else:
        kind = 'categorical'
    print(f'{name}: {kind}')

In [None]:
# Bare expression as the last line of the cell: the notebook renders the DataFrame.
train   # display train dataframe

In [None]:
# Question 3: relative frequency of each class label. normalize=True returns
# proportions instead of raw counts, so class balance can be read off directly.
train['category'].value_counts(normalize=True)   # to check whether the dataset is balanced
# Reference: https://dfrieds.com/data-analysis/value-counts-python-pandas.html

In [None]:
# Question 4: print the values of every feature; for numerical features print
# the range and the mean (3 decimal places) instead.
for name in feature_names[:-1]:   # skip the trailing 'category' label
    column = train[name]
    # 'day' is integer-coded but treated as categorical, so its values are
    # listed rather than summarised. The dtype is checked on the column itself
    # (label-based) instead of by integer position in train.dtypes.
    if name != 'day' and pd.api.types.is_numeric_dtype(column):
        print('{}: min={}, max={}, avg={:0.3f}'.format(
            name, column.min(), column.max(), column.mean()))
    else:
        # Distinct values in sorted order, so the listing is identical on every
        # run (iteration order of a set of strings is not stable across runs).
        values = column.drop_duplicates().sort_values().tolist()
        print('{}: number={}, {}'.format(name, len(values), values))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Question 5: distribution plots and correlations between numerical features.

# 'age' is numerical, so a histogram shows the shape of its distribution.
train['age'].hist()
plt.title('age')
plt.show()

# 'poutcome' and 'education' hold category labels: draw one bar per category
# (height = number of rows) instead of binning the labels with a histogram.
for name in ('poutcome', 'education'):
    train[name].value_counts().plot(kind='bar', title=name)
    plt.show()

# Correlation heatmap over the non-object columns only. 'day' is dropped as
# well because it is treated as a categorical feature.
fhm = train.select_dtypes(exclude=['object']).drop('day', axis=1)
corr = fhm.corr()   # pairwise correlation matrix
sns.heatmap(corr, annot=True)   # annot=True writes each coefficient in its cell
plt.show()

## Part II (Decision Trees)

- Load data without any preprocessing
- Perform hyperparameter tuning on depth of DT
- Plot train, dev accuracy curves
- Test on test set ONCE

In [None]:
# Dataset loading
from BankDataset import load_dataset
import pandas as pd

# Load the train / dev / test splits, each with preprocess_onehot=True.
train_dataset, dev_dataset, test_dataset = (
    load_dataset(split_file, preprocess_onehot=True)
    for split_file in ('bank_train', 'bank_dev', 'bank_test')
)

In [None]:
# Print every instance attribute of the training dataset object, one
# "name value" pair per line.
for attr_name, attr_value in vars(train_dataset).items():
    print(attr_name, attr_value)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Hyperparameter tuning: training accuracy for each candidate tree depth.
max_depth = range(1, 30)   # candidate depths 1..29
accuracy_train = []   # accuracy_train[j] is the training accuracy at max_depth[j]

for depth in max_depth:
    # random_state is fixed so that fitting is deterministic: an unseeded tree
    # can pick different splits on each fit, which makes the curve change
    # from run to run.
    BankTree = DecisionTreeClassifier(criterion="gini", max_depth=depth, random_state=0)
    BankTree.fit(train_dataset.X, train_dataset.y)   # fit on the training set

    # Score the fitted tree on the same data it was trained on.
    pred_train = BankTree.predict(train_dataset.X)
    accuracy_train.append(metrics.accuracy_score(train_dataset.y, pred_train))

In [None]:
# Hyperparameter tuning: development-set accuracy for each candidate depth.
accuracy_dev = []   # accuracy_dev[j] is the dev accuracy at max_depth[j]

for depth in max_depth:
    # random_state is fixed so that fitting is deterministic: an unseeded tree
    # can pick different splits on each fit, which makes the curve change
    # from run to run.
    BankTree = DecisionTreeClassifier(criterion="gini", max_depth=depth, random_state=0)
    BankTree.fit(train_dataset.X, train_dataset.y)   # always fit on the TRAINING set

    # Only the evaluation uses the development set.
    pred_dev = BankTree.predict(dev_dataset.X)
    accuracy_dev.append(metrics.accuracy_score(dev_dataset.y, pred_dev))

In [None]:
# Accuracy curves over tree depth, one line per data split.
for split_label, split_accuracy in (("Train", accuracy_train), ("Dev", accuracy_dev)):
    plt.plot(max_depth, split_accuracy, label=split_label)

# Title and axis labels, then the legend (its upper-left corner anchored at
# (0.8, 0.3)), and finally write the figure to disk.
plt.title('Accuracy against Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend(loc='upper left', bbox_to_anchor=(0.8, 0.3), borderaxespad=0.)
plt.savefig('Q2a.jpg')

In [None]:
# Report the best development accuracy and the depth that achieved it.
# The depth is looked up in max_depth instead of being derived as index + 1,
# so the report stays correct if the candidate range is changed.
best_dev_accuracy = max(accuracy_dev)
best_depth = max_depth[accuracy_dev.index(best_dev_accuracy)]   # first depth reaching the maximum
print("The maximum accuracy is {:1.3f} at max_depth = {}".format(best_dev_accuracy, best_depth))

In [None]:
# Test-set evaluation: refit a tree with the depth that scored best on the
# development set. The depth is read from max_depth (not index + 1) so it
# stays correct if the candidate range is changed.
best_depth = max_depth[accuracy_dev.index(max(accuracy_dev))]

# random_state is fixed so the reported test accuracy is reproducible.
BankTree = DecisionTreeClassifier(criterion="gini", max_depth=best_depth, random_state=0)
BankTree.fit(train_dataset.X, train_dataset.y)

# predict with test set
pred_test = BankTree.predict(test_dataset.X)
print("DecisionTrees's Accuracy (Test Set): ", metrics.accuracy_score(test_dataset.y, pred_test))

## Part III (k Nearest Neighbours)

- Load data with 1-hot encoding and scaling
- Perform hyperparameter tuning on number of neighbours k
- Plot train, dev accuracy curves
- Test on test set ONCE...more!

In [None]:
# Dataset loading (load data again! Careful with preprocessing parameters)
from BankDataset import load_dataset

# Reload the train / dev / test splits for kNN, this time passing both
# preprocess_onehot=True and apply_scaling=True.
train_dataset, dev_dataset, test_dataset = (
    load_dataset(split_file, preprocess_onehot=True, apply_scaling=True)
    for split_file in ('bank_train', 'bank_dev', 'bank_test')
)

In [None]:
# Print every instance attribute of the training dataset object, one
# "name value" pair per line.
for attr_name, attr_value in vars(train_dataset).items():
    print(attr_name, attr_value)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

import numpy as np
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt

# Hyperparameter tuning: training accuracy for each candidate k.

# Candidate neighbour counts: the odd numbers 1, 3, ..., 29.
n_neighbors = range(1, 30, 2)
# accuracy_train[j] is the training accuracy at n_neighbors[j].
accuracy_train = []

for k in n_neighbors:
    # fit() returns the fitted estimator, so construction and fitting chain.
    BankTree = KNeighborsClassifier(n_neighbors=k).fit(train_dataset.X, train_dataset.y)

    # Score the classifier on the same data it was fitted on.
    pred_train = BankTree.predict(train_dataset.X)
    train_score = metrics.accuracy_score(train_dataset.y, pred_train)
    accuracy_train.append(train_score)

In [None]:
# Hyperparameter tuning: development-set accuracy for each candidate k.
# accuracy_dev[j] is the dev accuracy at n_neighbors[j].
accuracy_dev = []

for k in n_neighbors:
    # The classifier is fitted on the training set; only the evaluation below
    # uses the development set. fit() returns the fitted estimator.
    BankTree = KNeighborsClassifier(n_neighbors=k).fit(train_dataset.X, train_dataset.y)

    pred_dev = BankTree.predict(dev_dataset.X)
    dev_score = metrics.accuracy_score(dev_dataset.y, pred_dev)
    accuracy_dev.append(dev_score)

In [None]:
# Accuracy curves over the number of neighbours, one line per data split.
for split_label, split_accuracy in (("Train", accuracy_train), ("Dev", accuracy_dev)):
    plt.plot(n_neighbors, split_accuracy, label=split_label)

# Title and axis labels, then the legend (its upper-left corner anchored at
# (0.8, 0.8)), and finally write the figure to disk.
plt.title('Accuracy against K Value')
plt.xlabel('K Value')
plt.ylabel('Accuracy')
plt.legend(loc='upper left', bbox_to_anchor=(0.8, 0.8), borderaxespad=0.)
plt.savefig('Q3a.jpg')

In [None]:
# Restrict the choice of k to the candidates that come before the first point
# where development accuracy exceeds training accuracy.

# Per-candidate gap: dev accuracy minus training accuracy.
difference = [dev_acc - train_acc for dev_acc, train_acc in zip(accuracy_dev, accuracy_train)]

# Position of the first candidate whose dev accuracy is above its training
# accuracy. If dev never exceeds train, keep every candidate instead of
# letting next() raise StopIteration.
ind = next((pos for pos, gap in enumerate(difference) if gap > 0), len(difference))

# With ind == 0 the slice below would be empty and max() would fail with an
# opaque message, so report the actual problem instead.
if ind == 0:
    raise ValueError(
        "no candidate k precedes the first point where dev accuracy exceeds training accuracy"
    )

max(accuracy_dev[:ind])   # best dev accuracy among the retained candidates

In [None]:
# Best development accuracy among the first `ind` candidates, and the k at
# which that accuracy first occurs.
top_dev_accuracy = max(accuracy_dev[:ind])
top_k = n_neighbors[accuracy_dev.index(top_dev_accuracy)]
print("The maximum accuracy of development set is {:1.4f} at k={}. ".format(top_dev_accuracy, top_k))

In [None]:
# Test-set evaluation: refit kNN with the k that gave the best development
# accuracy among the first `ind` candidates.
best_k = n_neighbors[accuracy_dev.index(max(accuracy_dev[:ind]))]

# NOTE: the variable keeps the name used in the decision-tree section, but it
# holds a k-nearest-neighbours classifier here.
BankTree = KNeighborsClassifier(n_neighbors=best_k)
BankTree.fit(train_dataset.X, train_dataset.y)

# predict with test set
pred_test = BankTree.predict(test_dataset.X)
# The label names the model actually evaluated (it previously said "DecisionTrees").
print("kNN's Accuracy (Test Set): ", metrics.accuracy_score(test_dataset.y, pred_test))