# CS 7641 Machine Learning
# Assignment 2 Randomized Optimization

## Neural Network Weight Optimization : Random Hill Climbing

In [56]:
#
# 0. Import Packages
#

RANDOM_SEED = 27

# Math tools for ML
import numpy as np
import pandas as pd
import math
import time
import copy
from numpy import arange

# Randomized Optimization 
import mlrose_hiive

# Progress bar
#from tqdm import tqdm

# Graph visualization
import matplotlib.pyplot as plt

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import torch
import torch.nn as nn
#import torch.optim as optim
#import tqdm

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

# Model 
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import AdaBoostClassifier # Boosted Decision Tree
from sklearn.svm import SVC # SVM
from sklearn.neighbors import KNeighborsClassifier # KNN

#from sklearn import tree

import sklearn.metrics as mt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler


In [57]:
#
# 1. Data Preparation
#

df = pd.read_csv("./data/dropout.csv", sep=";")
df.info()

# Target 데이터 분석 및 제거
num_droupout = df.query('Target=="Dropout"')
print(len(num_droupout))
num_enrolled = df.query('Target=="Enrolled"')
print(len(num_enrolled))
num_graduated = df.query('Target=="Graduate"')
print(len(num_graduated))

# Enrolled 데이타 삭제 (pending)
df = df.drop(df[df['Target'] == "Enrolled"].index)

# pandas dataframe replace (from https://heytech.tistory.com/441)
df["Target"].replace({'Dropout':0, 'Graduate':1}, inplace = True)
print(df["Target"])

X_raw = df.values[:,:-1]
y_raw = df.values[:,-1]

print("x_raw dimension: ", X_raw.shape)
print("y_raw dimension: ", y_raw.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [58]:
#
# 1.1 Tensor Data Type (Pytorch)
#

#X_raw = torch.tensor(X_raw, dtype=torch.float32)
#y_raw = torch.tensor(y_raw, dtype=torch.float32).reshape(-1, 1)

In [59]:
#
# 1.2 Split train and test sets
#

# data type change 
#y_raw = y_raw.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, stratify=y_raw, test_size=0.2, random_state=RANDOM_SEED)

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

one_hot = OneHotEncoder()

y_train_hot = one_hot.fit_transform(y_train.reshape(-1, 1)).todense()
y_test_hot = one_hot.transform(y_test.reshape(-1, 1)).todense()

# print # of X_train
print(y_train_hot.shape)
print(y_test_hot.shape)

(2904, 2)
(726, 2)


In [60]:
#
# 1.3 Helper Function (Pytorch)
# 

In [61]:
# 
# 1.4 NN Class (mlrose-hiive)
#

In [52]:
#
# 2. Learning Curve
#

max_attempts = [ 1, 5, 10, 20]
max_iters = [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
restarts = [0, 20, 40, 60]

results_column = ["max_attempts", "max_iters", "restart", "accuracy_train", "accuracy_test", "train_time"]
results_list = []
results_df = pd.DataFrame(columns=results_column)

for max_attempt in max_attempts:
    for max_iter in max_iters:
        for restart in restarts: 
            
            kfold = KFold(n_splits=5, shuffle=True)
            cv_scores  = []
            
            start_time = time.perf_counter()
            nn_model = mlrose_hiive.NeuralNetwork(hidden_nodes = [2], 
                                                  activation = 'relu' ,
                                                  algorithm = 'random_hill_climb',
                                                  is_classifier = True,
                                                  early_stopping = True,
                                                  random_state = RANDOM_SEED,
                                                  max_attempts = max_attempt,
                                                  max_iters = max_iter,
                                                  restarts = restart)

            for train, validate in kfold.split(X_train_scaled, y_train_hot):
                nn_model.fit(X_train_scaled[train], y_train_hot[train])
                y_pred_hot = nn_model.predict(X_train_scaled[validate])
                accuracy = mt.accuracy_score(np.asarray(y_pred_hot), np.asarray(y_train_hot[validate]))
                cv_scores.append(accuracy)
            
            train_time = time.perf_counter() - start_time

            accuracy_train = np.mean(cv_scores)
    
            with torch.no_grad():
                y_pred_hot = nn_model.predict(X_test_scaled)
            accuracy_test = mt.accuracy_score(np.asarray(y_test_hot), np.asarray(y_pred_hot))
            
            results_df.loc[len(results_df.index)] = [max_attempt, max_iter, restart, accuracy_train, accuracy_test, train_time]
            print(max_attempt, max_iter, restart, accuracy_train, accuracy_test, train_time)

1 200 0 0.5864371772805509 0.6157024793388429 0.03648037500170176
1 200 20 0.6177725681049321 0.6336088154269972 0.3535527079984604
1 200 40 0.6084717193898748 0.6349862258953168 0.7651867919994402
1 200 60 0.6064039408866995 0.6308539944903582 1.0222414589989057
1 400 0 0.5864294616891209 0.6157024793388429 0.015033667001262074
1 400 20 0.6177725681049321 0.6336088154269972 0.3440122909996717
1 400 40 0.6084717193898748 0.6349862258953168 0.7371117500006221
1 400 60 0.6064039408866995 0.6308539944903582 1.1629577500025334
1 600 0 0.5864294616891209 0.6157024793388429 0.014857000001939014
1 600 20 0.6177725681049321 0.6336088154269972 0.3861012909983401
1 600 40 0.6084717193898748 0.6349862258953168 0.7297011250011565
1 600 60 0.6064039408866995 0.6308539944903582 0.9697528750002675
1 800 0 0.5864294616891209 0.6157024793388429 0.014911374997609528
1 800 20 0.6177725681049321 0.6336088154269972 0.353586458000791
1 800 40 0.6084717193898748 0.6349862258953168 0.7727379579991975
1 800 60

In [53]:
#print(results_df)
print("Best Test Accuracy: \n", results_df.loc[results_df['accuracy_test'].idxmax()])  

Best Test Accuracy: 
 max_attempts        20.000000
max_iters         2000.000000
restart             40.000000
accuracy_train       0.893939
accuracy_test        0.892562
train_time         531.009398
Name: 158, dtype: float64


In [55]:
results_df.to_csv('nn_random_hill_climbing.csv')