# CS 7641 Machine Learning
# Assignment 2 Randomized Optimization

## Neural Network Weight Optimization : Simulated Annealing

In [10]:
#
# 0. Import Packages
#

# han: check if random seed is necessary
RANDOM_SEED = 27

# Math tools for ML
import numpy as np
import pandas as pd
import math
import time
import copy
from numpy import arange

# Randomized Optimization 
import mlrose_hiive

# Progress bar
#from tqdm import tqdm

# Graph visualization
import matplotlib.pyplot as plt

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import torch
import torch.nn as nn
#import torch.optim as optim
#import tqdm

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

# Model 
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import AdaBoostClassifier # Boosted Decision Tree
from sklearn.svm import SVC # SVM
from sklearn.neighbors import KNeighborsClassifier # KNN

#from sklearn import tree

import sklearn.metrics as mt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler


In [11]:
#
# 1. Data Preparation
#

df = pd.read_csv("./data/dropout.csv", sep=";")
df.info()

# Target 데이터 분석 및 제거
num_droupout = df.query('Target=="Dropout"')
print(len(num_droupout))
num_enrolled = df.query('Target=="Enrolled"')
print(len(num_enrolled))
num_graduated = df.query('Target=="Graduate"')
print(len(num_graduated))

# Enrolled 데이타 삭제 (pending)
df = df.drop(df[df['Target'] == "Enrolled"].index)

# pandas dataframe replace (from https://heytech.tistory.com/441)
df["Target"].replace({'Dropout':0, 'Graduate':1}, inplace = True)
print(df["Target"])

X_raw = df.values[:,:-1]
y_raw = df.values[:,-1]

print("x_raw dimension: ", X_raw.shape)
print("y_raw dimension: ", y_raw.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [12]:
#
# 1.1 Tensor Data Type (Pytorch)
#

#X_raw = torch.tensor(X_raw, dtype=torch.float32)
#y_raw = torch.tensor(y_raw, dtype=torch.float32).reshape(-1, 1)

In [17]:
#
# 1.2 Split train and test sets
#

# data type change 
#y_raw = y_raw.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, stratify=y_raw, test_size=0.2, random_state=RANDOM_SEED)

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

one_hot = OneHotEncoder()

y_train_hot = one_hot.fit_transform(y_train.reshape(-1, 1)).todense()
y_test_hot = one_hot.transform(y_test.reshape(-1, 1)).todense()

# print # of X_train
print(y_train_hot.shape)
print(y_test_hot.shape)

(2904, 2)
(726, 2)


In [20]:
#
# 2. Learning Curve
#

max_attempts = [ 1, 5, 10, 20]
#max_iters = [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
max_iters = [400, 800, 1200, 1600, 2000]

results_column = ["max_attempts", "max_iters", "accuracy_train", "accuracy_test", "train_time"]
results_list = []
results_df = pd.DataFrame(columns=results_column)

for max_attempt in max_attempts:
    for max_iter in max_iters:
        
            
        kfold = KFold(n_splits=5, shuffle=True)
        cv_scores  = []
            
        start_time = time.perf_counter()
        nn_model = mlrose_hiive.NeuralNetwork(hidden_nodes = [2], 
                                                  activation = 'relu' ,
                                                  algorithm = 'simulated_annealing',
                                                  is_classifier = True,
                                                  early_stopping = True,
                                                  random_state = RANDOM_SEED,
                                                  max_attempts = max_attempt,
                                                  max_iters = max_iter)

        for train, validate in kfold.split(X_train_scaled, y_train_hot):
            nn_model.fit(X_train_scaled[train], y_train_hot[train])
            y_pred_hot = nn_model.predict(X_train_scaled[validate])
            accuracy = mt.accuracy_score(np.asarray(y_pred_hot), np.asarray(y_train_hot[validate]))
            cv_scores.append(accuracy)
            
        train_time = time.perf_counter() - start_time

        accuracy_train = np.mean(cv_scores)
    
        with torch.no_grad():
            y_pred_hot = nn_model.predict(X_test_scaled)
        accuracy_test = mt.accuracy_score(np.asarray(y_test_hot), np.asarray(y_pred_hot))
            
        results_df.loc[len(results_df.index)] = [max_attempt, max_iter, accuracy_train, accuracy_test, train_time]
        print(max_attempt, max_iter, accuracy_train, accuracy_test, train_time)

1 400 0.5196450827942312 0.5247933884297521 1.457298000000037
1 800 0.5196450827942312 0.5247933884297521 1.3727799579999669
1 1200 0.5196450827942312 0.5247933884297521 1.3781834579999668
1 1600 0.5196450827942312 0.5247933884297521 1.3706042079999747
1 2000 0.5196450827942312 0.5247933884297521 1.3696000840000124
5 400 0.6477304291055849 0.640495867768595 3.2573983339999586
5 800 0.6942156804558134 0.7107438016528925 6.495298208000008
5 1200 0.7575844263754525 0.7617079889807162 9.527335249999965
5 1600 0.7923479138227789 0.7892561983471075 12.430790624999986
5 2000 0.8102569885453142 0.7947658402203857 15.445627666000007
10 400 0.6546275743367558 0.640495867768595 3.2836145830000305
10 800 0.691456466259125 0.6983471074380165 6.355262666000044
10 1200 0.747589174431717 0.7396694214876033 9.351558000000068
10 1600 0.7906255564128435 0.7548209366391184 12.455930250000051
10 2000 0.8202409638554217 0.7920110192837465 16.01862887499999
20 400 0.6529028428986884 0.6473829201101928 3.7719

In [21]:
#print(results_df)
print("Best Test Accuracy: \n", results_df.loc[results_df['accuracy_test'].idxmax()])  

Best Test Accuracy: 
 max_attempts         5.000000
max_iters         2000.000000
accuracy_train       0.810257
accuracy_test        0.794766
train_time          15.445628
Name: 9, dtype: float64


In [22]:
results_df.to_csv('nn_simulated_annealing.csv')