# CS 7641 Machine Learning
# Assignment 2 Randomized Optimization

## Neural Network Weight Optimization : Genetic Algorithm

In [1]:
#
# 0. Import Packages
#

RANDOM_SEED = 27

# Math tools for ML
import numpy as np
import pandas as pd
import math
import time
import copy
from numpy import arange

# Randomized Optimization 
import mlrose_hiive

# Progress bar
#from tqdm import tqdm

# Graph visualization
import matplotlib.pyplot as plt

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

import torch
import torch.nn as nn
#import torch.optim as optim
#import tqdm

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold

# Model 
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.ensemble import AdaBoostClassifier # Boosted Decision Tree
from sklearn.svm import SVC # SVM
from sklearn.neighbors import KNeighborsClassifier # KNN

#from sklearn import tree

import sklearn.metrics as mt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.preprocessing import StandardScaler


In [2]:
#
# 1. Data Preparation
#

# Load the semicolon-separated student dataset and show its schema.
df = pd.read_csv("./data/dropout.csv", sep=";")
df.info()

# Inspect the target distribution: print the row count of each class.
num_droupout = df.query('Target=="Dropout"')
print(len(num_droupout))
num_enrolled = df.query('Target=="Enrolled"')
print(len(num_enrolled))
num_graduated = df.query('Target=="Graduate"')
print(len(num_graduated))

# Drop the "Enrolled" (still pending) rows so the task becomes binary.
df = df.drop(df[df['Target'] == "Enrolled"].index)

# Encode the remaining labels as integers.
# The result is assigned back explicitly instead of calling
# `df["Target"].replace(..., inplace=True)`: an in-place call on a column
# selection is deprecated in pandas 2.x and does not modify `df` under
# Copy-on-Write.
target_codes = {'Dropout': 0, 'Graduate': 1}
df["Target"] = df["Target"].map(target_codes)
if df["Target"].isna().any():
    # `map` yields NaN for any label missing from `target_codes`.
    raise ValueError("Unexpected label in 'Target'; expected only 'Dropout' or 'Graduate'.")
df["Target"] = df["Target"].astype(int)
print(df["Target"])

# Features are every column but the last; the label is the last column
# ("Target" is selected positionally here, as in the original layout).
X_raw = df.to_numpy()[:, :-1]
y_raw = df.to_numpy()[:, -1]

print("x_raw dimension: ", X_raw.shape)
print("y_raw dimension: ", y_raw.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance	                     4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Previous qualification (grade)                  4424 non-null   float64
 7   Nacionality                                     4424 non-null   int64  
 8   Mother's qualification                          4424 non-null   int64  
 9   Father's qualification                   

In [3]:
#
# 1.1 Tensor Data Type (Pytorch)
#

#X_raw = torch.tensor(X_raw, dtype=torch.float32)
#y_raw = torch.tensor(y_raw, dtype=torch.float32).reshape(-1, 1)

In [4]:
#
# 1.2 Split train and test sets
#

# Stratified 80/20 hold-out split, seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw, y_raw, stratify=y_raw, test_size=0.2, random_state=RANDOM_SEED
)

# Fit the scaler on the training split only, then reuse it on the test split
# so no test-set statistics leak into preprocessing.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One-hot encode the labels (encoder fitted on the training labels only).
# `.toarray()` returns a plain ndarray; `.todense()` would return the
# discouraged `np.matrix` type, which scikit-learn rejects as input.
one_hot = OneHotEncoder()
y_train_hot = one_hot.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test_hot = one_hot.transform(y_test.reshape(-1, 1)).toarray()

# Print the shapes of the encoded label arrays.
print(y_train_hot.shape)
print(y_test_hot.shape)

(2904, 2)
(726, 2)


In [5]:
#
# 1.3 Helper Function (Pytorch)
# 

In [6]:
# 
# 1.4 NN Class (mlrose-hiive)
#

In [7]:
#
# 2. Learning Curve
#

# Hyper-parameter grid for the genetic-algorithm weight search.
max_attempts = [1, 5, 10, 20]

# Reduced grid used for this run. The wider grid tried earlier was:
#   max_iters      = [200, 400, ..., 2000]
#   pop_sizes      = [100, 200, 300, 400, 500]
#   mutation_probs = [0.05, 0.1, 0.2, 0.3, 0.4]
max_iters = [400, 2000]
pop_sizes = [100, 200, 300]  # default: 200
mutation_probs = [0.05, 0.1, 0.2]  # default: 0.1

results_column = ["max_attempts", "max_iters", "pop_sizes", "mutation_probs", "accuracy_train", "accuracy_test", "train_time"]
# One row per configuration; the DataFrame is built once after the sweep
# instead of being grown row by row.
results_list = []

for mutation_prob in mutation_probs:
    for pop_size in pop_sizes:
        for max_attempt in max_attempts:
            for max_iter in max_iters:

                # Seed the shuffle so every configuration is scored on the
                # same folds and the sweep is reproducible across runs.
                kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
                cv_scores = []

                # The timer covers model construction plus all five fold fits.
                start_time = time.perf_counter()
                nn_model = mlrose_hiive.NeuralNetwork(hidden_nodes=[2],
                                                      activation='relu',
                                                      algorithm='genetic_alg',
                                                      is_classifier=True,
                                                      early_stopping=True,
                                                      random_state=RANDOM_SEED,
                                                      max_attempts=max_attempt,
                                                      max_iters=max_iter,
                                                      pop_size=pop_size,
                                                      mutation_prob=mutation_prob)

                for train, validate in kfold.split(X_train_scaled, y_train_hot):
                    nn_model.fit(X_train_scaled[train], y_train_hot[train])
                    y_pred_hot = nn_model.predict(X_train_scaled[validate])
                    # np.asarray guards against np.matrix inputs, which
                    # scikit-learn metrics do not accept.
                    accuracy = mt.accuracy_score(np.asarray(y_train_hot[validate]), np.asarray(y_pred_hot))
                    cv_scores.append(accuracy)

                train_time = time.perf_counter() - start_time

                # NOTE(review): despite the column name, "accuracy_train" is
                # the mean accuracy on the held-out validation folds.
                accuracy_train = np.mean(cv_scores)

                # NOTE(review): the test set is scored with the model left
                # over from the last CV fold (fitted on 4/5 of the training
                # data), not a model refitted on the full training set.
                y_pred_hot = nn_model.predict(X_test_scaled)
                accuracy_test = mt.accuracy_score(np.asarray(y_test_hot), np.asarray(y_pred_hot))

                results_list.append([max_attempt, max_iter, pop_size, mutation_prob, accuracy_train, accuracy_test, train_time])
                print(max_attempt, max_iter, pop_size, mutation_prob, accuracy_train, accuracy_test, train_time)

results_df = pd.DataFrame(results_list, columns=results_column)

1 400 100 0.05 0.39151641046946406 0.39118457300275483 1.360982416999999
1 2000 100 0.05 0.39187429521039824 0.39118457300275483 1.3904567500000002
5 400 100 0.05 0.39187429521039824 0.39118457300275483 7.353772875000001
5 2000 100 0.05 0.39152531307495997 0.39118457300275483 7.33661
10 400 100 0.05 0.391536589708588 0.39118457300275483 20.212600958000003
10 2000 100 0.05 0.39152650008902606 0.39118457300275483 17.301983999999997
20 400 100 0.05 0.3915312481452905 0.39118457300275483 68.382317541
20 2000 100 0.05 0.3915318416523236 0.39118457300275483 63.861331417
1 400 200 0.05 0.391535996201555 0.39118457300275483 2.993848583000016
1 2000 200 0.05 0.39152531307495997 0.39118457300275483 3.893297582999992
5 400 200 0.05 0.3915276871030922 0.39118457300275483 13.873726041999987
5 2000 200 0.05 0.39118523354501755 0.39118457300275483 15.62275704199999
10 400 200 0.05 0.3915318416523236 0.39118457300275483 46.613379958999985
10 2000 200 0.05 0.3915181909905633 0.39118457300275483 32.3695

In [8]:
# Show the sweep row with the highest hold-out (test) accuracy.
best_row_label = results_df['accuracy_test'].idxmax()
best_row = results_df.loc[best_row_label]
print("Best Test Accuracy: \n", best_row)

Best Test Accuracy: 
 max_attempts        1.000000
max_iters         400.000000
pop_sizes         200.000000
mutation_probs      0.200000
accuracy_train      0.392238
accuracy_test       0.393939
train_time          2.604213
Name: 56, dtype: float64


In [9]:
# Persist the full sweep table (index column included) next to the notebook.
results_csv_path = 'nn_genetic_algorithm.csv'
results_df.to_csv(results_csv_path)