## CS 7641 Machine Learning
## Assignment 3 Unsupervised Learning and Dimensionality Reduction
#### Experiment: Step 5 Neural Network & Clustering
#### Algorithms: KMeans, EM (GaussianMixture)
#### Data      : Dropout (Step4)

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import copy
import math

from scipy.stats import kurtosis 

from sklearn.preprocessing import StandardScaler

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import FastICA, PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import SparseRandomProjection


from yellowbrick.cluster import SilhouetteVisualizer
from yellowbrick.cluster import InterclusterDistance

import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
#from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Predict Students' Dropout and Academic Success

In [6]:
##
## Data Load
##

df = pd.read_csv("./data/dropout.csv", sep=";")

# Inspect the target: per-class subsets of the raw data (handy for checking
# the class balance before any rows are removed).
num_droupout = df.query('Target=="Dropout"')
num_enrolled = df.query('Target=="Enrolled"')
num_graduated = df.query('Target=="Graduate"')

# Drop the "Enrolled" (still pending) rows so the problem becomes binary.
df = df.drop(df[df['Target'] == "Enrolled"].index)

# Encode the labels as 0/1.  The result is assigned back to the column rather
# than calling df["Target"].replace(..., inplace=True): that chained in-place
# call operates on a temporary Series and leaves `df` untouched under pandas
# Copy-on-Write, and the FutureWarning announcing it is silenced by the
# warnings filter set up with the imports.
df["Target"] = df["Target"].map({'Dropout': 0, 'Graduate': 1})
if df["Target"].isna().any():
    # Fail loudly instead of letting an unmapped label leak into the arrays.
    raise ValueError("Unexpected label found in the 'Target' column")
df["Target"] = df["Target"].astype("int64")

# Every column except the last is a feature; the last column is the label.
# Assumes 'Target' is the last column of the CSV -- TODO confirm.
# The explicit dtype guarantees a numeric array (and raises if a
# non-numeric column slipped through).
X_raw = df.iloc[:, :-1].to_numpy(dtype=np.float64)
y_raw = df.iloc[:, -1].to_numpy(dtype=np.float64)

In [7]:
# Helper function to train one model
def model_train(model, X_train, y_train, X_val, y_val, n_epochs=100, lr=0.001,
                batch_size=10, show_progress=False):
    """Train a binary classifier with Adam and BCE loss, keeping the best epoch.

    After every epoch the model is scored on the validation data; the weights
    of the epoch with the highest validation accuracy are restored into
    ``model`` before returning.

    Args:
        model: ``nn.Module`` whose forward pass returns probabilities in
            [0, 1] (``nn.BCELoss`` is applied directly to its output).
        X_train, y_train: training inputs and targets, sliced along dim 0
            into mini-batches.
        X_val, y_val: validation inputs and targets used for model selection.
        n_epochs: number of passes over the training data.
        lr: Adam learning rate.
        batch_size: number of samples per mini-batch (must be >= 1).
        show_progress: if True, show a tqdm progress bar per epoch with the
            running batch loss and accuracy.

    Returns:
        The best validation accuracy as a float, or ``-inf`` when
        ``n_epochs`` is 0 (the model is then left untouched).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    # loss function and optimizer
    loss_fn = nn.BCELoss()  # binary cross entropy
    optimizer = optim.Adam(model.parameters(), lr=lr)

    batch_starts = range(0, len(X_train), batch_size)

    # Hold the best model
    best_acc = -np.inf   # init to negative infinity
    best_weights = None

    for epoch in range(n_epochs):
        model.train()
        bar = None
        if show_progress:
            bar = tqdm.tqdm(batch_starts, unit="batch", mininterval=0)
            bar.set_description(f"Epoch {epoch}")
        try:
            for start in (batch_starts if bar is None else bar):
                # take a batch
                X_batch = X_train[start:start + batch_size]
                y_batch = y_train[start:start + batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # Per-batch metrics are only needed for the progress display.
                if bar is not None:
                    batch_acc = (y_pred.round() == y_batch).float().mean()
                    bar.set_postfix(loss=float(loss), acc=float(batch_acc))
        finally:
            if bar is not None:
                bar.close()

        # evaluate accuracy at end of each epoch; no gradients are needed
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val)
        acc = float((val_pred.round() == y_val).float().mean())
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())

    # restore the best weights (none were recorded if no epoch ran)
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return best_acc

## Neural Network + KMeans

In [10]:
##
## KMeans
##

# Width of the randomly projected feature space.
# NOTE(review): despite its name this is not a cluster count -- KMeans below
# is fitted with 2 clusters.
num_clusters = 9

# Reduce the raw features with a seeded sparse random projection.
X_raw_rp = SparseRandomProjection(
    n_components=num_clusters,
    random_state=0,
).fit_transform(X_raw)

print(X_raw.shape)
print(X_raw_rp.shape)

# Cluster the projected data and read back each sample's cluster id.
model = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(X_raw_rp)
y_pred = model.predict(X_raw_rp)

# NOTE(review): the network targets built here are the KMeans cluster ids,
# not the true labels in y_raw -- confirm this is the intended experiment.
X_raw_tensor = torch.tensor(X_raw_rp, dtype=torch.float32)
y_raw_tensor = torch.tensor(y_pred, dtype=torch.float32).unsqueeze(1)

# Stratified 80/20 hold-out split on the cluster ids.
X_train, X_test, y_train, y_test = train_test_split(
    X_raw_tensor,
    y_raw_tensor,
    test_size=0.2,
    stratify=y_raw_tensor,
    random_state=42,
)

(3630, 36)
(3630, 9)


In [13]:
## 
## Neural Network + Kmeans
##

class Model(nn.Module):
    """Feed-forward binary classifier with two equally wide ReLU hidden layers.

    The output passes through a sigmoid, so the forward pass returns one
    probability per sample.

    Args:
        x: number of units in each of the two hidden layers.
        n_inputs: number of input features. Defaults to 9, the input width
            this class was originally hard-coded to.
    """

    def __init__(self, x, n_inputs=9):
        super().__init__()
        self.layer1 = nn.Linear(n_inputs, x)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(x, x)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(x, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (..., 1) for inputs of shape (..., n_inputs)."""
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.sigmoid(self.output(x))
        return x

In [12]:
# Seed both the fold shuffling and the PyTorch RNG (weight initialisation) so
# the reported scores can be reproduced on a fresh kernel.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
torch.manual_seed(42)
cv_scores  = []

# 5-fold cross-validation: train a fresh network on each fold and record the
# accuracy returned by model_train for the held-out part.
start_time = time.perf_counter()
for train, validate in kfold.split(X_train, y_train):
    model = Model(36)
    acc = model_train(model, X_train[train], y_train[train], X_train[validate], y_train[validate], 100, 0.001)
    cv_scores.append(acc)
training_time = time.perf_counter() - start_time
print("Training Time: ", training_time)

acc_mean = np.mean(cv_scores)

# NOTE(review): `model` is the network trained on the last fold only; the test
# accuracy below therefore describes that single model, not an ensemble or a
# model refit on the whole training set.
with torch.no_grad():
    y_pred = model(X_test)

# Round the sigmoid outputs to 0/1 before comparing with the held-out targets.
acc_test = metrics.accuracy_score(y_test.numpy(), np.rint(y_pred.numpy()))

print("Cross Validation Score: " + str(acc_mean))
print("Test Accuracy: " + str(acc_test))

Training Time:  56.58988358301576
Cross Validation Score: 0.9314623951911927
Test Accuracy: 0.90633608815427


## Neural Network + EM

In [15]:
##
## EM
##

# Width of the randomly projected feature space.
# NOTE(review): despite its name this is not a cluster count -- the Gaussian
# mixture below is fitted with 2 components.
num_clusters = 9

model = SparseRandomProjection(n_components=num_clusters, random_state=0)
X_raw_rp = model.fit_transform(X_raw)

print(X_raw.shape)
print(X_raw_rp.shape)

# Seed the mixture: its component assignments become the training targets
# below, so an unseeded fit would change the whole experiment between runs.
model = GaussianMixture(n_components=2, random_state=0)
model.fit(X_raw_rp)
y_pred = model.predict(X_raw_rp)

# NOTE(review): the network targets built here are the mixture component ids,
# not the true labels in y_raw -- confirm this is the intended experiment.
X_raw_tensor = torch.tensor(X_raw_rp, dtype=torch.float32)
y_raw_tensor = torch.tensor(y_pred, dtype=torch.float32).reshape(-1, 1)

# Stratified 80/20 hold-out split on the component ids.
X_train, X_test, y_train, y_test = train_test_split(X_raw_tensor, y_raw_tensor, stratify=y_raw_tensor, test_size=0.2, random_state=42)

(3630, 36)
(3630, 9)


In [16]:
## 
## Neural Network + EM
##

# NOTE(review): this re-declares the Model class already defined earlier in
# the notebook; once this cell runs, this definition shadows the earlier one.
class Model(nn.Module):
    """Feed-forward binary classifier with two equally wide ReLU hidden layers.

    The output passes through a sigmoid, so the forward pass returns one
    probability per sample.

    Args:
        x: number of units in each of the two hidden layers.
        n_inputs: number of input features. Defaults to 9, the input width
            this class was originally hard-coded to.
    """

    def __init__(self, x, n_inputs=9):
        super().__init__()
        self.layer1 = nn.Linear(n_inputs, x)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(x, x)
        self.act2 = nn.ReLU()
        self.output = nn.Linear(x, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (..., 1) for inputs of shape (..., n_inputs)."""
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.sigmoid(self.output(x))
        return x

In [17]:
# Seed both the fold shuffling and the PyTorch RNG (weight initialisation) so
# the reported scores can be reproduced on a fresh kernel.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
torch.manual_seed(42)
cv_scores  = []

# 5-fold cross-validation: train a fresh network on each fold and record the
# accuracy returned by model_train for the held-out part.
start_time = time.perf_counter()
for train, validate in kfold.split(X_train, y_train):
    model = Model(36)
    acc = model_train(model, X_train[train], y_train[train], X_train[validate], y_train[validate], 100, 0.001)
    cv_scores.append(acc)
training_time = time.perf_counter() - start_time
print("Training Time: ", training_time)

acc_mean = np.mean(cv_scores)

# NOTE(review): `model` is the network trained on the last fold only; the test
# accuracy below therefore describes that single model, not an ensemble or a
# model refit on the whole training set.
with torch.no_grad():
    y_pred = model(X_test)

# Round the sigmoid outputs to 0/1 before comparing with the held-out targets.
acc_test = metrics.accuracy_score(y_test.numpy(), np.rint(y_pred.numpy()))

print("Cross Validation Score: " + str(acc_mean))
print("Test Accuracy: " + str(acc_test))

Training Time:  59.30186904093716
Cross Validation Score: 0.9721093297004699
Test Accuracy: 0.9614325068870524
