# CSC 578D / Data Mining / Fall 2018 / University of Victoria
# Python Notebook explaining Assignment 02 / Problem 03

### Notes:
1. The dataset can be found __[here](http://www.apkc.net/data)__.

**Author:** Andreas P. Koenzen (akoenzen => uvic.ca)
<br>
**Version:** 0.1

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Download the exam-score table. The file has no header row, so the column
# names are supplied explicitly instead of being read from the first line.
data = pd.read_csv(
    'http://www.apkc.net/data/csc_578d/assignment02/ex3data1.txt',
    names=['Exam 1', 'Exam 2', 'Admitted'],
    header=None,
)
# Show the first rows to check that the table was parsed as expected.
data.head()

### Preprocessing functions:

In [None]:
def prepare(data):
    """Build a scaled design matrix and a {-1, +1} label column from ``data``.

    Every column except the last is treated as a feature and the last column
    as the label.

    Args:
        data: pandas DataFrame with at least one row. It is never modified.

    Returns:
        A tuple ``(x, y)``:
            x: float array of shape (rows, features + 1). Column 0 is a
               constant 1 (bias term); the remaining columns are the features
               min-max scaled to [0, 1] column by column.
            y: array of shape (rows, 1) holding the label column with every
               0 replaced by -1.
    """
    values = data.values

    # ``DataFrame.values`` may be a view of the frame's storage (or read-only),
    # so take explicit copies before scaling features / rewriting labels.
    x = np.array(values[:, :-1], dtype=float)
    y = values[:, -1:].copy()

    min_x = np.min(x, axis=0)
    span = np.max(x, axis=0) - min_x
    # A constant column has zero range; dividing by it would produce NaN.
    # Using 1 instead maps such a column to all zeros.
    span[span == 0] = 1
    x = (x - min_x) / span

    # Prepend the bias column of ones.
    x = np.insert(x, 0, 1, axis=1)

    # The loss used in this notebook expects labels in {-1, +1}.
    y[y == 0] = -1

    return x, y

# Build the design matrix and labels that the following cells plot and train on.
x, y = prepare(data)

### Plot the data:

In [None]:
def visualize(x, y, col_1=1, col_2=2):
    """Scatter-plot two columns of ``x``, coloured by the label in ``y``.

    Args:
        x: 2-D array of samples (one row per sample).
        y: labels aligned with the rows of ``x``; rows labelled 1 are drawn as
           'Accepted', rows labelled -1 as 'Rejected'.
        col_1: index of the column of ``x`` drawn on the horizontal axis; also
               used as the exam number in the axis label.
        col_2: index of the column of ``x`` drawn on the vertical axis; also
               used as the exam number in the axis label.
    """
    accepted = x[np.where(y == 1)[0]]
    rejected = x[np.where(y == -1)[0]]

    _, ax = plt.subplots(figsize=(8, 6))
    # Select exactly the requested columns so the plotted data always matches
    # the axis labels, regardless of how many columns ``x`` has.
    ax.scatter(accepted[:, col_1], accepted[:, col_2], s=50, c='b', marker='o', label='Accepted')
    ax.scatter(rejected[:, col_1], rejected[:, col_2], s=50, c='r', marker='x', label='Rejected')
    ax.legend()
    ax.set_xlabel("Exam {}".format(col_1), fontsize=14)
    ax.set_ylabel("Exam {}".format(col_2), fontsize=14)
    # Fixed limits with a small margin; assumes the plotted columns lie in [0, 1].
    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([-0.1, 1.1])

    plt.grid(True)
    plt.show()
    
# Plot the prepared samples with the default columns (Exam 1 vs. Exam 2).
visualize(x, y)

### Build the model and plot the error curve:

In [None]:
def error(x, y, w):
    """Return the per-sample logistic loss ``log(1 + exp(-y * (x @ w.T)))``.

    Args:
        x: array of samples, one row per sample.
        y: labels, broadcastable against ``x @ w.T``.
        w: weight array whose rows have one entry per column of ``x``.

    Returns:
        Array with the same shape as ``y * (x @ w.T)`` holding one loss value
        per entry.
    """
    # logaddexp(0, t) == log(exp(0) + exp(t)) == log(1 + exp(t)), but it does
    # not overflow to inf when t is large (badly misclassified samples).
    return np.logaddexp(0, -y * (x @ w.T))

def error_mean(x, y, w):
    """Return the mean of ``error(x, y, w)`` over the samples as a Python scalar.

    Args:
        x: array of samples, one row per sample; ``len(x)`` is the divisor.
        y: labels passed through to ``error``.
        w: weights passed through to ``error``.

    Returns:
        The summed per-sample error divided by the number of samples.

    Raises:
        ValueError: if the column-wise sum does not contain exactly one value
            (e.g. ``w`` holds more than one weight row).
    """
    total = np.sum(error(x, y, w), axis=0, keepdims=True)
    # np.asscalar was removed from NumPy; ndarray.item() is its documented
    # replacement and likewise converts a one-element array to a Python scalar.
    return ((1 / len(x)) * total).item()

def grad(x, y, w):
    """Return ``y * x / (1 + exp(y * (x @ w.T)))`` for every sample.

    This is the negated per-sample gradient, with respect to ``w``, of the
    logistic loss ``log(1 + exp(-y * (x @ w.T)))``.

    Args:
        x: array of samples, one row per sample.
        y: labels, broadcastable against the rows of ``x``.
        w: weight array whose rows have one entry per column of ``x``.

    Returns:
        Array shaped like ``y * x`` with one row per sample.
    """
    margin = y * (x @ w.T)
    weighted_samples = y * x
    return weighted_samples / (1 + np.exp(margin))

def grad_mean(x, y, w):
    """Return the sample-averaged gradient as a flat array.

    The per-sample values from ``grad`` are summed over the samples and scaled
    by ``-1 / len(x)``.

    Args:
        x: array of samples, one row per sample.
        y: labels passed through to ``grad``.
        w: weights passed through to ``grad``.

    Returns:
        1-D array with one entry per column of the ``grad`` result.
    """
    scale = -1 / len(x)
    column_sums = np.sum(grad(x, y, w), axis=0)
    return (scale * column_sums).ravel()

def fit(x, y, kappa, iterations):
    """Run fixed-step gradient descent starting from all-zero weights.

    Args:
        x: array of samples, one row per sample.
        y: labels aligned with the rows of ``x``.
        kappa: step size multiplying the mean gradient in every update.
        iterations: number of update steps to perform.

    Returns:
        A tuple ``(w, e)``:
            w: weight array of shape (1, number of columns of ``x``).
            e: list with the mean error measured *before* each update, so it
               has ``iterations`` entries and does not include the error of
               the final weights.
    """
    n_features = x.shape[1]
    weights = np.zeros((1, n_features))
    history = []

    for _ in range(iterations):
        # Record the error of the current weights, then take one step.
        history.append(error_mean(x, y, weights))
        step = kappa * grad_mean(x, y, weights)
        weights = weights - step

    return weights, history

# Train on the full dataset: step size 1, 1000 gradient-descent iterations.
w, e = fit(x, y, 1, 1000)
print("Weight vector: {}".format(w))
print()
print("Error curve:")
# `e` holds one mean-error value per iteration. Assigning the result to `_`
# keeps the notebook from echoing it (every expression is displayed because
# ast_node_interactivity is set to "all").
_ = plt.plot(e)
plt.show()

### Make predictions and plot:

In [None]:
def _plot_predictions(x, p, real_y):
    """Draw true labels (large squares) with predicted classes (small dots) on top."""
    pred_accepted = x[np.where(p > 0.5)[0]]
    pred_rejected = x[np.where(p <= 0.5)[0]]

    # labeled data
    positive = x[np.where(real_y == 1)[0]]
    negative = x[np.where(real_y == -1)[0]]

    _, ax = plt.subplots(figsize=(12, 8))
    # Columns 1 and 2 are plotted as Exam 1 / Exam 2. Each axis uses exactly
    # one column so the call also works when ``x`` has additional columns.
    ax.scatter(positive[:, 1], positive[:, 2], s=100, c='c', marker='s', label='Accepted')
    ax.scatter(negative[:, 1], negative[:, 2], s=100, c='y', marker='s', label='Rejected')
    ax.scatter(pred_accepted[:, 1], pred_accepted[:, 2], s=25, c='b', marker='o', label='Predicted Accepted')
    ax.scatter(pred_rejected[:, 1], pred_rejected[:, 2], s=25, c='r', marker='o', label='Predicted Rejected')
    ax.legend()
    ax.set_xlabel("Exam {}".format(1), fontsize=14)
    ax.set_ylabel("Exam {}".format(2), fontsize=14)
    ax.set_xlim([-0.1, 1.1])
    ax.set_ylim([-0.1, 1.1])

    plt.grid(True)
    plt.show()


def predict(x, w, real_y=None):
    """Return Pr(y = 1 | x) for every sample using the logistic function.

    The result holds probabilities, not class labels. A sample counts as
    predicted "accepted" when its probability is greater than 0.5 and as
    "rejected" otherwise; that threshold is applied only for plotting here.

    Args:
        x: array of samples, one row per sample.
        w: weight array whose rows have one entry per column of ``x``.
        real_y: optional NumPy array of true labels (1 / -1). When given, a
            scatter plot comparing true and predicted classes is shown.

    Returns:
        Array ``1 / (1 + exp(-(x @ w.T)))`` with one row per sample.
    """
    # exp() overflows to inf for strongly negative scores; 1 / (1 + inf) is
    # exactly 0.0, the correct limit, so the overflow warning is only noise.
    with np.errstate(over="ignore"):
        p = 1 / (1 + np.exp(-1 * (x @ w.T)))

    if isinstance(real_y, np.ndarray):
        print()
        print("Prediction plotting:")
        _plot_predictions(x, p, real_y)

    return p

def accuracy(y, y_pred):
    """Return the fraction of samples whose thresholded prediction equals ``y``.

    Args:
        y: true labels in {-1, +1}, one per sample (any shape, e.g. (n, 1)
           or (n,)).
        y_pred: predicted probabilities, one per sample; values greater than
            0.5 count as class 1, everything else as class -1.

    Returns:
        Number of matching labels divided by the number of samples.

    Raises:
        ValueError: if ``y`` and ``y_pred`` hold a different number of values.
        ZeroDivisionError: if ``y`` is empty.
    """
    # Flatten both sides so that an (n, 1) label column compared with an (n,)
    # prediction vector is matched element by element instead of being
    # broadcast to an (n, n) comparison.
    truth = np.asarray(y).ravel()
    labels = np.where(np.asarray(y_pred).ravel() > 0.5, 1, -1)

    if truth.size != labels.size:
        raise ValueError(
            "y and y_pred must contain the same number of values "
            "({} != {})".format(truth.size, labels.size)
        )

    return np.count_nonzero(truth == labels) / truth.size

# Score the model on the same data it was trained on. Passing real_y also
# makes predict() draw the true-vs-predicted scatter plot.
acc = accuracy(y, predict(x, w, real_y=y))
print("Prediction accuracy: {} ({}%)".format(acc, acc * 100))

### Validate on a held-out test set (holdout method):

In [None]:
def split_train_test(x, y, pct=80, seed=None):
    """Randomly split samples and labels into a training and a test part.

    Args:
        x: array of samples, one row per sample.
        y: labels aligned with the rows of ``x`` (1-D or 2-D).
        pct: percentage (0-100) of the rows assigned to the training part.
        seed: optional seed for a private random generator, making the split
            reproducible. With the default ``None`` the global NumPy random
            state is used.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)``.

    Raises:
        ValueError: if ``pct`` is outside [0, 100] or ``x`` and ``y`` have a
            different number of rows.
    """
    n = x.shape[0]

    if not 0 <= pct <= 100:
        raise ValueError("pct must be between 0 and 100, got {}".format(pct))
    if len(y) != n:
        raise ValueError(
            "x and y must have the same number of rows ({} != {})".format(n, len(y))
        )

    s = round(n * pct / 100)

    # Only create a dedicated generator when a seed is requested, so callers
    # that seed the global NumPy state keep getting the same splits as before.
    rng = np.random if seed is None else np.random.RandomState(seed)
    indices = rng.permutation(n)
    train_idx, test_idx = indices[:s], indices[s:]

    # Indexing on the first axis only keeps 1-D label vectors working too.
    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    return x_train, y_train, x_test, y_test
    
# Randomly keep 80% of the rows for training and hold out the rest for testing.
x_train, y_train, x_test, y_test = split_train_test(x, y, pct=80)

# Retrain on the training part only. This rebinds `w` and `e`, replacing the
# values obtained from the earlier fit on the full dataset.
w, e = fit(x_train, y_train, 1, 1000)
print("Weight vector: {}".format(w))
print()
print("Error curve:")
_ = plt.plot(e)
plt.show()

# Score on the held-out rows only; real_y also triggers the comparison plot.
acc = accuracy(y_test, predict(x_test, w, real_y=y_test))
print("Prediction Accuracy: {} ({}%)".format(acc, acc * 100))

***
# END