In [1]:
# import some library we are gonna use
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn

In [2]:
# Read the data from the iris.data file and show the first five item in the dataset
# the iris dataset have four features: sepal_len, sepal_wid, petal_len and petal_wid
# the last column is class which is the thing we want
df = pd.read_csv('data/glass_ident/glass.data')
df.head()

Unnamed: 0,id,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,class
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
# We can do slicing in pandas DataFrame using the iloc method as in NumPy Array
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

In [4]:
# Transform the DataFrame to numpy array
X, y = np.array(X), np.array(y)
for idx, class_name in enumerate(sorted(list(set(y)))):
    y[y == class_name] = idx

In [5]:
def get_classifier(X_train, y_train, num_epoch=1000, alpha=0.01):
    theta = np.zeros((X_train.shape[1]))
    for epoch in range(num_epoch):
        # forward pass
        logits = np.dot(X_train, theta)
        h = 1 / (1 + np.exp(-logits))
        cross_entropy_loss = (-y_train * np.log(h) - (1 - y_train) * np.log(1 - h)).mean()
    
        # backward pass
        gradient = np.dot((h - y_train), X_train) / y.size
        theta = theta - alpha * gradient
    return theta

In [6]:
def multi_classifier(X_train, y_train):
    num_class = np.unique(y_train)
    param = np.zeros((len(num_class), X_train.shape[1]))
    
    for i in num_class:
        num_class = np.unique(y_train)
        label_t = (y_train == num_class[i]).astype(np.float)
        param[i, :] = get_classifier(X_train, label_t)
    
    return param

In [7]:
from sklearn.model_selection import train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

f_mean, f_std = np.mean(X_train, axis=0), np.std(X_train, axis=0)
X_train = (X_train - f_mean) / f_std
X_test = (X_test - f_mean) / f_std

X_train = np.concatenate((np.ones((X_train.shape[0], 1)), X_train), axis=1)
X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1)

In [57]:
params = multi_classifier(X_train, y_train)

In [58]:
def pred(param, X_test, y_test):
    f_size = X_test.shape
    l_size = y_test.shape
    assert (f_size[0] == l_size[0])
    
    logits = np.dot(X_test, np.transpose(param)).squeeze()
    prob = 1 / (1 + np.exp(-logits))
    
    pred = np.argmax(prob, axis=1)
    
    accuracy = np.sum(pred == y_test) / l_size[0] * 100
    
    return prob, pred, accuracy

In [59]:
_, preds, accu = pred(params, X_test, y_test)
print("Prediction: {}\n".format(preds))
print("Accuracy: {:.3f}%".format(accu))

Prediction: [5 0 1 0 0 0 1 0 3 0 0 5 1 0 5 5 0 0 0 1 0 0 3 1 1 0 0 3 4 0 0 1 5 0 1 0 0
 0 0 0 0 5 0 5 0 0 0 1 5 0 0 1 0 1 0 0 5 0 0 1 0 0 0 0 3]

Accuracy: 52.308%


In [43]:
param = get_classifier(X_train, (y_train == 2).astype(np.float), )

In [44]:
logits = np.dot(X_test, np.transpose(param))
prob = 1 / (1 + np.exp(-logits))

In [45]:
prob

array([0.1679436 , 0.16624701, 0.18288532, 0.15027829, 0.16925888,
       0.12982249, 0.11428009, 0.20250414, 0.2515051 , 0.09782803,
       0.18143907, 0.19644584, 0.20779502, 0.18027823, 0.15339476,
       0.24978949, 0.09537728, 0.08160454, 0.17151049, 0.17213498,
       0.19534846, 0.17503042, 0.13719427, 0.13395506, 0.18582037,
       0.17633815, 0.17846326, 0.18202715, 0.19403391, 0.10284758,
       0.19426269, 0.16230607, 0.1782496 , 0.13020355, 0.23422052,
       0.24677979, 0.14243671, 0.1852543 , 0.18076227, 0.17242956,
       0.22841532, 0.21256977, 0.28534368, 0.18081499, 0.20989615,
       0.19124231, 0.13109459, 0.25803406, 0.19188068, 0.22119288,
       0.1731835 , 0.12094036, 0.12711064, 0.22349197, 0.17481338,
       0.14994777, 0.11132037, 0.22935117, 0.20430271, 0.18555453,
       0.23407659, 0.16692783, 0.19468248, 0.17099886, 0.22733705])

In [46]:
((y_test == 2).astype(np.float) == (prob > 0.5)).mean()

0.8923076923076924