# Assignment 2: Classification
Goal: compare the Naive Bayes (NB) model to Logistic Regression (LR) on data with categorical attributes. 
- train/fit an NB and an LR model on the [Car Evaluation data set](https://archive.ics.uci.edu/ml/datasets/Car+Evaluation) 
- compare their performance by looking at how well they predict in terms of 0/1-loss (misclassification rate, error percentage) 

In [29]:
import numpy as np
import pandas as pd
import pickle

import numpy as np
import pandas as pd
import random
from tqdm import tqdm

### sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import LogisticRegression

### plotting
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
%config InlineBackend.figure_format ='retina'

## Data

In [30]:
file = '../data/car.data.txt'
mydata = []
# Parse the comma-separated data file into a list of rows. The context
# manager guarantees the file handle is closed, even if parsing raises.
with open(file, 'r') as myfile:
    for line in myfile:
        line = line.strip()
        if not line:
            # A blank line (e.g. trailing newline at EOF) has no 7th field
            # and would otherwise raise IndexError below.
            continue
        row = line.split(',')
        # Binarise the 7th field: 'unacc' becomes 'Negative', every other
        # value becomes 'Positive'.
        row[6] = 'Negative' if row[6] == 'unacc' else 'Positive'
        mydata.append(row)

In [31]:
# Column labels for the parsed rows; the last one holds the binarised label.
attr = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'outcome',
]
df = pd.DataFrame(data=mydata, columns=attr)

In [32]:
# Print each column name followed by the distinct values it contains.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(column_name, distinct_values)

buying ['vhigh' 'high' 'med' 'low']
maint ['vhigh' 'high' 'med' 'low']
doors ['2' '3' '4' '5more']
persons ['2' '4' 'more']
lug_boot ['small' 'med' 'big']
safety ['low' 'med' 'high']
outcome ['Negative' 'Positive']


In [33]:
# Half of the rows go to training, the other half to testing.
n = df.shape[0] // 2
# Every column except the last one is a feature.
q = len(df.columns) - 1

In [34]:
## results dataframe
# One column per model variant: Naive Bayes with alpha 1, 0.1 and 10,
# then logistic regression on all features, on the first two features,
# and on all features with an l2 penalty.
models = ['NB_1', 'NB_0.1', 'NB_10', 'LG_full', 'LG_2', 'LG_full_l2']
# One row per training-set size 1..n, initialised to zero so per-run
# results can be summed into it.
results = pd.DataFrame(data=0, columns=models, index=np.arange(1, n + 1))

## Naive Bayes maximum likelihood estimation

In [35]:
def train_NB(X_train, X_test, y_train, y_test, a):
    """Fit a categorical Naive Bayes model and return its test 0/1 loss.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used to score the predictions.
    a
        Value forwarded to ``CategoricalNB`` as its ``alpha`` argument.

    Returns
    -------
    The fraction of test samples whose prediction differs from ``y_test``.
    """
    nb = CategoricalNB(alpha=a)
    y_pred = nb.fit(X_train, y_train).predict(X_test)
    # Normalise by the actual number of test samples instead of the
    # notebook-level ``n`` so the rate is correct for any test-set size.
    loss = (y_test != y_pred).sum() / len(y_test)
    return loss

## Logistic regression conditional likelihood maximization


In [36]:
def train_LG(X_train, X_test, y_train, y_test, col = q, p='none'):
    """Fit a logistic regression model and return its test 0/1 loss.

    Parameters
    ----------
    X_train, y_train
        Training features (2-D array, sliced by column) and labels.
    X_test, y_test
        Held-out features and labels used to score the predictions.
    col
        Number of leading feature columns to use; defaults to the value
        of the notebook-level ``q`` at definition time.
    p
        Value forwarded to ``LogisticRegression`` as ``penalty``. The
        default ``'none'`` requests an unregularised fit.
        NOTE(review): recent scikit-learn releases spell this ``None``
        rather than ``'none'`` -- confirm against the installed version.

    Returns
    -------
    The fraction of test samples whose prediction differs from ``y_test``.
    """
    # Restrict both splits to the first ``col`` feature columns.
    X_train = X_train[:, 0:col]
    X_test = X_test[:, 0:col]
    # Regularisation is controlled by ``p``; fixed seed for reproducibility.
    lg = LogisticRegression(random_state=2019, penalty=p)
    y_pred = lg.fit(X_train, y_train).predict(X_test)
    # Normalise by the actual number of test samples instead of the
    # notebook-level ``n`` so the rate is correct for any test-set size.
    loss = (y_test != y_pred).sum() / len(y_test)
    return loss

## Training

In [37]:
def train_once():
    """Score every model on growing prefixes of the current training split.

    Reads the notebook-level ``X_train_cat``, ``X_test_cat``, ``y_train``,
    ``y_test``, ``n``, ``q`` and ``models``. For each training size
    ``1..n`` the first ``size`` training rows are used to fit each model,
    and the resulting test loss is stored in the row labelled ``size``.

    Returns
    -------
    A DataFrame indexed ``1..n`` with one column per entry of ``models``.
    Cells whose fit or prediction failed are left as NaN.
    """
    results = pd.DataFrame(np.nan, index=np.arange(1, n + 1), columns=models)

    # (trainer, extra positional arguments), listed in the same order as
    # the columns in ``models``.
    configs = [
        (train_NB, (1,)),
        (train_NB, (0.1,)),
        (train_NB, (10,)),
        (train_LG, (q,)),
        (train_LG, (2,)),
        (train_LG, (q, 'l2')),
    ]

    for i in range(n):
        X_tr = X_train_cat[:i + 1]
        y_tr = y_train[:i + 1]
        for col, (trainer, extra) in enumerate(configs):
            try:
                loss = trainer(X_tr, X_test_cat, y_tr, y_test, *extra)
            except Exception:
                # Best effort: a model that cannot be fitted or scored on
                # this prefix keeps its NaN. Unlike a bare ``except`` this
                # still lets KeyboardInterrupt stop a long run.
                continue
            # ``iloc`` is zero-based, so training size i+1 (row label i+1)
            # lives at position i. Writing to position i+1 shifted every
            # score down one row and silently dropped the last one.
            results.iloc[i, col] = loss
    return results

In [None]:
M = 20
enc = OrdinalEncoder()
# Start from a fresh zero accumulator so that re-running this cell does not
# add new runs on top of an already averaged table.
results = pd.DataFrame(0.0, index=np.arange(1, n + 1), columns=models)
for i in tqdm(range(M)):
    # Seed each repetition with its own index. A constant seed reproduces
    # the same split every time, making the average one run repeated M times.
    X_train, X_test, y_train, y_test = train_test_split(
        df[attr[0:6]], df.outcome, train_size=0.5, random_state=i)
    # Encode the categorical strings as ordinal codes; the encoder is
    # fitted on the training half and reused for the test half.
    X_train_cat = enc.fit_transform(X_train)
    X_test_cat = enc.transform(X_test)
    temp = train_once()
    results = results + temp
# Turn the per-cell sums into the mean over the M repetitions.
results = results / M

 20%|██        | 4/20 [01:33<06:15, 23.48s/it]

In [None]:
RESULTS = "results.pkl"

# Persist the results table so the plot can be redrawn without retraining.
with open(RESULTS, mode="wb") as out_handle:
    pickle.dump(results, out_handle)

In [27]:
RESULTS = "results.pkl"

# Reload the previously saved results table.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open(RESULTS, mode="rb") as in_handle:
    results = pickle.load(in_handle)

In [None]:
# Draw one error curve per model against the training-set size.
f, ax = plt.subplots(figsize=(16, 8))
# The row labels run from 1 to n, so use a logarithmic x axis.
ax.set(xscale="log")
sns.lineplot(data=results, ax=ax)
plt.xlabel("Training Data Size")  # fixed typo: was "Traing"
plt.ylabel("1/0 Error")
plt.title("Average 1/0 Error for models {:s} over {:d} runs. ".format(str(models), M))
