In [2]:
import pandas as pd
import numpy as np
import math
import operator

In [3]:
df = pd.io.parsers.read_csv(
    filepath_or_buffer='digits.csv',
    header=None,
    sep=',',
    )

data_group = df.groupby(df.iloc[:,-1])
df.insert(0, "-1", np.ones((df.shape[0],), dtype=int), True) 
original_df = pd.io.parsers.read_csv(
    filepath_or_buffer='digits.csv',
    header=None,
    sep=',',
    )





In [4]:
#Logistic methods

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def get_cost(theta, x, y):
    sigm = sigmoid(x @ theta)
    cost = 1 / len(y) * np.sum(-y * np.log(sigm) - (1 - y) * np.log(1 - sigm))
    return cost
def get_gradient(theta, x , y):
    return 1 / len(y) * ((y - sigmoid(x @ theta)) @ x)
def fit(x, y, iteration_count=500, learn_rate=0.1):
    thetas = []
    costs = np.zeros(iteration_count)
    for target_class in classes:
        theta = np.zeros(x.shape[1])
        target_class_val = np.where(y == target_class, 1, 0)
        for iteration_n in range(iteration_count):
            costs[iteration_n]= get_cost(theta, x, target_class_val)
            gradient = get_gradient(theta, x, target_class_val)
            theta += learn_rate * gradient
            
        thetas.append(theta)
    return thetas, costs

def predict(classes, thetas, x):
    prediction = []
    for index, xi in x.iterrows():
        pred_row = []
        for theta in thetas:
            sig = sigmoid(xi @ theta)
            pred_row.append(sig)
        prediction.append(np.argmax(pred_row))
    
    return [classes[p] for p in prediction]
            

In [13]:
#Naive methods

def std(numbers):
    avg = np.mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return math.sqrt(variance)
def summarize_dataset(dataframe):
    summaries = [(np.mean(dataframe[column]), std(dataframe[column]), dataframe[column].shape[0]) for column in dataframe]
    del(summaries[-1])
    return summaries

def pdf(x, mean, stdev):
    try:
        exponent = math.exp(-((x-mean)**2 / (2 * stdev**2 )))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
    except ZeroDivisionError:
        return 1
    
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
#         print(class_summaries)
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= pdf(row[i], mean, stdev)
    return probabilities

def do_naive_bayes(dataset):
    data_group = dataset.groupby(original_df.iloc[:,-1])
    summary = summarize_dataset(dataset)
    summaries_by_class = {}

    for key in data_group.groups.keys():
        dataframe = data_group.get_group(key)
        summaries_by_class[key] = summarize_dataset(dataframe)
        
        
    error_count = 0

    for i,rows in dataset.iterrows():
#         print(rows)
        class_val = original_df.iloc[i][64]
        probabilities = calculate_class_probabilities(summaries_by_class, rows)
        largest_probable_class = max(probabilities.items(), key=operator.itemgetter(1))[0]
        error_count = error_count + ( 0 if class_val == largest_probable_class else 1)
        
    return error_count / float(dataset.shape[0])

In [14]:
# Logistic 80-20
# train_data_count = int(.8 * len(df)) 
# train_test_data = df
# train_data = train_test_data.iloc[0: train_data_count]
# test_data = train_test_data.drop(train_test_data.index[0: train_data_count])
# test_classes = test_data.iloc[:, -1]
# train_classes = train_data.iloc[:, -1]
# train_data.shape, test_data.shape

# classes = np.unique(train_classes)
# thetas, costs = fit(train_data, train_classes)

# train_predicted_value = predict(classes, thetas, train_data)
# train_error_count = sum(train_predicted_value != train_classes)
# print("Train error {0:.3f}%".format(train_error_count/len(train_predicted_value)*100))
# test_predicted_value = predict(classes, thetas, test_data)
# test_error_count = sum(test_predicted_value != test_classes)
# print("Test error {0:.3f}%".format(test_error_count/len(test_predicted_value)*100))


In [15]:
# Naive bayes 80-20

num_train = int(.8 * len(original_df))  # 80/20 train/test split
train_test_data = original_df
train_data = train_test_data.iloc[0: num_train]
test_data = train_test_data.drop(train_test_data.index[0: num_train])

test_error = do_naive_bayes(test_data)
# train_error = do_naive_bayes(train_data)

print("Test error {0:.3f}".format(test_error*100))
# print("Train error {0:.3f}".format(train_error*100))



  
  


Test error 7.778


In [9]:
#10 fold cross validation
fold = 10
ratio = 1. / fold
index_at = 0
length = df.shape[0]
test_data_size = math.ceil(length * ratio)

error_array = []

for k in range(fold):
    print("Iteration ", k)
    train_test_data = original_df
    train_data = {}
    test_data = {}

    test_data = train_test_data.iloc[index_at: index_at + test_data_size]
    train_data = train_test_data.drop(df.index[index_at: index_at + test_data_size])
    test_classes = test_data.iloc[:, -1]
    train_classes = train_data.iloc[:, -1]
    
    classes = np.unique(train_classes)
    thetas, costs = fit(train_data, train_classes)
    
    train_predicted_value = predict(classes, thetas, train_data)
    train_error_count = sum(train_predicted_value != train_classes)
    print("Logistic Train error {0:.3f}%".format(train_error_count/len(train_predicted_value)*100))
    test_predicted_value = predict(classes, thetas, test_data)
    test_error_count = sum(test_predicted_value != test_classes)
    print("Logistic Test error {0:.3f}%".format(test_error_count/len(test_predicted_value)*100))
    
    logistic_train_error =  "{0:.3f}".format(train_error_count/len(train_predicted_value)*100)
    logistic_test_error =  "{0:.3f}".format(test_error_count/len(test_predicted_value)*100)
    
    
    
    #naive bayes
    
    bayes_test_error = "{0:.3f}".format(do_naive_bayes(test_data)*100)
    bayes_train_error = "{0:.3f}".format(do_naive_bayes(train_data)*100)
    
    
    print("Naive Bayes Test error ", bayes_test_error)
    print("Naive Bayes Train error ", bayes_train_error)
    
    
    
    
    fold_error_array = [logistic_train_error,logistic_test_error,bayes_test_error,bayes_train_error ]

    error_array.append(fold_error_array)
    
    
    index_at = index_at + test_data_size

    

Iteration  0
Logistic Train error 2.226%
Logistic Test error 6.111%


  
  


Naive Bayes Test error  2.222
Naive Bayes Train error  7.978
Iteration  1
Logistic Train error 0.804%
Logistic Test error 5.556%
Naive Bayes Test error  0.556
Naive Bayes Train error  9.524
Iteration  2
Logistic Train error 1.237%
Logistic Test error 11.111%
Naive Bayes Test error  4.444
Naive Bayes Train error  8.967
Iteration  3
Logistic Train error 0.866%
Logistic Test error 2.778%
Naive Bayes Test error  1.667
Naive Bayes Train error  8.967
Iteration  4
Logistic Train error 0.557%
Logistic Test error 3.889%
Naive Bayes Test error  0.556
Naive Bayes Train error  9.029
Iteration  5
Logistic Train error 0.680%
Logistic Test error 1.667%
Naive Bayes Test error  1.667
Naive Bayes Train error  9.338
Iteration  6
Logistic Train error 0.742%
Logistic Test error 2.222%
Naive Bayes Test error  1.667
Naive Bayes Train error  9.400
Iteration  7
Logistic Train error 0.618%
Logistic Test error 3.333%
Naive Bayes Test error  0.556
Naive Bayes Train error  9.091
Iteration  8
Logistic Train error 0

In [10]:
error_df = pd.DataFrame(error_array, columns = ["Logistic Train error(%)", "Logistic Test error(%)", "Naive Bayes Train error(%)", "Naive Bayes Test error(%)"], index=list(range(1,11)))
error_df

Unnamed: 0,Logistic Train error(%),Logistic Test error(%),Naive Bayes Train error(%),Naive Bayes Test error(%)
1,2.226,6.111,2.222,7.978
2,0.804,5.556,0.556,9.524
3,1.237,11.111,4.444,8.967
4,0.866,2.778,1.667,8.967
5,0.557,3.889,0.556,9.029
6,0.68,1.667,1.667,9.338
7,0.742,2.222,1.667,9.4
8,0.618,3.333,0.556,9.091
9,0.495,10.0,5.0,7.978
10,0.679,5.085,2.26,8.272
