In [69]:
# Part 1

import numpy as np

def load_data(csv_filename, delimiter=';'):
    """Read a delimited text file of numeric rows into a NumPy array.

    The first line is treated as a header and skipped. Every remaining
    non-blank line is split on ``delimiter``; the last field of each row
    (the quality column in the wine files) is dropped and the remaining
    fields are converted to ``float``.

    Parameters
    ----------
    csv_filename : str
        Path of the file to read.
    delimiter : str, optional
        Field separator. Defaults to ``';'``.

    Returns
    -------
    numpy.ndarray
        One row per data line, one column per kept field. A file with no
        data rows yields an empty array of shape ``(0,)``.

    Raises
    ------
    ValueError
        If a kept field cannot be converted to ``float``.
    """
    wine_data = []
    with open(csv_filename, 'r') as file:  # the context manager closes the file
        next(file, None)  # skip the header row; the default avoids StopIteration on an empty file
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. trailing newlines) would otherwise crash on float('').
                continue
            # Drop the last column *before* converting, so it does not have to be numeric.
            feature_fields = line.split(delimiter)[:-1]
            wine_data.append([float(value.strip()) for value in feature_fields])
    return np.array(wine_data)


In [70]:
# Load both datasets as arrays of feature columns (load_data drops the final column of every row).
# NOTE(review): the shapes printed further down are identical for both files, (1599, 11) --
# confirm that 'whitewine.csv' is the intended white-wine file and not a copy of the red one.
red_wine = load_data('redwine.csv')
white_wine = load_data('whitewine.csv')

In [71]:
def split_data(dataset, ratio):
    """Split a dataset into a leading training portion and a trailing testing portion.

    The rows are split in their existing order (no shuffling). The split
    index is ``int(ratio * number_of_rows)``, i.e. the product truncated
    towards zero.

    Parameters
    ----------
    dataset : sequence or numpy.ndarray
        Rows to split; anything that supports ``len`` and slicing.
    ratio : float
        Fraction of rows that goes into the training set, between 0 and 1
        inclusive.

    Returns
    -------
    tuple
        ``(training_set, testing_set)``.

    Raises
    ------
    ValueError
        If ``ratio`` is outside ``[0, 1]`` (this also rejects NaN).
    """
    if not 0 <= ratio <= 1:
        raise ValueError(f"ratio must be between 0 and 1, got {ratio!r}")
    num_rows = len(dataset)
    # The previous `ratio * num_rows is not int` test compared a number with the
    # *type* int, so it was always true; truncating unconditionally is the same
    # behaviour without the unreachable branch.
    splitting_point = int(ratio * num_rows)
    return dataset[:splitting_point], dataset[splitting_point:]

In [72]:
# Unpack the (training, testing) tuples returned by split_data; 0.9 is used here as an example ratio.
red_wine_training, red_wine_testing = split_data(red_wine, .9)
white_wine_training, white_wine_testing = split_data(white_wine, .9)

In [73]:
# Report the shape of each full dataset next to the shapes of its two splits.
print(
    "Red wine file shape:", red_wine.shape,
    "Training set shape:", red_wine_training.shape,
    "Testing set shape:", red_wine_testing.shape,
)
print(
    "White wine file shape:", white_wine.shape,
    "Training set shape:", white_wine_training.shape,
    "Testing set shape:", white_wine_testing.shape,
)

Red wine file shape: (1599, 11) Training set shape: (1439, 11) Testing set shape: (160, 11)
White wine file shape: (1599, 11) Training set shape: (1439, 11) Testing set shape: (160, 11)


In [74]:
# PART 2

# centroid function (from lecture):
import math

def compute_centroid(labeled_examples):
    """Return the centroid (column-wise mean) of a set of examples.

    Parameters
    ----------
    labeled_examples : array-like
        Two-dimensional collection with one example per row and one
        feature per column.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the mean of every column.

    Raises
    ------
    ValueError
        If the input is not two-dimensional or contains no rows, since a
        centroid is undefined in those cases.
    """
    examples = np.asarray(labeled_examples, dtype=float)
    if examples.ndim != 2:
        raise ValueError(
            f"expected a 2-D array of examples, got {examples.ndim} dimension(s)"
        )
    if examples.shape[0] == 0:
        raise ValueError("cannot compute the centroid of an empty set of examples")
    # Vectorized mean over the rows instead of the builtin sum(), which loops
    # over the rows in Python.
    return examples.mean(axis=0)

# euclidean distance function (from lecture):
def euclidean_distance(a,b):
    """Return the Euclidean distance between the points ``a`` and ``b``.

    Both arguments are indexed from 0 to ``len(a) - 1``: extra trailing
    entries of ``b`` are ignored, and a ``b`` shorter than ``a`` raises
    ``IndexError``.
    """
    squared_differences = ((a[k] - b[k]) ** 2 for k in range(len(a)))
    return math.sqrt(sum(squared_differences))

#experiment function
def experiment(ww_training, rw_training, ww_test, rw_test):
    """Evaluate a nearest-centroid classifier on white- and red-wine test sets.

    A centroid is computed for each training set. A test example counts as
    correctly classified only when it is strictly closer to the centroid of
    its own class than to the other centroid, so a tie counts as a miss.

    Prints the number of predictions, the number of correct predictions and
    the accuracy, then returns the accuracy (correct / total).
    """
    white_center = compute_centroid(ww_training)
    red_center = compute_centroid(rw_training)

    # One entry per class: (test examples, centroid of that class, centroid of the other class).
    labelled_test_sets = (
        (ww_test, white_center, red_center),
        (rw_test, red_center, white_center),
    )

    total = 0
    correct = 0
    for samples, own_center, other_center in labelled_test_sets:
        for sample in samples:
            total += 1
            to_own = euclidean_distance(sample, own_center)
            to_other = euclidean_distance(sample, other_center)
            if to_own < to_other:
                correct += 1

    accuracy = correct/total
    print(
        "Total number of predictions made:", total,
        "\nTotal number of correct predictions", correct,
        "\nAccuracy of the model:", accuracy,
    )
    return accuracy

# Evaluate on the 90/10 split made above; as the last expression of the cell,
# the returned accuracy is also displayed as the cell's output.
experiment(white_wine_training, red_wine_training, white_wine_testing, red_wine_testing)

Total number of predictions made: 320 
Total number of correct predictions 291 
Accuracy of the model: 0.909375


0.909375

In [75]:
# PART 3
def _fold_split(data, fold_index, k):
    """Return ``(training_rows, test_rows)`` for one fold of a contiguous k-fold split."""
    data = np.asarray(data)
    fold_size = len(data) // k
    start = fold_index * fold_size
    # The last fold absorbs the leftover rows when k does not divide len(data) evenly.
    stop = len(data) if fold_index == k - 1 else start + fold_size
    test_rows = data[start:stop]
    # Everything outside [start, stop) is training data. The second slice starts
    # at `stop` (not `stop + 1`) so that no row is dropped from both sets.
    training_rows = np.vstack((data[:start], data[stop:]))
    return training_rows, test_rows


def cross_validation(ww_data, rw_data, k):
    """Run k-fold cross-validation of ``experiment`` and return the mean accuracy.

    Each dataset is cut into ``k`` contiguous folds in its existing row
    order (no shuffling). In turn, every fold is used as the test set while
    the remaining rows form the training set. When ``k`` does not divide a
    dataset evenly, the leftover rows are added to the last fold. The two
    datasets may have different lengths; fold boundaries are computed
    separately for each.

    Parameters
    ----------
    ww_data, rw_data : numpy.ndarray
        Two-dimensional arrays of white-wine and red-wine examples.
    k : int
        Number of folds; must be at least 2 and at most the number of rows
        of the smaller dataset.

    Returns
    -------
    float
        Average of the accuracies returned by ``experiment`` over the folds.
        The average is also printed.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 (the training set would be empty) or
        larger than the smaller dataset (a test fold would be empty).
    """
    if k < 2:
        raise ValueError(f"k must be at least 2, got {k!r}")
    smallest = min(len(ww_data), len(rw_data))
    if k > smallest:
        raise ValueError(
            f"k ({k}) cannot exceed the number of rows in the smaller dataset ({smallest})"
        )

    accuracy_sum = 0
    for fold_index in range(k):
        ww_training_set, ww_test_set = _fold_split(ww_data, fold_index, k)
        rw_training_set, rw_test_set = _fold_split(rw_data, fold_index, k)
        accuracy_sum += experiment(ww_training_set, rw_training_set, ww_test_set, rw_test_set)

    avg_accuracy = accuracy_sum / k
    print("Average Accuracy from Cross Validation:", avg_accuracy)
    return avg_accuracy

# 5-fold cross-validation over the full white and red datasets; as the last
# expression of the cell, the returned average accuracy is also displayed.
cross_validation(white_wine, red_wine, 5)

Total number of predictions made: 638 
Total number of correct predictions 544 
Accuracy of the model: 0.8526645768025078
Total number of predictions made: 638 
Total number of correct predictions 561 
Accuracy of the model: 0.8793103448275862
Total number of predictions made: 638 
Total number of correct predictions 571 
Accuracy of the model: 0.8949843260188087
Total number of predictions made: 638 
Total number of correct predictions 568 
Accuracy of the model: 0.890282131661442
Total number of predictions made: 646 
Total number of correct predictions 579 
Accuracy of the model: 0.8962848297213623
Average Accuracy from Cross Validation: 0.8827052418063415


0.8827052418063415