### Victor Castellanos
### Ian Schenck
### CECS 550 Project 3

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

## Set User Variables

In [29]:
# k: number of neighbors used for the initial kNN model
k = 4
# T: number of folds (groups) used for cross validation
T = 5
# R1: first candidate value of k when optimizing the neighbor count
R1 = 1
# R2: end of the candidate range for k; exclusive, because the range is
# built with np.arange(R1, R2, 1)
R2 = 26

In [30]:
def load_file(path, names):
    """Read a comma-delimited, header-less file into a DataFrame.

    Args:
        path: location of the file, as a ``pathlib.Path`` or a plain string.
        names: column labels to assign, in file order.

    Returns:
        pandas.DataFrame with one column per entry in ``names``.

    Raises:
        FileNotFoundError: if ``path`` does not point to an existing file.
    """
    # Coerce first so string paths work too; Path(Path(...)) is a no-op.
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(str(path))
    # header=None: the first line of the file is data, not column names.
    return pd.read_csv(path, sep=",", names=names, header=None)

def load_df():
    """Load the wine-quality dataset used by this notebook.

    Reads ``wine_quality.csv`` from the current working directory through
    ``load_file`` and labels its twelve columns (eleven measurements
    followed by ``Quality``).

    Returns:
        pandas.DataFrame holding the raw, uncleaned data.
    """
    column_names = [
        "Fixed Acidity",
        "Volatile Acidity",
        "Citric Acid",
        "Residual Sugar",
        "Chlorides",
        "Free Sulfur Dioxide",
        "Total Sulfur Dioxide",
        "Density",
        "pH",
        "Sulphates",
        "Alcohol",
        "Quality",
    ]
    csv_path = Path.cwd() / "wine_quality.csv"
    return load_file(csv_path, column_names)

def remove_empty_rows(data):
    """Drop every row that contains at least one missing (null/NaN) value.

    The frame is modified in place and the same object is returned, so
    callers relying on either the mutation or the return value keep working.

    Args:
        data: pandas.DataFrame to clean.

    Returns:
        The same DataFrame object, with incomplete rows removed.
    """
    # Pass only `how`: recent pandas raises TypeError when `how` and `thresh`
    # are both supplied, even if thresh is None. subset=None is the default.
    data.dropna(axis=0, how='any', inplace=True)
    return data

def remove_outliers_Zscore(data, threshold=3):
    """Drop rows that hold a z-score outlier in any column.

    A value is an outlier when its absolute z-score (computed per column)
    is not below ``threshold``.

    Args:
        data: numeric pandas.DataFrame (or 2-D array) without missing values.
        threshold: absolute z-score limit; defaults to 3.

    Returns:
        The rows of ``data`` whose values are all within the limit.
    """
    values = np.asarray(data)
    z = np.abs(np.asarray(stats.zscore(data)))
    within = z < threshold
    # A column whose values are all identical has zero variance, so its
    # z-scores are undefined (NaN) and `NaN < threshold` is False. Such a
    # column cannot contain outliers, so it must not discard every row.
    constant = (values == values[:1]).all(axis=0)
    within[:, constant] = True
    return data[within.all(axis=1)]

def remove_outliers_IQR(data):
    """Drop rows that hold an IQR outlier in any column.

    A value is an outlier when it lies more than 1.5 * IQR below the
    column's first quartile or more than 1.5 * IQR above its third quartile.

    Args:
        data: numeric pandas.DataFrame.

    Returns:
        The rows of ``data`` in which no column value was flagged.
    """
    lower_quartile = data.quantile(.25)
    upper_quartile = data.quantile(.75)
    spread = upper_quartile - lower_quartile

    too_low = data < (lower_quartile - 1.5 * spread)
    too_high = data > (upper_quartile + 1.5 * spread)
    # Flag-then-negate keeps rows with NaN, because NaN fails both comparisons.
    has_outlier = (too_low | too_high).any(axis=1)
    return data[~has_outlier]
    
def plotHistogram(data):
    """Draw one 50-bin histogram per column of ``data``.

    Args:
        data: pandas.DataFrame whose columns are plotted.

    Returns:
        Whatever ``plt.tight_layout`` returns for the enlarged layout rect.
    """
    hist_options = dict(
        bins=50,
        color='steelblue',
        edgecolor='black',
        linewidth=1.0,
        xlabelsize=8,
        ylabelsize=8,
        grid=False,
    )
    data.hist(**hist_options)
    # The rect extends past 1.0 on both axes, as in the original call.
    layout_rect = (0, 0, 1.2, 1.2)
    return plt.tight_layout(rect=layout_rect)

def plot2dScatter(data):
    """Plot every pair of columns in ``data`` against each other.

    Args:
        data: pandas.DataFrame to visualize.

    Returns:
        The object returned by ``sns.pairplot``; the diagonal panels are
        drawn with ``diag_kind="kde"``.
    """
    pair_grid = sns.pairplot(data, diag_kind="kde")
    return pair_grid

    

In [31]:
# Read the raw dataset (wine_quality.csv in the current working directory) into a DataFrame
data = load_df()

## Part 1 - Data Wrangling

In [32]:
#uncomment to see histogram of data
#hist = plotHistogram(data)
print("Current datasize before cleaning" , data.shape[0])

# Drop every row that has a missing value
data = remove_empty_rows(data)


#uncomment to see data b4 cleaning in 2d graph
#plot2d = plot2dScatter(data)

# Each distinct Quality value is one class. ngroups counts the groups
# directly instead of aggregating every column just to count result rows.
classes = data.groupby('Quality')
print("Number of classes before cleaning data" , classes.ngroups)


print("Removing outliers using z-score")

# NOTE(review): the filter is applied to every column, including the Quality
# label, so rows with rare quality values can be removed as outliers -
# confirm this is intended.
#data = remove_outliers_IQR(data)
data = remove_outliers_Zscore(data)
print("Current data size after removing outliers", data.shape[0])

#Uncomment this to look at 2d plot of cleaned data
#plot2d = plot2dScatter(data)

# Recount the classes the same way on the cleaned data
classes = data.groupby('Quality')
print("Number of classes after cleaning data", classes.ngroups)


Current datasize before cleaning 4901
Number of classes before cleaning data 9
Removing outliers using z-score
Current data size after removing outliers 4558
Number of classes after cleaning data 5


## Part 2 - Building and training the kNN model

In [33]:
# Separate the feature columns from the target label
X = data.drop(columns=["Quality"])
y = data["Quality"]

# split train and test data
# random_state pins the shuffle so the split, and every score derived from
# it, is the same on each run; stratify=y keeps the class proportions equal
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# scale data
# The scaler is fit on the training rows only and then reused on the test rows.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)


# k comes from the "Set User Variables" cell; change it there to try a different k

# make model and fit data
model = KNeighborsClassifier(n_neighbors=k)
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='uniform')

## Part 3 - Testing kNN Model

In [34]:
# Predict a quality label for every row of the held-out test set (shown as the cell output)
model.predict(X_test)

array([6., 6., 6., 6., 7., 6., 4., 6., 7., 7., 6., 5., 7., 5., 5., 5., 6.,
       5., 5., 5., 7., 5., 5., 6., 6., 5., 5., 6., 6., 4., 6., 5., 5., 6.,
       6., 6., 5., 6., 7., 5., 6., 6., 5., 5., 6., 5., 6., 5., 5., 6., 5.,
       5., 6., 6., 6., 6., 6., 5., 6., 5., 5., 6., 6., 5., 5., 7., 6., 6.,
       5., 6., 7., 6., 6., 6., 5., 6., 7., 7., 4., 6., 7., 5., 5., 6., 7.,
       6., 6., 6., 7., 6., 7., 5., 5., 5., 6., 6., 6., 6., 7., 7., 5., 6.,
       6., 5., 6., 5., 5., 6., 6., 8., 6., 6., 6., 5., 6., 6., 5., 5., 6.,
       6., 6., 4., 7., 5., 6., 5., 5., 6., 5., 6., 5., 6., 6., 6., 7., 7.,
       4., 5., 8., 6., 6., 5., 6., 7., 6., 5., 6., 5., 5., 5., 5., 6., 6.,
       5., 5., 5., 8., 8., 6., 6., 5., 5., 6., 8., 4., 5., 5., 5., 6., 6.,
       6., 6., 5., 6., 6., 5., 6., 5., 7., 6., 6., 5., 5., 6., 5., 6., 6.,
       7., 5., 6., 6., 5., 4., 4., 6., 6., 5., 5., 6., 5., 6., 6., 5., 5.,
       5., 6., 6., 6., 5., 6., 6., 6., 5., 7., 5., 6., 6., 5., 7., 6., 6.,
       6., 5., 5., 7., 7.

In [35]:
# Score the fitted model on the rows it was trained on ...
train_score = model.score(X_train, y_train)
print(f"Train score: {train_score}")

# ... and on the held-out test rows
test_score = model.score(X_test, y_test)
print(f"Test score: {test_score}")

Train score: 0.7246297312122875
Test score: 0.555921052631579


## Part 4 - Cross validation

In [36]:
# Cross-validate the model on the training split using T folds; the result is printed below
# NOTE(review): X_train was standardized using all training rows before the
# folds are formed, so each validation fold influenced the scaling - confirm
# this is acceptable.
score = cross_val_score(model, X_train, y_train, cv=T)
print(f"Cross Validation Score: {score}")

Cross Validation Score: [0.53561644 0.56104252 0.54595336 0.53909465 0.55006859]


## Part 5 - Optimizing the `n_neighbors` parameter

In [37]:
# Candidate neighbor counts: R1 up to, but not including, R2
grid_params = {
    'n_neighbors': np.arange(R1, R2, 1),
}

# Use the configured fold count T, as the Part 4 cross validation does,
# instead of a hard-coded 5. n_jobs=-1 is passed through unchanged.
gs = GridSearchCV(KNeighborsClassifier(), grid_params, cv=T, n_jobs=-1)

gs_results = gs.fit(X_train, y_train)

# Best score found by the search and the k that produced it
best_score = gs_results.best_score_
best_k = gs_results.best_params_['n_neighbors']
print(f"Grid Search Best Score: {best_score}")
print(f"Grid Search Best K: {best_k}")

Grid Search Best Score: 0.6058694459681843
Grid Search Best K: 1
