## Project 1 - Mislabeled data ##
Initialize and split data

In [248]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, confusion_matrix

Mislabel data, three levels 

In [274]:
dataset = pd.read_csv('diabetes.csv')
print(dataset.shape)
print(dataset.value_counts('Outcome')) #check class imbalance

# Split the dataset into training and test sets
train_data = dataset.sample(frac=0.8, random_state=42) # 80% of the data for training
test_data = dataset.drop(train_data.index) # remaining 20% for testing, due to the seed this data will always be the same

# split test data into features and labels
X_test = test_data.drop('Outcome', axis=1)
y_test = test_data['Outcome']

n_zeros = len(y_test) - np.count_nonzero(y_test)
print("Accuracy from predicting only zeros in test data: ", n_zeros/len(y_test))

# Print the number of rows in each dataset to check that the split worked correctly
print("Number of rows in training data: ", len(train_data))
print("Number of rows in test data: ", len(test_data))

# mislabel a fraction of the training data
fraction = 0.01 # fraction of data to mislabel
mislabeled_indices = np.random.choice(train_data.index, int(fraction*len(train_data)), replace=False)
train_data.loc[mislabeled_indices, 'Outcome'] = np.abs(train_data.loc[mislabeled_indices, 'Outcome']-1) # flip 0 to 1, or 1 to 0

(768, 9)
Outcome
0    500
1    268
dtype: int64
Accuracy from predicting only zeros in test data:  0.6233766233766234
Number of rows in training data:  614
Number of rows in test data:  154


Analyse Data \
kNN

In [275]:
X_train = train_data.drop('Outcome', axis=1)
y_train = train_data['Outcome']

# Define k-fold cross validation object
k_values = list(range(1, 20))
num_folds = 10
kf = KFold(n_splits=num_folds, shuffle=True)
mean_scores = [] # Will contain the mean cross validation scores for each k value


# Loop over k values to test
for k in k_values:
    # Create kNN classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    
    # Use cross_val_score to perform k-fold cross validation and calculate mean accuracy
    scores = cross_val_score(knn, X_train, y_train, cv=kf, scoring='accuracy')
    mean_scores.append(np.mean(scores))

best_k = k_values[np.argmax(mean_scores)]
print(f"Best k value: {best_k}")


# Evaluate the classifier on the test set
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:")
print("Accuracy: ", accuracy)

Best k value: 18
Test Accuracy:
Accuracy:  0.7532467532467533


Logistic Regression

In [276]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.7467532467532467


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LDA

In [277]:
clf = LDA(solver='eigen')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 0.7402597402597403
