# HW 5

Muyuan Zhang

u1430770

07/09/2023

In [None]:
# Imports and setup. 

import pandas as pd
import numpy as np
import math
from sklearn import tree, svm, metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, KFold
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height) for every plot in this notebook.
plt.rcParams['figure.figsize'] = (10, 6)
# Apply matplotlib's 'ggplot' style sheet to all subsequent figures.
plt.style.use('ggplot')

## Part 1: MNIST Handwritten Digits
### Task 1.1: Classification with Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Load the digits dataset and standardize every feature column with `scale`.
digits = load_digits()
X = scale(digits.data)
y = digits.target

# Split the data into a training and test set.  test_size=0.8 holds out 80%
# of the samples for testing; random_state=1 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.8)

# Use an SVM with an rbf kernel and cost parameter C=5 to build a classifier
# from the training set.  The classifier is named `svm_clf` rather than `svm`
# so that it does not overwrite the `sklearn.svm` module imported at the top
# of the notebook.
svm_clf = SVC(kernel='rbf', C=5)
svm_clf.fit(X_train, y_train)

# Using the test set, evaluate the accuracy of the model.
accuracy = svm_clf.score(X_test, y_test)
print("The accuracy of the model is", accuracy)

# Using the test set, compute the confusion matrix
# (rows are the actual labels, columns are the predicted labels).
y_pred = svm_clf.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)

print("\nConfusion matrix")
print(confusion)

# Find the most common mistake that the classifier makes.
# Work on a copy with the diagonal (the correct predictions) zeroed out, so
# that `confusion` itself still holds the full confusion matrix afterwards.
mistake_counts = confusion.copy()
np.fill_diagonal(mistake_counts, 0)
# Rows of the matrix are the actual labels, columns are the predicted labels.
row, col = np.unravel_index(np.argmax(mistake_counts), mistake_counts.shape)

if mistake_counts[row, col] == 0:
    # Every off-diagonal entry is zero; without this guard argmax would
    # report position (0, 0) as if it were a mistake.
    print("\nThe classifier made no mistakes on the test set")
else:
    print("\nPredicted most common mistake", col, " Actual", row)

# Display all of the misclassified digits as images.
misclassified_digits = np.where(y_test != y_pred)[0]

# X_test holds standardized features, which no longer look like the original
# digits.  Re-split the raw 8x8 images with the same random_state and
# test_size as the feature split, so that test_images[i] is the original
# image behind X_test[i].  Keep these arguments in sync with that split.
test_images = train_test_split(digits.images, random_state=1, test_size=0.8)[1]

plt.figure(figsize=(15, 15))
# Side length of the smallest square grid that fits every misclassified digit.
num_rows = int(np.ceil(np.sqrt(len(misclassified_digits))))

for i, index in enumerate(misclassified_digits):
    plt.subplot(num_rows, num_rows, i + 1)
    plt.imshow(test_images[index], cmap='Greys')
    plt.title(f"Predicted: {y_pred[index]}\nActual: {y_test[index]}")
    plt.axis('off')

print("\nAll the misclassified digits")
plt.tight_layout()
plt.show()

# Evaluate the accuracy of the SVM for different values of the parameter C.
# Candidate grid: 0.5, 0.6, ..., 5.0 followed by 10, 30, 50.  linspace is used
# for the fractional part because the length and endpoint of an arange with a
# non-integer step depend on floating-point rounding; this is the same grid
# up to rounding error.
C_values = np.concatenate((np.linspace(0.5, 5.0, 46), np.arange(10, 51, 20)))

# Mean 5-fold cross-validation accuracy for each candidate C.  The candidate
# models are created inline so that this sweep does not bind the name `svm`,
# which would shadow the `sklearn.svm` module imported at the top of the
# notebook.
accuracy_values = [
    np.mean(cross_val_score(SVC(kernel='rbf', C=C_value), X, y, cv=5))
    for C_value in C_values
]

plt.plot(C_values, accuracy_values, marker='o')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.title('Accuracy of the SVM for different C values')
plt.grid(True)
plt.show()
# argmax returns the first maximum, so ties resolve to the smallest C.
print("The best value of C is", C_values[np.argmax(accuracy_values)])

# Repeat the C=5 experiment on the unscaled pixel values.  The split uses the
# same random_state and test_size as the split of the scaled data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    digits.data,
    digits.target,
    test_size=0.8,
    random_state=1,
)
# fit() returns the estimator itself, so construction and training are chained.
svm_raw = SVC(C=5, kernel='rbf').fit(X_train_raw, y_train_raw)
accuracy_raw = svm_raw.score(X_test_raw, y_test_raw)
print("\nThe accuracy of the raw data is", accuracy_raw)


Interpretation of the results:

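The model trained on the raw data is more accurate than the one trained on the scaled data. All 64 pixel features are already measured on the same intensity scale, so standardizing each pixel separately gives low-variance, mostly uninformative pixels the same weight as the informative ones, and that noise clouds the signal.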

### Task 1.2: Prediction with K-nearest Neighbors

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 80% of the samples for testing; random_state=1 keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=1
)

# Train a 10-nearest-neighbours classifier on the training portion
# (fit() returns the estimator itself).
knn = KNeighborsClassifier(n_neighbors=10).fit(X_train, y_train)

# Accuracy on the held-out test portion.
accuracy = knn.score(X_test, y_test)
print("The accuracy of the k-NN model is", accuracy)

# Confusion matrix on the test portion
# (rows are the actual labels, columns are the predicted labels).
y_pred = knn.predict(X_test)
confusion = confusion_matrix(y_test, y_pred)

print("\nConfusion matrix", confusion, sep="\n")

# Find the most common mistake that the classifier makes.
# Work on a copy with the diagonal (the correct predictions) zeroed out, so
# that `confusion` itself still holds the full confusion matrix afterwards.
mistake_counts = confusion.copy()
np.fill_diagonal(mistake_counts, 0)
# Rows of the matrix are the actual labels, columns are the predicted labels.
row, col = np.unravel_index(np.argmax(mistake_counts), mistake_counts.shape)

if mistake_counts[row, col] == 0:
    # Every off-diagonal entry is zero; without this guard argmax would
    # report position (0, 0) as if it were a mistake.
    print("\nThe classifier made no mistakes on the test set")
else:
    print("\nPredicted most common mistake", col, " Actual", row)

# Display all of the misclassified digits as images.
plt.style.use('ggplot')
misclassified_digits = np.where(y_test != y_pred)[0]

# X_test holds standardized features, which no longer look like the original
# digits.  Re-split the raw 8x8 images with the same random_state and
# test_size as the feature split, so that test_images[i] is the original
# image behind X_test[i].  Keep these arguments in sync with that split.
test_images = train_test_split(digits.images, random_state=1, test_size=0.8)[1]

# Lay the digits out in rows of at most five images.
num_rows = math.ceil(len(misclassified_digits) / 5)
num_cols = min(len(misclassified_digits), 5)
plt.figure(figsize=(10, 2 * num_rows + 2))

for i, index in enumerate(misclassified_digits):
    predicted_digit = y_pred[index]
    actual_digit = y_test[index]
    plt.subplot(num_rows, num_cols, i + 1)
    plt.imshow(test_images[index], cmap='Greys', interpolation='nearest')
    plt.title(f"Predicted {predicted_digit}\nActual {actual_digit}")
    plt.axis('off')

print("\nAll the misclassified digits")
plt.tight_layout()
plt.show()

# Evaluate the accuracy of k-NN for different values of the parameter k
# (k = 1, ..., 9).
k_values = np.arange(1, 10)

# Mean 5-fold cross-validation accuracy for each candidate k.  The candidate
# models are created inline so that the sweep does not replace the fitted
# `knn` classifier from earlier in this cell with an unfitted one.
accuracy_values = [
    np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5))
    for k in k_values
]

plt.plot(k_values, accuracy_values, marker='o')
plt.xlabel('k')
plt.ylabel('Accuracy')
plt.title('Accuracy of the k-NN for different k values')
plt.grid(True)
plt.show()
# argmax returns the first maximum, so ties resolve to the smallest k.
print("The best value of k is", k_values[np.argmax(accuracy_values)])

# Repeat the k=10 experiment on the unscaled pixel values, using the same
# random_state and test_size as the split of the scaled data.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    digits.data, digits.target, test_size=0.8, random_state=1
)

# fit() returns the estimator itself, so construction and training are chained.
knn_raw = KNeighborsClassifier(n_neighbors=10).fit(X_train_raw, y_train_raw)
accuracy_raw = knn_raw.score(X_test_raw, y_test_raw)
print("\nThe accuracy of the raw data is", accuracy_raw)


Interpretation of the results:

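Again, the model trained on the raw data is more accurate than the one trained on the scaled data. Accuracy alone, however, is not the most informative metric: it does not show which digits are being confused with one another, which the confusion matrix does.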

## Part 2: Popularity of online news

### Task 2.1: Import the Data

In [None]:
# Load the dataset and strip surrounding whitespace from the column names.
df = pd.read_csv('OnlineNewsPopularity/OnlineNewsPopularity.csv')
df = df.rename(columns=str.strip)
# 'url' and 'timedelta' are not used as features.
df = df.drop(columns=['url', 'timedelta'])

# Feature matrix: every remaining column except the target.
X = df.drop(columns='shares').to_numpy()
# Number of shares as a separate numpy array.
shares = df['shares'].to_numpy()
# Binary label: 1 if an article has more shares than the median, else 0.
y = (shares > np.median(shares)).astype(int)

### Task 2.2: Exploratory Data Analysis

In [None]:
# Summary statistics of the target column, as a sanity check on its values.
print(df.loc[:, 'shares'].describe())

### Task 2.3: Classification Using k-NN

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the data into a training and test set (20% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features using statistics from the training split only,
# then apply the same transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

k_values = list(range(60, 81))

# Use 5-fold cross validation on the training split to choose the best k.
# The test split is not touched during model selection.
accuracy_values = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X_train_scaled, y_train, cv=5
    ).mean()
    for k in k_values
]

# argmax returns the first maximum, so ties resolve to the smallest k.
best_index = int(np.argmax(accuracy_values))
best_k = k_values[best_index]
print("The best k value is", best_k)
print("The best cross-validation accuracy is", accuracy_values[best_index])

# Refit with the selected k on the whole training split and report the
# accuracy on the held-out test split, which was previously never evaluated.
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
test_accuracy = knn.score(X_test_scaled, y_test)
print("The test accuracy with the best k is", test_accuracy)

The best value of k is 73.

The best mean 5-fold cross-validation accuracy I can obtain on the training data is 0.64.

### Task 2.4 Classification using SVM

In [None]:
from sklearn.metrics import accuracy_score

# Work on the first 5000 rows only.  Note that `fraction` is a row count,
# not a proportion.  (Presumably this keeps SVM training time manageable.)
fraction = 5000
X_fraction = X[:fraction]
y_fraction = y[:fraction]

# Split the subset into a training and test set (20% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(X_fraction, y_fraction, test_size=0.2, random_state=42)
C_values = list(range(48500, 49501, 100))

# NOTE(review): the features are not standardized here, unlike the k-NN
# experiment in Task 2.3, and each C is scored directly on the test split,
# so the best score found below is an optimistic estimate.
accuracy_values = []

# Experiment with different Cs; accuracy_values[i] is the test accuracy for
# C_values[i].
for C_value in C_values:
    # Named `clf` rather than `svm` so that the `sklearn.svm` module imported
    # at the top of the notebook is not shadowed.
    clf = SVC(C=C_value)
    clf.fit(X_train, y_train)
    accuracy_values.append(accuracy_score(y_test, clf.predict(X_test)))

In [None]:
# Collect every C whose accuracy ties for the highest value.  The maximum is
# computed once; default=None keeps an empty accuracy list from raising.
top_accuracy = max(accuracy_values, default=None)
best_C_values = [
    C_values[position]
    for position, score in enumerate(accuracy_values)
    if score == top_accuracy
]

print("The best C values are", best_C_values)

### Task 2.5 Classification using decision trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Candidate values for the two hyperparameters being tuned.
max_depth_values = list(range(5, 10))
min_samples_split_values = list(range(15, 21))

# accuracy_values[i][j] is the mean 5-fold cross-validation accuracy for
# max_depth_values[i] and min_samples_split_values[j].
accuracy_values = []

for max_depth in max_depth_values:
    accuracy_row = []

    for min_samples_split in min_samples_split_values:
        # Named `clf` rather than `tree` so that the `sklearn.tree` module
        # imported at the top of the notebook is not shadowed.  random_state
        # is fixed so that repeated runs select the same hyperparameters.
        clf = DecisionTreeClassifier(
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            random_state=42,
        )
        scores = cross_val_score(clf, X, y, cv=5)
        accuracy_row.append(scores.mean())
    accuracy_values.append(accuracy_row)

# Locate the (row, column) of the highest accuracy in the grid; argmax
# returns the first maximum if several cells tie.
accuracy_grid = np.array(accuracy_values)
best_indices = np.unravel_index(np.argmax(accuracy_grid), accuracy_grid.shape)
best_max_depth = max_depth_values[best_indices[0]]
best_min_samples_split = min_samples_split_values[best_indices[1]]

print("The best value of max tree depth is", best_max_depth)
print("The best value of minimum samples split is", best_min_samples_split)

### Task 2.6 Describe your findings

Q: Which method (k-NN, SVM, Decision Tree) worked best?

A: k-NN achieved the highest accuracy of the three methods.

Q: How did different parameters influence the accuracy?

A: 

* For k-NN, a small k leads to unstable decision boundaries, while a larger k smooths the decision boundaries and generally classifies better, at the cost of more computation. A simple rule of thumb is to set $k = \sqrt{n}$, where $n$ is the number of training samples.

* For SVM, a larger C allows more complex decision boundaries, which can lead to overfitting, while a smaller C may result in underfitting.

* For the decision tree, the larger `max_depth` is, the more splits the tree has and the more information about the data it captures. Increasing `min_samples_split` may cause underfitting.

Q: Which model is easiest to interpret?

A: Decision tree (easy to understand, can be visualized).

Q: How would you interpret your results?

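A: The best value of each parameter in each model can be found by iterating over candidate values and keeping the one with the highest accuracy. For the digits data in Part 1, the models trained on the raw data were more accurate than those trained on the scaled data, but accuracy alone is not the best metric for judging a classifier.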