Question 1: Please use the "breast cancer dataset" included in the scikit-learn library to conduct the following tasks. 

Task 1: Randomly split this dataset into two parts: 60% for training and 40% for testing. Use KNN (k = 5) algorithm to conduct INDEPENDENT TEST, show the results using Classification Report;

Task 2: Use the WHOLE dataset and Decision Tree algorithm (parameter setting: criterion is "entropy", max_depth is 15), conduct SELF TEST, show the test accuracy and draw the corresponding confusion matrix (in a figure);

Task 3: Use the WHOLE dataset and SVC algorithm (parameter setting: C is 5, kernel is "rbf", gamma is 0.05), conduct a 5-fold cross validation test and show the average accuracy.

In [None]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold 

def printResults(targetTest, prediction):
    """Print the classification report and the accuracy score of a prediction.

    Args:
        targetTest: Ground-truth labels.
        prediction: Predicted labels, in the same order as ``targetTest``.
    """
    report = classification_report(targetTest, prediction)
    print("\nClassification Report:\n", report)
    accuracy = accuracy_score(targetTest, prediction)
    print("Accuracy Score:", accuracy)
 

# Task 1: independent test with KNN (k = 5) on a 60% train / 40% test split.

dataset = datasets.load_breast_cancer()
# Tabular view of the dataset; the models below work on the raw arrays instead.
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df['target'] = pd.Series(dataset.target)

X = dataset['data']
y = dataset['target']

# test_size is the fraction held out for TESTING, so 0.4 yields the required
# 60% training / 40% testing split. stratify=y keeps the class proportions
# the same in both parts.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

# Fit the scaler on the training part only and reuse its statistics for the
# test part, so no information from the test samples leaks into training.
scaler = StandardScaler()
train_data_scaled = scaler.fit_transform(X_train)
test_data_scaled = scaler.transform(X_test)

knn_classifier = KNeighborsClassifier(n_neighbors=5, algorithm="brute", weights='distance')
knn_classifier.fit(train_data_scaled, y_train)

# Predict on samples the classifier has never seen (independent test).
knn_independent_test_predict = knn_classifier.predict(test_data_scaled)

print("Independent test KNN Results:")
printResults(y_test, knn_independent_test_predict)  # knn results

# Task 2: self test with a Decision Tree, i.e. train and evaluate on the WHOLE dataset.

# Use a dedicated scaler so the one fitted on the Task 1 training split is
# not silently refitted on the whole dataset.
self_test_scaler = StandardScaler()
self_test_data_scaled = self_test_scaler.fit_transform(X)

dt_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=15)

# The tree must be fitted before it can predict; for a self test the
# training data and the evaluation data are the same samples.
dt_classifier.fit(self_test_data_scaled, y)

# Get predictions on the same data the tree was trained on
dt_self_test_predict = dt_classifier.predict(self_test_data_scaled)

# Print results for self-test
print("Self-test DT Results:")
printResults(y, dt_self_test_predict)  # dt results
dtconfusionMatrix = confusion_matrix(y, dt_self_test_predict)
print("\nConfusion Matrix:\n", dtconfusionMatrix)

# Draw the confusion matrix as a heatmap
matrixDisplay = ConfusionMatrixDisplay(confusion_matrix=dtconfusionMatrix)
fig, ax = plt.subplots(figsize=(10, 8))  # Create layout and structure figure
matrixDisplay.plot(ax=ax, cmap='Reds')  # Create Plot
# Label the axes object directly rather than relying on pyplot's "current axes"
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
# plt.savefig('confusion_matrix.png')  # Save plot as png
plt.show()  # Display plot

# Task 3: 5-fold cross validation with SVC on the WHOLE dataset.

# NOTE(review): X is the raw feature matrix here, whereas Tasks 1 and 2
# standardize their inputs. RBF kernels are sensitive to feature scale, so
# confirm whether per-fold scaling is intended for this task.
svc = SVC(kernel='rbf', C=5, gamma=0.05, random_state=7)

# Implementing cross validation
k = 5  # number of folds
kf = KFold(n_splits=k, random_state=None)

acc_score = []  # accuracy of each fold, in fold order

for train_index, test_index in kf.split(X):
    # X and y are NumPy arrays, so they are indexed directly with the index
    # arrays (``.iloc`` only exists on pandas objects). Fold-specific names
    # avoid overwriting the Task 1 train/test split.
    X_fold_train, X_fold_test = X[train_index], X[test_index]
    y_fold_train, y_fold_test = y[train_index], y[test_index]

    svc.fit(X_fold_train, y_fold_train)
    pred_values = svc.predict(X_fold_test)

    # accuracy_score expects (y_true, y_pred)
    acc_score.append(accuracy_score(y_fold_test, pred_values))

avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))


Question 2: Please use the "Iris Dataset" included in the scikit-learn library to conduct the following clustering tasks. 

Task 1: Conduct PCA analysis on the dataset and find out how many principal components are needed to keep at least 95% variance (i.e. the ratio of variance loss, η, is less than 5%). Assume m principal components are needed, transform the dataset to m dimensions.

Task 2: Using the transformed data, conduct k-means clustering (k = 3, each cluster is a type of iris plant),
1) output the corresponding center of each cluster;
2) output the clustering accuracy (i.e. out of all samples, how many samples are correctly identified);
3) show the corresponding confusion matrix in a figure.

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.stats import mode

# Task 1: PCA on the iris data, keeping enough components for >= 95% variance.
dataset = load_iris()
X, y = dataset['data'], dataset['target']
df = pd.DataFrame(dataset.data, columns=dataset.feature_names)

# Standardize every feature column to zero mean and unit variance
feature_mean = X.mean(axis=0)
feature_std = X.std(axis=0)
feature_std[feature_std == 0] = 1e-10  # Guard against division by zero for constant columns
X_standardized = (X - feature_mean) / feature_std

# A float n_components asks PCA to keep the smallest number of components
# whose cumulative explained variance reaches that fraction.
pca_95 = PCA(n_components=0.95)
pca_95.fit(X_standardized)

# Covariance as estimated by the fitted PCA model
covariance_matrix = pca_95.get_covariance()

# Display the covariance matrix
print("PCA Analysis with normalized data")
print("Covariance Matrix:")
print(covariance_matrix)

# Report how much variance each kept component explains and how many are needed
variance_ratio = pca_95.explained_variance_ratio_
n_components_needed = pca_95.n_components_
print("Percentage of variance explained by each component to the total variance:\n", variance_ratio)
print(f"Total explained variance ratio: {variance_ratio.sum():.2f}")
print(f"Number of principal components to keep at least 95% variance: {n_components_needed}")

# Transform the standardized data to that many dimensions.
# (The result keeps the name `digits_data_transformed` because later code reads it.)
pca_final = PCA(n_components=n_components_needed)
digits_data_transformed = pca_final.fit_transform(X_standardized)

# Supervised check on the PCA-transformed data: hold out 30% of the samples
# and see how well a KNN classifier (4 neighbors) separates the classes.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data_transformed, y, test_size=0.3, random_state=7
)

# Train on the labeled training part
knn_pca = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)

# Report mean accuracy on the training part and on the held-out part
train_score = knn_pca.score(X_train, y_train)
print(f"Train score after PCA: {train_score:.6f}")
test_score = knn_pca.score(X_test, y_test)
print(f"Test score after PCA: {test_score:.6f}")

# Task 2: k-means clustering (k = 3) on the PCA-transformed data.
# n_init is pinned so the number of restarts does not depend on the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=7)
cluster_labels = kmeans.fit_predict(digits_data_transformed)

# 1) Output the center of each cluster (coordinates are in PCA space)
print("Center of each cluster (each cluster represents a type of iris plant):")
for i, center in enumerate(kmeans.cluster_centers_):
    print(f"Cluster {i}: {center}")

# Cluster ids are arbitrary, so map every cluster to the true class that is
# most frequent among its members before comparing against y.
mapped_labels = np.zeros_like(cluster_labels)
for cluster in range(kmeans.n_clusters):
    mask = (cluster_labels == cluster)
    if not mask.any():
        continue  # an empty cluster has no majority class to assign
    mapped_labels[mask] = mode(y[mask])[0]

# 2) Calculate and print clustering accuracy
print(f"Clustering Accuracy: {accuracy_score(y, mapped_labels):.6f}")

# 3) Confusion matrix. It must be built from the MAPPED labels: with the raw
# cluster ids the columns would be in an arbitrary order and the diagonal
# would not correspond to correctly identified samples.
confusionMatrix = confusion_matrix(y, mapped_labels)

# Visualize the confusion matrix, labeling rows/columns with the species names
matrixDisplay = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix,
                                       display_labels=dataset.target_names)
fig, ax = plt.subplots(figsize=(10, 8))  # Create layout and structure figure
matrixDisplay.plot(ax=ax, cmap='Blues')  # Create Plot
# Label the axes object directly rather than relying on pyplot's "current axes"
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Clustering Confusion Matrix')
plt.show()
