In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import joblib

###A

####Read Data for A




In [None]:
def read(data, label, height=28):
  """Load ASCII-art images and their integer labels from two text files.

  Every image occupies `height` consecutive lines of the image file. A space
  is a background pixel (0); '+' and '#' are both foreground pixels (1).

  Args:
    data: Path to the image file.
    label: Path to the label file, one integer per line.
    height: Number of text lines that make up one image (default 28).

  Returns:
    A tuple (X, y). X is a NumPy array holding one 0/1 grid per image
    (shape (n_images, height, line_width) when all lines have equal width),
    y is a 1-D NumPy array of integer labels.

  Raises:
    KeyError: If an image line contains a character other than ' ', '+', '#'.
    ValueError: If `height` is not positive, or if the number of complete
      images differs from the number of labels.
  """
  if height < 1:
    raise ValueError("height must be a positive integer")

  mp = {' ': 0, '+': 1, '#': 1}
  X = []
  # The `with` block closes the file; no explicit close() is needed.
  with open(data, 'r') as file:
    img = []
    for line in file:
      img.append([mp[c] for c in line.rstrip('\n')])
      if len(img) == height:
        X.append(img)
        img = []
    # Any trailing, incomplete image (fewer than `height` lines) is dropped.

  with open(label, 'r') as file:
    # Blank lines (e.g. a trailing newline at end of file) are ignored.
    y = [int(line) for line in file if line.strip()]

  if len(X) != len(y):
    raise ValueError(
        "found %d complete images but %d labels" % (len(X), len(y)))

  y = np.array(y)
  X = np.array(X)

  return X, y

def accuracy_fn(y_true, y_pred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y_true: Sequence or array of true labels.
    y_pred: Sequence or array of predicted labels, same shape as `y_true`.

  Returns:
    Number of matching positions divided by the number of labels.

  Raises:
    ValueError: If the two inputs do not have the same shape.
  """
  # Convert first: comparing two plain lists with == yields a single bool,
  # which would silently produce a wrong score.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  if y_true.shape != y_pred.shape:
    raise ValueError("y_true and y_pred must have the same shape")
  return np.sum(y_true == y_pred) / len(y_true)

In [None]:
X_train, y_train = read('/content/trainingimages', '/content/traininglabels')

FileNotFoundError: [Errno 2] No such file or directory: '/content/trainingimages'

In [None]:
x_test, y_test = read('/content/testimages', '/content/testlabels')

#### Visualize

In [None]:
def view_with_matplotlib(X, idx):
    """Display one image from the 3-D array X using matplotlib.

    Args:
        X: Array of images, indexed by image number along the first axis.
        idx: Index of the image to show.
    """
    image = X[idx]
    plt.imshow(image, cmap='gray')
    # Hide the axes so only the picture is shown.
    plt.axis('off')
    plt.show()

In [None]:
view_with_matplotlib(X_train, 0)

####A: Multinomial Naive Bayes

In [None]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier with additive (Laplace) smoothing.

    Attributes set by `fit`:
        classes: Sorted array of the distinct labels seen in training.
        class_priors: Fraction of training samples belonging to each class.
        feature_probs: Array (n_classes, n_features); row i holds the smoothed
            per-feature probabilities for class i and sums to 1.
    """

    def __init__(self, k=1.0):
        """Create an unfitted model.

        Args:
            k: Additive smoothing strength. Should be > 0; with k = 0 a feature
               never seen in a class gets probability 0 and its log is -inf.
        """
        self.k = k  # Smoothing parameter
        self.class_priors = None
        self.feature_probs = None
        self.classes = None

    def fit(self, X, y):
        """Fit the model to data X with labels y.

        Args:
            X: Array of non-negative feature counts with samples on the first
               axis. Any trailing axes (e.g. image rows and columns) are
               flattened into a single feature axis.
            y: Array of class labels, one per sample.

        Returns:
            self, so that calls can be chained.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        X_reshaped = X.reshape(n_samples, -1)
        n_features = X_reshaped.shape[1]

        self.classes = np.unique(y)
        n_classes = len(self.classes)

        self.class_priors = np.zeros(n_classes)
        self.feature_probs = np.zeros((n_classes, n_features))

        for idx, c in enumerate(self.classes):
            X_c = X_reshaped[y == c]
            self.class_priors[idx] = X_c.shape[0] / n_samples
            feature_counts = X_c.sum(axis=0)
            # Adding k to each of the n_features counts adds k * n_features to
            # the total, so that is the normaliser that keeps the row summing
            # to 1 (the previous "k * 2" did not).
            self.feature_probs[idx, :] = (feature_counts + self.k) / (
                feature_counts.sum() + self.k * n_features
            )
        return self

    def predict(self, X):
        """Predict a class label for every sample in X.

        Args:
            X: Samples on the first axis; trailing axes are flattened, so both
               (n_samples, n_features) and the layout passed to `fit` work.

        Returns:
            Array of predicted labels, one per sample.

        Raises:
            ValueError: If the model is not fitted or the flattened feature
                count differs from the one seen in `fit`.
        """
        if self.feature_probs is None:
            raise ValueError("model is not fitted; call fit() first")

        X = np.asarray(X)
        if X.shape[0] == 0:
            return np.array([], dtype=self.classes.dtype)

        X_flat = X.reshape(X.shape[0], -1)
        if X_flat.shape[1] != self.feature_probs.shape[1]:
            raise ValueError(
                "expected %d features per sample, got %d"
                % (self.feature_probs.shape[1], X_flat.shape[1])
            )

        # score[n, c] = log P(c) + sum_i x[n, i] * log P(feature i | c)
        scores = X_flat @ np.log(self.feature_probs).T + np.log(self.class_priors)
        return self.classes[np.argmax(scores, axis=1)]

    def _predict_single(self, x):
        """Predict the label of one sample (any shape; it is flattened)."""
        return self.predict(np.asarray(x).reshape(1, -1))[0]

####Find k

In [None]:
# Evaluate a range of smoothing strengths k.
# NOTE(review): k is being selected on the test set; a held-out validation
# split would give an unbiased estimate — confirm this is intended.
accs = []
k_ = [0.1, 0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Flatten each test image once; derive the sizes from the data instead of
# hard-coding 1000 images of 784 pixels.
x_test_flat = x_test.reshape(x_test.shape[0], -1)
for i in k_:
  naive = MultinomialNaiveBayes(k=i)
  naive.fit(X_train, y_train)
  pred1 = naive.predict(x_test_flat)
  accs.append(accuracy_fn(y_test, pred1))

In [None]:
accs

####Save model as a file

In [None]:
naive = MultinomialNaiveBayes(k=0.1)
naive.fit(X_train, y_train)

In [None]:
joblib.dump(naive, '/content/naive1.pkl')

####Load model

In [None]:
model = joblib.load('/content/naive1.pkl')

####Classify

In [None]:
# Flatten each test image; sizes come from the data rather than a
# hard-coded (1000, 784).
predx = model.predict(x_test.reshape(x_test.shape[0], -1))
print(accuracy_fn(y_test, predx))

In [None]:
# Predicted label for sample x_test[1]
x = model.predict(x_test[1].reshape(1, -1))
x

In [None]:
#Nhãn thật của mẫu được dự đoán
print(y_test[1])
view_with_matplotlib(x_test, 1)

####Confusion matrix

In [None]:
def metrics(y_true, y_pred, average="macro"):
    """Compute accuracy, precision, recall and F1 for a classification result.

    Per-class statistics are computed over every label that occurs in either
    `y_true` or `y_pred`, then combined according to `average`.

    Args:
        y_true: Sequence or array of true labels.
        y_pred: Sequence or array of predicted labels, same length.
        average: "macro" (unweighted mean over classes) or "weighted"
            (mean weighted by the number of true samples per class).

    Returns:
        Dict with keys "accuracy", "precision", "recall" and "f1_score".

    Raises:
        ValueError: If `average` is not "macro" or "weighted".
    """
    if average not in ("macro", "weighted"):
        raise ValueError('average must be "macro" or "weighted", got %r' % (average,))

    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # All distinct classes, including ones that only appear in predictions.
    classes = np.unique(np.concatenate([y_true, y_pred]))

    accuracy = np.sum(y_true == y_pred) / len(y_true)

    precision_per_class = []
    recall_per_class = []
    f1_per_class = []
    support_per_class = []

    # Count directly from boolean masks so arbitrary label values work and no
    # positional confusion-matrix indexing is needed.
    for c in classes:
        is_true = y_true == c
        is_pred = y_pred == c
        tp = np.sum(is_true & is_pred)
        fp = np.sum(~is_true & is_pred)
        fn = np.sum(is_true & ~is_pred)
        support = np.sum(is_true)

        # A ratio with an empty denominator is defined as 0.
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        precision_per_class.append(precision)
        recall_per_class.append(recall)
        f1_per_class.append(f1)
        support_per_class.append(support)

    if average == "macro":
        precision = np.mean(precision_per_class)
        recall = np.mean(recall_per_class)
        f1_score = np.mean(f1_per_class)
    else:  # "weighted"
        weights = np.array(support_per_class)
        total_support = np.sum(weights)
        precision = np.sum(np.array(precision_per_class) * weights) / total_support
        recall = np.sum(np.array(recall_per_class) * weights) / total_support
        f1_score = np.sum(np.array(f1_per_class) * weights) / total_support

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1_score
    }


In [None]:
#vẽ dạng ma trận
def confusion_matrix(y_true, y_pred):
  """Build the confusion matrix of a classification result.

  Rows correspond to true classes and columns to predicted classes, both in
  sorted order of the distinct labels found in `y_true` or `y_pred`. For
  labels 0..n-1 that all occur in `y_true` this matches plain label indexing.

  Args:
    y_true: Sequence or array of true labels.
    y_pred: Sequence or array of predicted labels, same length.

  Returns:
    Integer array of shape (n_classes, n_classes) where entry [i, j] counts
    the samples of true class i that were predicted as class j.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)

  # Size the matrix from every label seen, so a class that only appears in
  # the predictions cannot index out of bounds.
  classes = np.unique(np.concatenate([y_true, y_pred]))
  # Map each label to its position in the sorted class list.
  true_idx = np.searchsorted(classes, y_true)
  pred_idx = np.searchsorted(classes, y_pred)

  cm = np.zeros((len(classes), len(classes)), dtype=int)
  for true_pos, pred_pos in zip(true_idx, pred_idx):
    cm[true_pos, pred_pos] += 1

  return cm
def accuracy_fn(y_true, y_pred):
  """Return the fraction of predictions that equal the true labels.

  NOTE: this repeats the `accuracy_fn` defined in the data-loading cell; being
  later in the notebook, this is the definition in effect after a full run.

  Args:
    y_true: Sequence or array of true labels.
    y_pred: Sequence or array of predicted labels, same shape as `y_true`.

  Returns:
    Number of matching positions divided by the number of labels.

  Raises:
    ValueError: If the two inputs do not have the same shape.
  """
  # Convert first: comparing two plain lists with == yields a single bool,
  # which would silently produce a wrong score.
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  if y_true.shape != y_pred.shape:
    raise ValueError("y_true and y_pred must have the same shape")
  return np.sum(y_true == y_pred) / len(y_true)


def plot_confusion_matrix(y_true, y_pred, labels=None, normalize=False, cmap=plt.cm.Blues):
    """Draw the confusion matrix of a classification result as a heat map.

    Args:
        y_true: Sequence or array of true labels.
        y_pred: Sequence or array of predicted labels.
        labels: Tick labels for the classes. Defaults to the distinct values
            of `y_true`, or of both inputs when that is needed to match the
            matrix size.
        normalize: If True, show each row as fractions of that row's total.
        cmap: Matplotlib colormap used for the cells.
    """
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        # Normalise per row (per true class). Rows without any sample are
        # divided by 1 so they stay at 0 instead of turning into NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        safe_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / safe_totals
    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title('Confusion Matrix', fontsize=16)
    plt.colorbar()

    if labels is None:
        labels = np.unique(y_true)
        if len(labels) != cm.shape[0]:
            # The matrix also covers classes that occur only in y_pred.
            labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, fontsize=12)
    plt.yticks(tick_marks, labels, fontsize=12)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black", fontsize=12)

    plt.ylabel('True label', fontsize=14)
    plt.xlabel('Predicted label', fontsize=14)
    plt.tight_layout()
    plt.grid(False)
    plt.show()

In [None]:
z = metrics(y_test, predx)
print(z)

In [None]:
#Confusion matrix
plot_confusion_matrix(y_test, predx, normalize=True)

####Extra credit

In [None]:
def majority_value(block):
    """Return the most frequent value in a pixel block (e.g. 2x2).

    The block is flattened before counting. When several values are equally
    frequent, the smallest of them is returned.

    Args:
        block: Array-like of pixel values.

    Returns:
        The most common value in `block`.
    """
    # np.unique returns the distinct values sorted, so argmax (first maximum)
    # resolves ties in favour of the smallest value. This avoids relying on a
    # `mode` helper that is never imported in this notebook.
    values, counts = np.unique(np.asarray(block), return_counts=True)
    return values[np.argmax(counts)]

def extract_features(image):
    """Downsample a 2-D image by taking the majority value of each 2x2 block.

    Args:
        image: 2-D array of pixel values.

    Returns:
        Array of shape (height // 2, width // 2) with the same dtype as
        `image`; entry [r, c] is the majority value of the block whose
        top-left corner is at (2 * r, 2 * c).
    """
    n_rows, n_cols = image.shape
    pooled = np.zeros((n_rows // 2, n_cols // 2), dtype=image.dtype)

    # Walk the top-left corner of every 2x2 block; stopping before the last
    # index skips a leftover row/column when a dimension is odd.
    for out_r, top in enumerate(range(0, n_rows - 1, 2)):
        for out_c, left in enumerate(range(0, n_cols - 1, 2)):
            window = image[top:top + 2, left:left + 2]
            pooled[out_r, out_c] = majority_value(window)

    return pooled

In [None]:
# def extract_features(X):
#   num_images, height, width = X.shape
#   features = np.zeros((num_images, 14, 14), dtype=int)

#   for idx in range(num_images):
#     for i in range(0, height, 2):
#       for j in range(0, width, 2):
#         block = X[idx, i:i+2, j:j+2]
#         majority_value = Counter(block.flatten()).most_common(1)[0][0]
#         features[idx, i // 2, j // 2] = majority_value

#     return features

### Library baseline for comparison with A (scikit-learn MultinomialNB)

In [None]:
from sklearn.naive_bayes import MultinomialNB

a = MultinomialNB()
a.fit(X_train.reshape(X_train.shape[0], -1), y_train)

In [None]:
y_pred = a.predict(x_test.reshape(x_test.shape[0], -1))

In [None]:
print(accuracy_fn(y_test, y_pred))

###B

####Load Data for B

In [None]:
df = pd.read_csv('/content/Mall_Customers.csv', index_col='CustomerID')
df.head(5)

In [None]:
#{'Female': 1, 'Male': 2}
def label_encoding(data):
    """Encode categorical values as consecutive integers starting at 1.

    Codes follow the sorted order of the distinct values, e.g.
    {'Female': 1, 'Male': 2}.

    Args:
        data: Sequence or array of categorical values.

    Returns:
        A tuple (codes, mapping): `codes` is a list with one integer per input
        element, `mapping` is a dict from each distinct value to its code.
    """
    values = np.array(data)

    # `positions` holds, for every element, its 0-based index into the sorted
    # distinct values.
    categories, positions = np.unique(values, return_inverse=True)
    codes = positions + 1
    mapping = dict(zip(categories, range(1, len(categories) + 1)))

    return codes.tolist(), mapping

def standard_scaler(data):
    """Standardise each column to zero mean and unit (population) variance.

    Args:
        data: List of rows, each row a list of numbers of equal length.

    Returns:
        List of rows of the same shape, where every value is
        (x - column_mean) / column_std. A constant column (std = 0) is mapped
        to all zeros instead of dividing by zero.
    """
    columns = list(zip(*data))
    means = [sum(column) / len(column) for column in columns]
    std_devs = [
        (sum((x - mean) ** 2 for x in column) / len(column)) ** 0.5
        for column, mean in zip(columns, means)
    ]
    # A constant column has no spread; dividing by 1 leaves its centred
    # values (all 0) unchanged and avoids ZeroDivisionError.
    scales = [std if std != 0 else 1.0 for std in std_devs]

    scaled_data = [
        [(x - mean) / scale for x, mean, scale in zip(row, means, scales)]
        for row in data
    ]

    return scaled_data

In [None]:
numeric_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
scaled_data = standard_scaler(df[numeric_columns].values.tolist())

categorical_columns = ['Gender']
encoded, _= label_encoding(df[categorical_columns].values.flatten())
encoded = np.array(encoded).reshape(-1, 1)

In [None]:
x = np.array(np.concatenate([encoded,scaled_data], axis=1))

####B: KMeans

In [None]:
def euclidean_distance(point1, point2):
  """Return the Euclidean (L2) distance between two points.

  Args:
    point1: Coordinates of the first point (sequence or array).
    point2: Coordinates of the second point, same length.

  Returns:
    The norm of the coordinate-wise difference.
  """
  first = np.array(point1)
  second = np.array(point2)
  return np.linalg.norm(first - second)

In [None]:
class Kmeans:
    """K-means clustering with random or k-means++ initialisation.

    Attributes set by `fit`:
        centroids: Array (n_clusters, n_features) of cluster centres.
        labels: List with the cluster index assigned to each training point.
    """

    def __init__(self, n_clusters=2, max_iters=100, init="random", random_state=None):
        """Create an unfitted model.

        Args:
            n_clusters: Number of clusters.
            max_iters: Maximum number of assign/update iterations.
            init: "random" (distinct random data points) or "k-means++".
            random_state: Integer seed for reproducible initialisation. When
                None, NumPy's global random state is used.
        """
        self.n_clusters = n_clusters
        self.max_iters = max_iters
        self.init = init
        self.random_state = random_state
        self.centroids = None
        self.labels = None

    def _make_rng(self):
        """Return the random source: a seeded generator, or NumPy's global one."""
        if self.random_state is None:
            return np.random
        return np.random.RandomState(self.random_state)

    def initialize_centroids_kmeanspp(self, data):
        """Initialise the centroids with the k-means++ scheme.

        The first centroid is a uniformly random data point; each further one
        is drawn with probability proportional to the squared distance to the
        nearest centroid chosen so far.
        """
        rng = self._make_rng()
        data = np.asarray(data, dtype=float)
        n_points = data.shape[0]
        centroids = [data[rng.randint(n_points)]]

        for _ in range(1, self.n_clusters):
            chosen = np.array(centroids)
            sq_dist = np.min(
                np.linalg.norm(data[:, np.newaxis, :] - chosen[np.newaxis, :, :], axis=2) ** 2,
                axis=1,
            )
            total = sq_dist.sum()
            # If every point coincides with a chosen centroid, fall back to a
            # uniform draw instead of dividing by zero.
            probabilities = sq_dist / total if total > 0 else None
            centroids.append(data[rng.choice(n_points, p=probabilities)])

        self.centroids = np.array(centroids)

    def initialize_centroids(self, data):
        """Initialise the centroids as `n_clusters` distinct random data points."""
        rng = self._make_rng()
        data = np.asarray(data, dtype=float)
        chosen = rng.choice(data.shape[0], self.n_clusters, replace=False)
        self.centroids = data[chosen]

    def assign_clusters(self, data):
        """Return, for each point, the index of its nearest centroid (as a list).

        Ties are resolved in favour of the lowest centroid index.
        """
        data = np.asarray(data, dtype=float)
        if data.size == 0:
            return []
        centroids = np.asarray(self.centroids, dtype=float)
        distances = np.linalg.norm(
            data[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
        )
        return np.argmin(distances, axis=1).tolist()

    def update_centroids(self, data):
        """Return new centroids: the mean of the points assigned to each cluster.

        A cluster with no assigned points keeps its previous centroid rather
        than being moved to the origin.
        """
        data = np.asarray(data, dtype=float)
        labels = np.asarray(self.labels)
        new_centroids = np.array(self.centroids, dtype=float)  # copy
        for i in range(self.n_clusters):
            cluster_points = data[labels == i]
            if len(cluster_points) > 0:
                new_centroids[i] = cluster_points.mean(axis=0)
        return new_centroids

    def fit(self, data):
        """Cluster `data` (array of shape (n_samples, n_features)).

        Iterates assignment and centroid update until every centroid moves
        less than 1e-6 or `max_iters` is reached.

        Returns:
            self

        Raises:
            ValueError: If `init` is not "random" or "k-means++".
        """
        data = np.asarray(data, dtype=float)
        if self.init == "random":
            self.initialize_centroids(data)
        elif self.init == "k-means++":
            self.initialize_centroids_kmeanspp(data)
        else:
            raise ValueError('init must be "random" or "k-means++", got %r' % (self.init,))

        for _ in range(self.max_iters):
            self.labels = self.assign_clusters(data)
            new_centroids = self.update_centroids(data)
            shift = np.linalg.norm(new_centroids - self.centroids, axis=1)
            self.centroids = new_centroids
            if np.all(shift < 1e-6):
                break

        return self

    def predict(self, data):
        """Return the index of the nearest fitted centroid for each point."""
        return self.assign_clusters(data)

####2. Run to find the optimal k

In [None]:
def calculate_wcss(data, centroids):
    """Compute the within-cluster sum of squares (WCSS).

    Each point is attributed to its nearest centroid, and the squared
    differences between points and their centroid are summed over all clusters.

    Args:
        data: Array of shape (n_samples, n_features).
        centroids: Sequence or array of cluster centres.

    Returns:
        The total within-cluster sum of squared distances.
    """
    pairwise = np.linalg.norm(data[:, np.newaxis] - centroids, axis=2)
    nearest = np.argmin(pairwise, axis=1)

    total = 0
    for cluster_id, center in enumerate(centroids):
        members = data[nearest == cluster_id]
        total = total + np.sum((members - center) ** 2)

    return total

In [None]:
# Elbow method: fit k-means for 1 .. k-1 clusters and record the WCSS of each.
losses = []
k = 20
for i in range(1,k):
  kmeans_i = Kmeans(n_clusters=i)
  kmeans_i.fit(x)

  # calculate_wcss already returns the scalar loss; no further min/sum
  # reduction is needed.
  loss = calculate_wcss(x, kmeans_i.centroids)
  losses.append(loss)

In [None]:
plt.figure(figsize=(10, 6))
plt.plot(range(1, k), losses, marker='o', linestyle='-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Wcss')

####Run with optimal k = 5

In [None]:
model = Kmeans(n_clusters=5, max_iters=100)
model.fit(x)

#### Visualize k-means clustering results

In [None]:
df = pd.DataFrame(x, columns=['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)'])
df['Cluster'] = model.labels

sns.pairplot(df, hue='Cluster', palette='viridis')
plt.suptitle('Pairplot of Data with Clusters', y=1.02)

In [None]:
# Pass the frame by keyword and refer to columns by name: a positional frame
# combined with x=/y= Series is rejected by seaborn versions whose first
# positional parameter is not `data`.
# NOTE(review): points are coloured by 'Gender'; if this plot is meant to show
# the clustering result, hue='Cluster' may be intended — confirm.
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Gender', palette='viridis', legend='full')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')

plt.show()