## Load datasets

In [None]:
import pandas as pd
import os

# Maps the min_df token taken from each file name to the loaded split.
dfs_train = {}
dfs_val = {}

# os.listdir() returns entries in arbitrary order; sorting keeps the dict
# (and therefore every later loop / plot legend) order reproducible.
for file in sorted(os.listdir("vectorized_data")):
    if file.endswith("train.csv"):
        target = dfs_train
    elif file.endswith("val.csv"):
        target = dfs_val
    else:
        # Anything that is not a train/val CSV is ignored.
        continue
    # The key is the second-to-last "_"-separated token of the file name,
    # i.e. the part right before "_train.csv" / "_val.csv".
    key = file.split("_")[-2]
    target[key] = pd.read_csv(os.path.join("vectorized_data", file), index_col=0)

# Repack every loaded frame as {'label': target column, 'data': remaining columns}.
for split_name, dfs in (("train", dfs_train), ("validation", dfs_val)):
    for key in dfs:
        frame = dfs[key]
        dfs[key] = {
            'label': frame["category1"],
            'data': frame.drop(columns=["category1"]),
        }

        # Outer double quotes: reusing the same quote inside an f-string
        # replacement field is only valid syntax from Python 3.12 onwards.
        print(f"DFs {split_name} {key} (data):{dfs[key]['data'].shape}")

## Prepare datasets

## Evaluate accuracy with k-NN

### Determine best vectorizer min_df value

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def evaluate_classification_single(knn, data, data_class):
    """Score a fitted classifier on one dataset.

    Args:
        knn: fitted estimator exposing ``predict``.
        data: feature matrix passed to ``knn.predict``.
        data_class: true labels for ``data``.

    Returns:
        The value of ``accuracy_score(data_class, predictions)``.
    """
    return accuracy_score(data_class, knn.predict(data))

def evaluate_classification_dataset(df_train, df_val, n_neighbors):
    """Sweep k for a cosine-metric k-NN classifier and record accuracies.

    Args:
        df_train: mapping with ``'data'`` (features) and ``'label'`` (targets)
            used to fit each classifier.
        df_val: mapping with the same two keys, used only for scoring.
        n_neighbors: iterable of k values; one classifier is fitted per value.

    Returns:
        ``(accuracy_train, accuracy_val)``: two lists with one accuracy per k,
        in the order of ``n_neighbors``.
    """
    paired_scores = []
    for k in n_neighbors:
        model = KNeighborsClassifier(n_neighbors=k, metric='cosine')
        model.fit(df_train['data'], df_train['label'])
        train_score = evaluate_classification_single(model, df_train['data'], df_train['label'])
        val_score = evaluate_classification_single(model, df_val['data'], df_val['label'])
        paired_scores.append((train_score, val_score))

    accuracy_train = [train_score for train_score, _ in paired_scores]
    accuracy_val = [val_score for _, val_score in paired_scores]
    return accuracy_train, accuracy_val



In [None]:
# Neighbourhood sizes swept for every loaded dataset.
n_neighbors = [2, 5, 10, 15, 20, 25]
# Alternative grid: n_neighbors = [1, 3, 5, 7, 9, 11, 13, 15]

# key -> {'train': [...], 'val': [...]}, one accuracy per entry of n_neighbors.
accuracies = {}

for key, train_set in dfs_train.items():
    print(f'Evaluating dataset {key}')
    acc_train, acc_val = evaluate_classification_dataset(train_set, dfs_val[key], n_neighbors)
    accuracies[key] = dict(train=acc_train, val=acc_val)
    print(f'--> Accuracy lengths ({key}): \n {len(acc_train)}\n {len(acc_val)}')

In [None]:
def plot_accuracy(accuracies, n_neighbors):
    """Plot train and validation accuracy curves side by side.

    Args:
        accuracies: mapping ``name -> {'train': [...], 'val': [...]}`` where each
            list holds one accuracy per entry of ``n_neighbors``.
        n_neighbors: x-axis values (the k values that were evaluated).

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(10, 5))

    for key in accuracies:
        scores = accuracies[key]
        ax_train.plot(n_neighbors, scores['train'], label=f'Train {key}')
        ax_val.plot(n_neighbors, scores['val'], label=f'Validation {key}')

    # Axis decoration does not depend on the plotted series, so it is applied
    # once per panel instead of once per series.
    for ax, title in ((ax_train, 'Train Accuracy'), (ax_val, 'Validation Accuracy')):
        ax.set_xlabel('Number of Neighbors')
        ax.set_ylabel('Accuracy')
        ax.set_title(title)
        if accuracies:
            # legend() warns when there are no labelled artists to show.
            ax.legend()

    fig.tight_layout()
    plt.show()

In [None]:
# plot_accuracy(accuracies, n_neighbors)

### Create reduced datasets

In [None]:
# min_df value judged best from the k-NN accuracy plot.
# NOTE: this must be a string equal to one of the keys of dfs_train / dfs_val,
# i.e. the second-to-last "_"-separated token of the vectorized CSV file names.
min_df_val = '0.01'

#### Correlation

In [None]:
def calculateMeanAbsCorrelation(df):
    """Return, for every column, the mean of its absolute correlations.

    The mean is taken over the full correlation matrix row, so the column's
    correlation with itself is part of the average.

    Args:
        df: DataFrame whose columns are the features to compare.

    Returns:
        Series indexed by column name.
    """
    correlation_matrix = df.corr()
    absolute_correlation = correlation_matrix.abs()
    return absolute_correlation.mean()

def selectLowestCorrelationFeatures(df, num_features):
    """Keep the ``num_features`` columns with the lowest mean absolute correlation.

    Args:
        df: DataFrame whose columns are the candidate features.
        num_features: how many columns to keep.

    Returns:
        ``(list_of_features, reduced_df)``: the selected column names ordered
        from least to most correlated, and ``df`` restricted to those columns.
    """
    ranked = calculateMeanAbsCorrelation(df).sort_values(ascending=True)
    list_of_features = ranked[:num_features].index.tolist()
    return list_of_features, df[list_of_features]

In [None]:
# Build one correlation-reduced training set per feature count, keyed by the
# count as a string ('3', '9').
dfs_train_corr = {}
for num_features in (3, 9):
    list_of_features, df = selectLowestCorrelationFeatures(dfs_train[min_df_val]['data'], num_features)
    print(f'Features: {list_of_features}')
    dfs_train_corr[str(num_features)] = {
        'label': dfs_train[min_df_val]['label'],
        'data': df,
        'features': list_of_features
    }

# Individual handles to the two reduced training sets.
dfs_train_corr3 = dfs_train_corr['3']
dfs_train_corr9 = dfs_train_corr['9']

# The validation sets reuse the columns selected on the training data.
dfs_val_corr = {
    name: {
        'label': dfs_val[min_df_val]['label'],
        'data': dfs_val[min_df_val]['data'][train_set['features']]
    }
    for name, train_set in dfs_train_corr.items()
}

#### PCA

In [None]:
from sklearn.decomposition import PCA

# Build one PCA-reduced training set per configuration:
#   '3'  -> exactly 3 components
#   '90' -> n_components=0.9 (a fractional value asks PCA to keep enough
#           components to reach that share of explained variance)
dfs_train_pca = {}
for name, n_components in (('3', 3), ('90', 0.9)):
    # random_state pins the result when the 'auto' solver policy picks the
    # randomized SVD; it is ignored by the exact solvers. 37 matches the seed
    # used for k-means in this notebook.
    pca = PCA(n_components=n_components, whiten=True, random_state=37).fit(dfs_train[min_df_val]['data'])
    print(f'PCA components: {pca.n_components_}')
    dfs_train_pca[name] = {
        'label': dfs_train[min_df_val]['label'],
        'data': pca.transform(dfs_train[min_df_val]['data']),
        'pca': pca
    }

# Individual handles to the two reduced training sets.
dfs_train_pca3 = dfs_train_pca['3']
dfs_train_pca90 = dfs_train_pca['90']

# Validation data is projected with the PCA fitted on the training data.
dfs_val_pca = {
    name: {
        'label': dfs_val[min_df_val]['label'],
        'data': train_set['pca'].transform(dfs_val[min_df_val]['data'])
    }
    for name, train_set in dfs_train_pca.items()
}

### Select the most accurate dataset for each reduction method

In [None]:
# accuracies_corr = {}
# accuracies_pca = {}

# n_neighbors = [2, 5, 10, 15, 20, 25]

# for key in dfs_train_corr:
#     print('Correlation')
#     print(f'Evaluating dataset {key}')
#     acc_train, acc_val = evaluate_classification_dataset(dfs_train_corr[key], dfs_val_corr[key], n_neighbors)
#     accuracies_corr[key] = {
#         'train': acc_train,
#         'val': acc_val
#     }

# for key in dfs_train_pca:
#     print('PCA')
#     print(f'Evaluating dataset {key}')
#     acc_train, acc_val = evaluate_classification_dataset(dfs_train_pca[key], dfs_val_pca[key], n_neighbors)
#     accuracies_pca[key] = {
#         'train': acc_train,
#         'val': acc_val
#     }
#     print(f'--> Accuracy lengths ({key}): \n {len(acc_train)}\n {len(acc_val)}')

# plot_accuracy(accuracies_corr, n_neighbors)
# plot_accuracy(accuracies_pca, n_neighbors)

In [None]:
# One reduced dataset per reduction method, used by the clustering evaluation.
# NOTE(review): '3' (correlation) and '90' (PCA) are presumably the winners of
# the commented-out accuracy comparison — confirm before changing.
best_dfs_train = dict(
    corr=dfs_train_corr['3'],
    pca=dfs_train_pca['90'],
)

best_dfs_val = dict(
    corr=dfs_val_corr['3'],
    pca=dfs_val_pca['90'],
)

### Evaluate the clustering methods

#### Define helper functions

In [None]:

from sklearn.metrics import davies_bouldin_score, adjusted_rand_score, silhouette_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

def evaluate_performance_single(data_train, data_test, labels, k, method):
  """Fit one clustering model and score its assignment of ``data_test``.

  Args:
    data_train: data handed to ``method`` to build the model.
    data_test: data whose cluster assignment is scored.
    labels: ground-truth labels for ``data_test`` (used for the Rand score).
    k: number of clusters passed to ``method``.
    method: callable ``method(data_train, k)`` returning a fitted estimator.

  Returns:
    ``[davies_bouldin, silhouette (cosine metric), adjusted_rand]``.
  """
  trained = method(data_train, k)

  # Use the model fitted on the training data whenever the estimator can assign
  # new samples. Calling fit_predict here would silently re-fit on data_test and
  # throw the training fit away. Estimators without ``predict`` keep the
  # fit_predict behaviour.
  if hasattr(trained, 'predict'):
    y_pred = trained.predict(data_test)
  else:
    y_pred = trained.fit_predict(data_test)

  return [
    davies_bouldin_score(data_test, y_pred),
    silhouette_score(data_test, y_pred, metric='cosine'),
    adjusted_rand_score(labels, y_pred),
  ]

def evaluate_performance_method(dbs_train, dbs_val, labels, nclusters, method, dbs_names=None):
  """Evaluate one clustering method on several datasets over a range of k.

  Args:
    dbs_train: sequence of training datasets.
    dbs_val: sequence of evaluation datasets, parallel to ``dbs_train``.
    labels: sequence of ground-truth labels, parallel to ``dbs_val``.
    nclusters: iterable of cluster counts to try.
    method: callable forwarded to ``evaluate_performance_single``.
    dbs_names: optional display names, parallel to ``dbs_train``. When omitted
      or empty, datasets are named ``db0``, ``db1``, ...

  Returns:
    A list with one ``[name, per_metric]`` entry per dataset, where
    ``per_metric`` is a list of tuples: one tuple per metric, each holding that
    metric's value for every entry of ``nclusters``.
  """
  metrics = []
  for i in range(len(dbs_train)):
    per_k = [
      evaluate_performance_single(dbs_train[i], dbs_val[i], labels[i], k, method)
      for k in nclusters
    ]
    # Transpose from "one row per k" to "one row per metric" for plotting.
    per_metric = list(zip(*per_k))
    name = dbs_names[i] if dbs_names else f'db{i}'
    metrics.append([name, per_metric])
  return metrics

def plot_results(results, number_clusters):
  """Draw one panel per clustering metric, one line per dataset.

  Args:
    results: sequence of ``[name, per_metric]`` entries, where ``per_metric[j]``
      holds metric ``j`` for every entry of ``number_clusters``
      (0: Davies-Bouldin, 1: silhouette, 2: adjusted Rand).
    number_clusters: x-axis values (the cluster counts that were evaluated).

  Returns:
    None. The figure is displayed with ``plt.show()``.
  """
  y_labels = ('Davies-Bouldin Score', 'Silhouette Score', 'Adjusted Rand Score')
  fig, axs = plt.subplots(1, 3, figsize=(15, 5))

  for metric_idx, y_label in enumerate(y_labels):
    ax = axs[metric_idx]
    for entry in results:
      ax.plot(number_clusters, entry[1][metric_idx], label=entry[0])
    ax.set_xlim(number_clusters[0], number_clusters[-1])
    ax.legend()
    ax.set_xlabel('k')
    ax.set_ylabel(y_label)

  plt.tight_layout()
  plt.show()

#### k-Means

In [None]:
def kmeans(dataset, n_clusters, n_init=10, max_iter=300, random_state=37):
  """Fit a k-means++ initialised KMeans model on ``dataset``.

  Args:
    dataset: samples to cluster.
    n_clusters: number of clusters.
    n_init: number of k-means runs with different centroid seeds.
    max_iter: maximum number of iterations of a single run.
    random_state: seed that makes the centroid initialisation reproducible.

  Returns:
    The fitted ``sklearn.cluster.KMeans`` instance.
  """
  from sklearn.cluster import KMeans

  # The previous values (n_init=300, max_iter=10) looked swapped: they paid for
  # 300 restarts while stopping every run after only 10 iterations.
  model = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init=n_init,
    max_iter=max_iter,
    random_state=random_state,
  )

  model.fit(dataset)

  return model

In [None]:
# Cluster counts to evaluate.
number_clusters = [2, 5, 10, 15, 20, 25]

# Reduction methods to compare, in the same order as the display names below.
reduction_keys = ('corr', 'pca')

kmeans_results = evaluate_performance_method(
        [best_dfs_train[name]['data'] for name in reduction_keys],
        [best_dfs_val[name]['data'] for name in reduction_keys],
        [best_dfs_val[name]['label'] for name in reduction_keys],
        number_clusters,
        kmeans,
        dbs_names=['Correlation', 'PCA']
    )

In [None]:
# kmeans_results is the [name, per_metric] list built by
# evaluate_performance_method, which is the structure plot_results expects.
# plot_accuracy needs a {'train': ..., 'val': ...} mapping and raises TypeError here.
plot_results(kmeans_results, number_clusters)