## Load datasets

In [None]:
import pandas as pd
import os

# Load every vectorized split found in the data directory into two dicts,
# one for training frames and one for validation frames.
# The dict key is the second-to-last "_"-separated token of the file name,
# e.g. "<prefix>_<key>_train.csv" -> "<key>" (naming scheme assumed from the
# split below; a matching file without at least one "_" raises IndexError).
data_dir = "vectorized_data"
dfs_train = {}
dfs_val = {}
# os.listdir() returns entries in arbitrary order; sorting makes the dict
# insertion order (and every later loop/print over it) reproducible.
for file in sorted(os.listdir(data_dir)):
    if file.endswith("train.csv"):
        target = dfs_train
    elif file.endswith("val.csv"):
        target = dfs_val
    else:
        # Not one of the expected splits; leave it alone.
        continue
    # The first CSV column is used as the row index.
    df = pd.read_csv(os.path.join(data_dir, file), index_col=0)
    key = file.split("_")[-2]
    target[key] = df

# Split each loaded frame into its target column ("category1") and the
# remaining feature columns. Each dict entry is rebound from a DataFrame to a
# {'label', 'data'} dict, so running this twice without reloading the CSVs
# fails on the "category1" lookup.
for key in dfs_train:
    frame = dfs_train[key]
    dfs_train[key] = {
        'label': frame["category1"],
        'data': frame.drop(columns=["category1"])
    }

    # The inner subscript uses double quotes: reusing the outer quote
    # character inside an f-string is a SyntaxError before Python 3.12.
    print(f'DFs train {key} (data):{dfs_train[key]["data"].shape}')

for key in dfs_val:
    frame = dfs_val[key]
    dfs_val[key] = {
        'label': frame["category1"],
        'data': frame.drop(columns=["category1"])
    }

    print(f'DFs validation {key} (data):{dfs_val[key]["data"].shape}')

## Prepare datasets

In [39]:
def calculateMeanAbsCorrelation(df):
    """Return, per column, the mean absolute correlation with all columns.

    The pairwise correlation matrix of ``df`` is computed, every entry is
    replaced by its absolute value, and each column of that matrix is
    averaged. The average includes the column's correlation with itself.
    The result is a Series indexed by column name.
    """
    correlation_matrix = df.corr()
    absolute_correlations = correlation_matrix.abs()
    return absolute_correlations.mean()

def selectLowestCorrelationFeatures(df, num_features):
    """Pick the ``num_features`` columns least correlated with the rest.

    Columns are ranked by the score from ``calculateMeanAbsCorrelation``
    (ascending) and the first ``num_features`` of that ranking are kept.

    Returns a tuple ``(names, subset)``: the selected column names in ranking
    order, and ``df`` restricted to those columns in the same order.
    """
    ranked_scores = calculateMeanAbsCorrelation(df).sort_values(ascending=True)
    selected_names = ranked_scores[:num_features].index.tolist()
    return selected_names, df[selected_names]

### Correlation

In [None]:
# Reduced training sets that keep only the N least-correlated features,
# for N = 3 and N = 9. Both variants share one loop body; supporting another
# feature count only needs a new dict and a new (count, dict) pair below.
dfs_train_corr3 = {}
dfs_train_corr9 = {}
for num_features, reduced in ((3, dfs_train_corr3), (9, dfs_train_corr9)):
    for key, item in dfs_train.items():
        print(f'Generating datasets with {num_features} least correlated features for training data {key}')
        list_of_features, df = selectLowestCorrelationFeatures(item['data'], num_features)
        print(f'Features: {list_of_features}')
        reduced[key] = {
            # Labels are shared with the full training set, not copied.
            'label': item['label'],
            'data': df,
            # Names of the columns that were selected, in ranking order.
            'features': list_of_features
        }

### PCA

In [None]:
from sklearn.decomposition import PCA

# PCA-reduced training sets: one with a fixed 3 components, one with as many
# components as are needed to explain 90% of the variance.
dfs_train_pca3 = {}
dfs_train_pca90 = {}
# Each entry: (n_components passed to PCA, destination dict, progress text).
# An integer n_components is a component count; a float in (0, 1) is a target
# fraction of explained variance, so 0.9 does NOT mean "90 features".
for n_components, reduced, description in (
    (3, dfs_train_pca3, '3 PCA features'),
    (0.9, dfs_train_pca90, 'PCA features explaining 90% of the variance'),
):
    for key, item in dfs_train.items():
        print(f'Generating datasets with {description} for training data {key}')
        pca = PCA(n_components=n_components, whiten=True)
        # fit_transform returns a plain array, not a DataFrame.
        df = pca.fit_transform(item['data'])
        reduced[key] = {
            'label': item['label'],
            'data': df,
            # Keep the fitted transformer so the identical projection can be
            # applied to other splits with pca.transform(...).
            'pca': pca
        }

## Clustering

### Define helper functions