In [None]:
import sys
from pathlib import Path

# Make the project root importable so that `src.*` modules resolve from this notebook.
proj_path = (Path('/cluster') / 'work' / 'stefandt' / 'pers-pred').resolve()
# sys.path holds plain strings; a Path object never compares equal to a str, so the
# membership test has to use the string form or the path is appended on every re-run.
if str(proj_path) not in sys.path:
    sys.path.append(str(proj_path))

import pandas as pd
import gc
from src.utils import get_commons
import numpy as np
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from fcmeans import FCM  

In [None]:
# Unpack the shared project objects returned by src.utils.get_commons.
# Of these, the cells visible below only read `paths` (input file locations)
# and `constants` (label column name lists).
paths, constants, config, logger, device = get_commons()

In [None]:
# Read the raw author-level and comment-level tables.
pandora_authors = pd.read_csv(paths['raw']['pandora_authors'])
pandora_comments = pd.read_csv(paths['raw']['pandora_comments'])

# Rename the author columns to the label names used in the rest of the notebook.
pandora_authors = pandora_authors.rename(
    columns={
        'author': 'AUTHOR',
        'introverted': 'mbtiEXT',   # polarity flipped below
        'intuitive': 'mbtiSEN',     # polarity flipped below
        'thinking': 'mbtiTHI',
        'perceiving': 'mbtiJUD',    # polarity flipped below
        'agreeableness': 'sAGR',
        'openness': 'sOPN',
        'conscientiousness': 'sCON',
        'extraversion': 'sEXT',
        'neuroticism': 'sNEU',
    }
)

# Three source columns encode the opposite pole of their new name, so replace x with 1 - x.
pandora_authors[['mbtiEXT', 'mbtiSEN', 'mbtiJUD']] = (
    pandora_authors[['mbtiEXT', 'mbtiSEN', 'mbtiJUD']].rsub(1)
)

# Encode gender as a nullable boolean ('m' -> True, 'f' -> False, anything else -> <NA>).
pandora_authors['gender'] = (
    pandora_authors['gender'].map(dict(m=True, f=False)).astype('boolean')
)

# Give the comment table the same key / text column names.
pandora_comments = pandora_comments.rename(columns={'author': 'AUTHOR', 'body': 'TEXT'})

# Attach each author's profile to each of their comments (inner join on AUTHOR).
pandora2 = pandora_authors.merge(pandora_comments, on='AUTHOR')

In [None]:
def calculate_rmse(actual, predicted):
    """Return the root-mean-square error between ``actual`` and ``predicted``.

    Both arguments must support element-wise subtraction and expose ``.mean()``
    (NumPy arrays or pandas objects).
    """
    residual = actual - predicted
    mean_squared_error = (residual * residual).mean()
    return np.sqrt(mean_squared_error)

def find_optimal_k(X, y, k_values):
    """Choose the neighbour count whose leave-one-out KNN mean best reproduces ``y``.

    For every candidate ``k`` each sample's value is predicted as the mean of
    the ``y`` values of its ``k`` nearest *other* samples in ``X``; the ``k``
    with the lowest RMSE is returned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix used for the neighbour search.
    y : array-like of shape (n_samples,)
        Known target values, row-aligned with ``X`` (Series or ndarray).
    k_values : iterable of int
        Candidate neighbour counts. Candidates that are not smaller than
        ``n_samples`` are ignored because leave-one-out needs ``k`` other rows.

    Returns
    -------
    int
        The candidate with the smallest leave-one-out RMSE.

    Raises
    ------
    ValueError
        If no candidate satisfies ``1 <= k < n_samples``.
    """
    # kneighbors returns row *positions*; indexing a pandas Series with them would be
    # label-based (and 2-D), so work on a plain positional array instead.
    y_values = np.asarray(y, dtype=float)
    n_samples = len(y_values)

    candidates = [k for k in k_values if 1 <= k < n_samples]
    if not candidates:
        raise ValueError(
            f"No usable k in k_values for {n_samples} samples; need 1 <= k < n_samples."
        )

    errors = []
    for k in candidates:
        nbrs = NearestNeighbors(n_neighbors=k).fit(X)
        # Called without a query, kneighbors excludes each training point from its own
        # neighbour list. Querying with X itself would include the point (distance 0),
        # leaking the true value into the prediction and always favouring the smallest k.
        _, indices = nbrs.kneighbors()
        imputed_values = y_values[indices].mean(axis=1)
        errors.append(calculate_rmse(y_values, imputed_values))

    return candidates[int(np.argmin(errors))]

def knn_iterative_imputation(df, target_cols, k_values):
    """Fill missing numeric values with KNN (target columns) then IterativeImputer (the rest).

    Step 1: for every column in ``target_cols`` that has missing entries, the
    missing rows receive the mean target value of their nearest complete rows,
    measured on the numeric non-target columns. The neighbour count is chosen
    per column by ``find_optimal_k``.
    Step 2: any numeric value still missing afterwards is filled by
    ``IterativeImputer``, fitted on the frame that already contains the KNN
    results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    target_cols : list of str
        Numeric columns to impute with KNN. The neighbour search needs the
        numeric non-target columns to be free of NaN.
    k_values : iterable of int
        Candidate neighbour counts.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with missing numeric values filled.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    feature_cols = [c for c in numeric_cols if c not in target_cols]

    for col in target_cols:
        missing_mask = df[col].isnull()
        if not missing_mask.any():
            continue

        observed_mask = ~missing_mask
        n_observed = int(observed_mask.sum())
        # A k must be smaller than the number of complete rows for the search to be valid;
        # when nothing qualifies the column is left for the iterative imputer below.
        usable_k = [k for k in k_values if 1 <= k < n_observed]
        if not usable_k or not feature_cols:
            continue

        # Plain arrays keep every lookup positional, independent of df's index labels.
        X_complete = df.loc[observed_mask, feature_cols].to_numpy(dtype=float)
        y_complete = df.loc[observed_mask, col].to_numpy(dtype=float)
        X_missing = df.loc[missing_mask, feature_cols].to_numpy(dtype=float)

        optimal_k = find_optimal_k(X_complete, y_complete, usable_k)
        nbrs = NearestNeighbors(n_neighbors=optimal_k).fit(X_complete)
        _, indices = nbrs.kneighbors(X_missing)
        df.loc[missing_mask, col] = y_complete[indices].mean(axis=1)

    # Re-read the numeric data *after* the KNN step so its results are kept rather than
    # being overwritten by an imputation of the original, still-incomplete snapshot.
    numeric_data = df[numeric_cols]
    # IterativeImputer drops columns without any observed value, which would break the
    # column mapping, so only columns with at least one observation are passed to it.
    imputable_cols = [c for c in numeric_cols if numeric_data[c].notna().any()]
    if imputable_cols and numeric_data[imputable_cols].isnull().to_numpy().any():
        imputer = IterativeImputer(max_iter=10, random_state=0)
        filled = imputer.fit_transform(numeric_data[imputable_cols])
        # Keep df's own index: a default RangeIndex would misalign rows on assignment.
        df[imputable_cols] = pd.DataFrame(filled, columns=imputable_cols, index=df.index)
    return df

def fcki_imputation(df, target_cols, c_clusters, k_values):
    """Cluster rows with fuzzy c-means, then impute each cluster separately.

    Rows are clustered on the numeric columns that are not in ``target_cols``;
    ``knn_iterative_imputation`` is then run on each cluster's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. The numeric non-target columns are
        passed to the clusterer as-is, so they are expected to be NaN-free.
    target_cols : list of str
        Columns to impute.
    c_clusters : int
        Number of clusters passed to ``FCM``.
    k_values : iterable of int
        Candidate neighbour counts forwarded to ``knn_iterative_imputation``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose numeric columns carry the imputed values.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    relevant_cols = [col for col in numeric_cols if col not in target_cols]

    # Materialise the feature matrix once; it is used for both fit and predict.
    features = df[relevant_cols].values
    fcm = FCM(n_clusters=c_clusters)
    fcm.fit(features)
    labels = fcm.predict(features)

    imputed_data = df.copy()
    for cluster in np.unique(labels):
        in_cluster = labels == cluster
        # Explicit copy: the helper mutates its argument, and writing into a bare
        # slice of df triggers pandas' chained-assignment warning.
        cluster_data = df.loc[in_cluster].copy()
        imputed_cluster_data = knn_iterative_imputation(cluster_data, target_cols, k_values)
        # Only numeric columns can have changed, so only those are written back.
        imputed_data.loc[in_cluster, numeric_cols] = imputed_cluster_data[numeric_cols]
        # Clear memory
        del cluster_data, imputed_cluster_data
        gc.collect()
    return imputed_data

def calculate_wcss(data, max_clusters=10):
    """Return the k-means within-cluster sum of squares for k = 1 .. ``max_clusters``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to cluster.
    max_clusters : int, default 10
        Largest number of clusters to evaluate (inclusive).

    Returns
    -------
    list of float
        ``KMeans.inertia_`` for each k, in increasing order of k.
    """
    wcss = []
    for k in range(1, max_clusters + 1):
        # Fixed random_state keeps the curve reproducible between runs.
        kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans.fit(data)
        wcss.append(kmeans.inertia_)
    return wcss

def plot_elbow(wcss):
    """Plot WCSS against the number of clusters (elbow method).

    Parameters
    ----------
    wcss : sequence of float
        WCSS values where element ``i`` belongs to ``i + 1`` clusters.
    """
    # Derive the x-axis from the data so a curve of any length can be plotted;
    # a fixed range(1, 11) fails unless exactly ten values are supplied.
    plt.plot(range(1, len(wcss) + 1), wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()


# Label columns to impute: the four MBTI columns and the five Big Five score
# columns, using the names assigned by the rename in the loading cell.
target_cols = ['mbtiEXT', 'mbtiSEN', 'mbtiTHI', 'mbtiJUD', 'sAGR', 'sOPN', 'sCON', 'sEXT', 'sNEU']

In [None]:
# Select a subset without missing values
# NOTE(review): `pandora` is not assigned in any earlier cell visible here (the merged
# frame above is bound to `pandora2`), so on a fresh kernel this line would fail with a
# NameError — confirm where `pandora` is supposed to come from.
subset = pandora.dropna(subset=target_cols)

# Collect the names of the numeric columns of that subset; `numeric_cols` is reused
# by the imputation cell further down.
numeric_cols = subset.select_dtypes(include=[np.number]).columns

# Disabled elbow-method exploration for choosing the number of clusters.
# The guard is a literal False, so nothing in this branch is executed.
if False:
    subset = pandora.dropna(subset=target_cols)

    numeric_cols = subset.select_dtypes(include=[np.number]).columns
    # Fill the remaining gaps in the numeric columns with the subset's column means.
    subset[numeric_cols] = subset[numeric_cols].fillna(subset[numeric_cols].mean())

    # Cluster only on the numeric columns that are not imputation targets.
    relevant_cols = [col for col in numeric_cols if col not in target_cols]

    wcss = calculate_wcss(subset[relevant_cols].values)
    plot_elbow(wcss)

In [None]:
# Imputation settings: number of fuzzy c-means clusters and the candidate
# neighbour counts (2..10) evaluated for each target column.
c_clusters = 3 
k_values = range(2, 11)

# Apply FCKI imputation
# Ensure all necessary columns are filled with mean values before clustering
# NOTE(review): `numeric_cols` holds every numeric column of the complete-case subset.
# If the target columns are numeric (the `1 - ...` flip in the loading cell suggests
# they are), they are mean-filled here as well, which would leave no missing targets
# for fcki_imputation to fill — confirm whether only non-target columns should be filled.
pandora[numeric_cols] = pandora[numeric_cols].fillna(pandora[numeric_cols].mean())
# NOTE(review): fcki_imputation returns a modified copy, so the imputed values live in
# `imputed_data_fcki`; `pandora` itself only carries the mean fill from the line above.
imputed_data_fcki = fcki_imputation(pandora, target_cols, c_clusters, k_values)

In [None]:
# Display the frame returned by fcki_imputation.
imputed_data_fcki

In [None]:
# Display `pandora` (the input frame, after the mean fill applied in the cell above).
pandora

In [None]:
# Output location of the filled PANDORA table.
path_pandora = Path('/cluster', 'work', 'stefandt', 'pers-pred', 'data', 'filled', 'pandora.csv')

In [None]:
# Persist the filled table. The target directory is created if needed so a first run
# on a clean checkout does not fail.
# NOTE(review): this writes `pandora`, not `imputed_data_fcki` — confirm that is intended.
path_pandora.parent.mkdir(parents=True, exist_ok=True)
# index=False: otherwise the row index is written as an unnamed first column and comes
# back as a spurious 'Unnamed: 0' column when the file is re-read with pd.read_csv.
pandora.to_csv(path_pandora, index=False)

In [None]:
# Spot-check the target columns for one author (first ten rows).
pandora.loc[pandora['AUTHOR'] == '-BigSexy-', target_cols].head(10)

In [None]:
# Re-load the saved table from disk; from here on `pandora` is the CSV content.
pandora = pd.read_csv(path_pandora)

In [None]:
# Combined list of the Big Five score columns and the MBTI columns, as named in `constants`.
pand_cols = constants['bigfive_s_columns'] + constants['mbti_columns']

In [None]:
# Display the personality columns of the re-loaded table.
pandora[pand_cols]

In [None]:
# Binarise the labels: MBTI values above 0.5 become 1 (else 0), and each Big Five score
# above 50 yields a 1 in the corresponding class column (else 0).
pandora[constants['mbti_columns']] = pandora[constants['mbti_columns']].gt(0.5).astype(int)
pandora[constants['bigfive_c_columns']] = pandora[constants['bigfive_s_columns']].gt(50).astype(int)

In [None]:
# Display the label columns after binarisation.
pandora[constants['label_columns']]

In [None]:
# Per-author label means, restricted to rows where the `cEXT` and `mbtiEXT` labels disagree.
(
    pandora[pandora['cEXT'].ne(pandora['mbtiEXT'])]
    .groupby('AUTHOR')[constants['label_columns']]
    .mean()
)

In [None]:
# Keep only the rows of the un-imputed merge that have every personality column present.
pandora2 = pandora2.dropna(subset=pand_cols)

In [None]:
# Derive the Big Five class columns for the un-imputed data with the same > 50 threshold.
# NOTE(review): unlike `pandora`, the MBTI columns of `pandora2` are not thresholded here —
# confirm they are already 0/1 before they are compared with the class columns below.
pandora2[constants['bigfive_c_columns']] = pandora2[constants['bigfive_s_columns']].gt(50).astype(int)

In [None]:
# Shape of the per-author label means: (number of authors, number of label columns).
pandora2.groupby('AUTHOR')[constants['label_columns']].mean().shape

In [None]:
# Same shape, restricted to rows where `cEXT` and `mbtiEXT` disagree in the un-imputed data.
(
    pandora2[pandora2['cEXT'].ne(pandora2['mbtiEXT'])]
    .groupby('AUTHOR')[constants['label_columns']]
    .mean()
    .shape
)