In [1]:
import sys
from pathlib import Path

proj_path = Path('/cluster') / 'work' / 'stefandt' / 'pers-pred'
proj_path = proj_path.resolve()
if proj_path not in sys.path: sys.path.append(str(proj_path))

import pandas as pd
import numpy as np
from src.utils import get_commons
from sklearn.metrics import roc_curve


In [2]:
paths, constants, config, logger, device = get_commons()

2024-05-24 11:36:18,766 - ArgumentLogger - INFO - Arguments:
2024-05-24 11:36:18,768 - ArgumentLogger - INFO - dataframe: {'generate': False, 'mbti_frac': 0.1, 'bigfive_c_frac': 1.0, 'bigfive_s_frac': 1.0}
2024-05-24 11:36:18,768 - ArgumentLogger - INFO - eda: {'generate': False}
2024-05-24 11:36:18,769 - ArgumentLogger - INFO - reduce: {'generate': False, 'use_full': False}
2024-05-24 11:36:18,770 - ArgumentLogger - INFO - preprocessing: {'generate_features': False, 'generate_partially_cleaned': False, 'generate_cleaned': False, 'generate_embeddings': False, 'generate_aggregated': True, 'generate_glove': False, 'generate_filled': True, 'model_name': 'distilbert'}
2024-05-24 11:36:18,771 - ArgumentLogger - INFO - dataloaders: {'train': {'num_workers': 1, 'pin_memory': False, 'batch_size': 1028, 'shuffle': True, 'drop_last': True}, 'test': {'num_workers': 1, 'pin_memory': False, 'batch_size': 1028, 'shuffle': False, 'drop_last': False}}
2024-05-24 11:36:18,771 - ArgumentLogger - INFO - 

device: cpu


In [3]:
datas = []
for task in constants["tasks"]:
    data = pd.read_csv(paths["split"]["distilbert"][task], header=[0, 1], index_col=0)
    data["TASK", task] = True
    datas.append(data)

In [4]:
data = pd.concat(datas, copy=False, axis=1)

In [5]:
data["TASK"] = data["TASK"].fillna(False)

  data["TASK"] = data["TASK"].fillna(False)
  data["TASK"] = data["TASK"].fillna(False)
  data["TASK"] = data["TASK"].fillna(False)
  data["TASK"] = data["TASK"].fillna(False)


In [6]:
drops = [["TARGET", big] for big in constants["bigfive_c_columns"]]

In [7]:
data = data.drop(drops, axis = 1)

In [8]:
data["TARGET"]

FEATURE,mbtiEXT,mbtiJUD,mbtiSEN,mbtiTHI,sAGR,sCON,sEXT,sNEU,sOPN
AUTHOR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-9221022384933360074,0.0,0.0,0.0,1.0,,,,,
-9220031623198266213,0.0,1.0,1.0,1.0,,,,,
-9219633155989415906,0.0,0.0,0.0,1.0,,,,,
-9219237589017844173,0.0,0.0,0.0,0.0,,,,,
-9214568075844254832,0.0,0.0,0.0,1.0,,,,,
...,...,...,...,...,...,...,...,...,...
9154871534596062825,,,,,11.0,63.0,89.0,9.0,78.0
9157088328270352664,,,,,15.0,18.0,12.0,19.0,30.0
9183557572572801954,,,,,17.0,35.0,25.0,84.0,9.0
9186055652387137827,,,,,10.0,94.0,18.0,66.0,84.0


In [9]:
# Regression coefficients from the provided tables
coefficients = {
    'EI': {'Neuroticism': -0.01, 'Extraversion': 0.16, 'Openness': 0.00, 'Agreeableness': 0.02, 'Conscientiousness': -0.01},
    'SN': {'Neuroticism': -0.02, 'Extraversion': -0.01, 'Openness': -0.03, 'Agreeableness': 0.05, 'Conscientiousness': -0.05},
    'TF': {'Neuroticism': -0.12, 'Extraversion': -0.04, 'Openness': -0.08, 'Agreeableness': -0.39, 'Conscientiousness': 0.17},
    'JP': {'Neuroticism': 0.04, 'Extraversion': -0.02, 'Openness': -0.11, 'Agreeableness': 0.03, 'Conscientiousness': 0.13}
}

# Function to predict MBTI scores from Big Five scores
def predict_mbti(big_five_scores):
    mbti_scores = {}
    for mbti, coeffs in coefficients.items():
        score = sum(big_five_scores[trait] * coeff for trait, coeff in coeffs.items())
        mbti_scores[mbti] = score
    return mbti_scores

# Function to find the optimal threshold using Youden's J statistic
def find_optimal_threshold(y_true, y_scores):
    print(y_true, y_scores)
    fpr, tpr, thresholds = roc_curve(y_true, y_scores)
    youden_j = tpr - fpr
    optimal_idx = np.argmax(youden_j)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

# Function to convert MBTI scores to binary labels using optimal thresholds
def mbti_to_binary_with_thresholds(mbti_scores, optimal_thresholds):
    binary_labels = {}
    for dimension, score in mbti_scores.items():
        binary_labels[dimension] = 1 if score > optimal_thresholds[dimension] else 0
    return binary_labels

# Function to predict Big Five scores from MBTI scores
def predict_big_five(mbti_scores):
    # Create a matrix of coefficients
    coeff_matrix = np.array([
        [coefficients['EI']['Neuroticism'], coefficients['EI']['Extraversion'], coefficients['EI']['Openness'], coefficients['EI']['Agreeableness'], coefficients['EI']['Conscientiousness']],
        [coefficients['SN']['Neuroticism'], coefficients['SN']['Extraversion'], coefficients['SN']['Openness'], coefficients['SN']['Agreeableness'], coefficients['SN']['Conscientiousness']],
        [coefficients['TF']['Neuroticism'], coefficients['TF']['Extraversion'], coefficients['TF']['Openness'], coefficients['TF']['Agreeableness'], coefficients['TF']['Conscientiousness']],
        [coefficients['JP']['Neuroticism'], coefficients['JP']['Extraversion'], coefficients['JP']['Openness'], coefficients['JP']['Agreeableness'], coefficients['JP']['Conscientiousness']]
    ])
    
    # Create a vector of MBTI scores
    mbti_vector = np.array([mbti_scores['EI'], mbti_scores['SN'], mbti_scores['TF'], mbti_scores['JP']])
    
    # Solve the linear system to find Big Five scores
    big_five_scores = np.linalg.lstsq(coeff_matrix, mbti_vector, rcond=None)[0]
    
    # Map the results to the Big Five traits
    big_five_traits = ['Neuroticism', 'Extraversion', 'Openness', 'Agreeableness', 'Conscientiousness']
    big_five_dict = {trait: score for trait, score in zip(big_five_traits, big_five_scores)}
    
    return big_five_dict

# Function to fill missing values in the DataFrame
def fill_missing_values(df, optimal_thresholds):
    for index, row in df.iterrows():
        if pd.isna(row[('TARGET', 'mbtiEXT')]):
            big_five_scores = {
                'Neuroticism': row[('TARGET', 'sNEU')],
                'Extraversion': row[('TARGET', 'sEXT')],
                'Openness': row[('TARGET', 'sOPN')],
                'Agreeableness': row[('TARGET', 'cAGR')],
                'Conscientiousness': row[('TARGET', 'sCON')]
            }
            predicted_mbti = predict_mbti(big_five_scores)
            binary_mbti = mbti_to_binary_with_thresholds(predicted_mbti, optimal_thresholds)
            df.at[index, ('TARGET', 'mbtiEXT')] = binary_mbti['EI']
            df.at[index, ('TARGET', 'mbtiJUD')] = binary_mbti['JP']
            df.at[index, ('TARGET', 'mbtiSEN')] = binary_mbti['SN']
            df.at[index, ('TARGET', 'mbtiTHI')] = binary_mbti['TF']
        elif pd.isna(row[('TARGET', 'sNEU')]):
            mbti_scores = {
                'EI': row[('TARGET', 'mbtiEXT')],
                'SN': row[('TARGET', 'mbtiSEN')],
                'TF': row[('TARGET', 'mbtiTHI')],
                'JP': row[('TARGET', 'mbtiJUD')]
            }
            big_five_scores = predict_big_five(mbti_scores)
            df.at[index, ('TARGET', 'sNEU')] = big_five_scores['Neuroticism']
            df.at[index, ('TARGET', 'sEXT')] = big_five_scores['Extraversion']
            df.at[index, ('TARGET', 'sOPN')] = big_five_scores['Openness']
            df.at[index, ('TARGET', 'cAGR')] = big_five_scores['Agreeableness']
            df.at[index, ('TARGET', 'sCON')] = big_five_scores['Conscientiousness']
    return df

# Extract rows with both MBTI and Big Five scores
complete_rows = data.dropna()

# Calculate optimal thresholds for each MBTI dimension using complete rows
optimal_thresholds = {}
for mbti_dimension in ['mbtiEXT', 'mbtiJUD', 'mbtiSEN', 'mbtiTHI']:
    y_true = complete_rows[('TARGET', mbti_dimension)].values
    select = [["TARGET", big] for big in constants["bigfive_s_columns"]]
    big_five_scores = complete_rows[select].values
    y_scores = [predict_mbti({
        'Neuroticism': row[0],
        'Extraversion': row[1],
        'Openness': row[2],
        'Agreeableness': row[3],
        'Conscientiousness': row[4]
    })[mbti_dimension[-3:]] for row in big_five_scores]
    optimal_thresholds[mbti_dimension[-3:]] = find_optimal_threshold(y_true, y_scores)

# Fill missing values in the DataFrame
df_filled = fill_missing_values(data, optimal_thresholds)

print(df_filled)


  y_true = complete_rows[('TARGET', mbti_dimension)].values


[] []


ValueError: unknown format is not supported