In [None]:
import logging
import os
from datetime import datetime

current_file_name = "12_PDU_Aggregations_and_Models"

# Timestamp of this run; it names the log file and is reused in the names of
# the artifacts written further down in the notebook.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away and does not create missing parent
# directories, so make sure logs/<notebook name>/ exists before configuring.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file,filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from scipy.spatial import distance
import plotly.express as px
from sklearn.cluster import KMeans
from umap import UMAP

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

from sklearn import svm

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression

import xgboost as xgb

from imblearn.ensemble import BalancedRandomForestClassifier

from imblearn.ensemble import BalancedBaggingClassifier

import random

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

from numpy import mean, std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
import statsmodels.stats as sm_stats

import textwrap
import shap

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.questions import *
from helpers.utils import *
from helpers.machine_learning import *

In [None]:
pd.set_option('display.max_columns', 500)

In [None]:
def get_dict_of_paths(root_path, file_extension=".csv"):
    """Map folder names under ``root_path`` to the matching files they contain.

    ``root_path`` is walked recursively. Every directory that holds at least
    one file gets an entry keyed by the directory's own name (its last path
    component); the value is the list of full paths of the files in it whose
    name ends with ``file_extension`` (the list may be empty when none match).
    Directories that share the same name overwrite each other's entry.

    Args:
        root_path: Directory to walk.
        file_extension: Suffix a file name must end with to be listed.

    Returns:
        dict mapping folder name -> list of file paths.
    """
    dict_of_paths = {}
    for root, _dirs, files in os.walk(root_path):
        if not files:
            continue
        matching = [os.path.join(root, f) for f in files if f.endswith(file_extension)]

        # Use os.path rather than splitting on "\\" so the key is the folder
        # name regardless of the platform's path separator.
        folder_name = os.path.basename(os.path.normpath(root))
        dict_of_paths[folder_name] = matching
    return dict_of_paths

In [None]:
words_analysis_fg_path = "data\\11_Pause_Defined_Units\\FG"
words_analysis_h_path = "data\\11_Pause_Defined_Units\\H"

In [None]:
fg_paths = get_dict_of_paths(words_analysis_fg_path)
h_paths = get_dict_of_paths(words_analysis_h_path)

In [None]:
# Function to normalize features safely
def z_normalize(group):
    """Z-score a pandas Series, falling back to zeros when it has no spread.

    Args:
        group: Numeric Series (one feature of one group).

    Returns:
        ``(group - mean) / std`` when the standard deviation is a positive
        number; otherwise ``group * 0`` (every value sits "at the mean").
    """
    std = group.std()
    # std is 0 for a constant group and NaN when fewer than two values are
    # available (pandas computes the sample standard deviation). Dividing by
    # either would produce inf/NaN, so return zeros in both cases.
    if pd.isna(std) or std == 0:
        return group * 0
    return (group - group.mean()) / std

In [None]:
# Column names of the pause-defined-unit (PDU) data; every feature list below
# is derived from this one.
pdu_cols = ['word', 'start', 'end', 'articulation_duration', 'word_count',
            'syllables_count', 'unit_duration', 'pause_duration_before_unit',
            'unit_duration_with_pause', 'word_speach_rate', 'syllables_speach_rate',
            'word_articulation_rate', 'syllables_articulation_rate', 'noun', 'verb',
            'adjective', 'adverb', 'pronoun', 'determiner_article',
            'preposition_postposition', 'numeral', 'conjunction', 'particle',
            'punctuation', 'other', 'coordinating_conjunction', 'cardinal_digit',
            'determiner', 'existential_there', 'foreign_word',
            'preposition_subordinating_conjunction', 'adjective_comparative',
            'adjective_superlative', 'list_marker', 'modal', 'noun_singular',
            'noun_plural', 'proper_noun_singular', 'proper_noun_plural',
            'predeterminer', 'possessive_ending', 'personal_pronoun',
            'possessive_pronoun', 'adverb_comparative', 'adverb_superlative',
            'infinite_marker', 'interjection', 'verb_gerund', 'verb_past_tense',
            'verb_past_participle', 'verb_present_tense_not_3rd_person_singular',
            'verb_present_tense_with_3rd_person_singular', 'wh_determiner',
            'wh_pronoun', 'wh_adverb', 'total_words', 'unique_words',
            'average_word_length', 'lexical_diversity', 'female', 'variant',
            'respondent', 'voiceID', 'duration', 'meanF0Hz', 'medianF0Hz',
            'stdevF0Hz', 'HNR', 'localJitter', 'localabsoluteJitter', 'rapJitter',
            'ppq5Jitter', 'ddpJitter', 'localShimmer', 'localdbShimmer',
            'apq3Shimmer', 'apq5Shimmer', 'apq11Shimmer', 'ddaShimmer',
            'hesitation', 'disfluency', 'tense', 'qualifiers', 'contradictions']

# Voice measures (F0 statistics, HNR, jitter and shimmer variants).
voice_features = ['meanF0Hz', 'medianF0Hz', 'stdevF0Hz', 'HNR',
                     'localJitter', 'localabsoluteJitter', 'rapJitter',
                     'ppq5Jitter', 'ddpJitter', 'localShimmer',
                     'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer',
                     'apq11Shimmer', 'ddaShimmer']

# Columns excluded from the features and from normalization
# (presumably redundant with another column, e.g. medianF0Hz — TODO confirm).
potentional_duplicates = ['meanF0Hz']

# Columns converted to bool when the PDU files are loaded.
binary_features = ['hesitation', 'disfluency', 'tense', 'qualifiers', 'contradictions']

# Text/timing/identifier columns that are not used as model features.
pdu_cols_to_remove = ["word", "start", "end", "variant",
                      "respondent", "voiceID", *potentional_duplicates]

# Everything in pdu_cols that is not explicitly removed.
pdu_features = [f for f in pdu_cols if f not in pdu_cols_to_remove]

# Voice measures that get z-normalized; the binary_features check is a no-op
# for the lists above because they do not overlap with voice_features.
voice_features_to_normalize = [f for f in voice_features if f not in potentional_duplicates and f not in binary_features]

# Remaining features, i.e. those not covered by voice_features_to_normalize.
pdu_features_not_normalized = [f for f in pdu_features if f not in voice_features_to_normalize]

In [None]:
pd.set_option('future.no_silent_downcasting', True)

def direct_pdu(dict_of_paths, pdu_features, variant):
    """Load every PDU CSV listed in ``dict_of_paths`` into a single DataFrame.

    Each file is read with ``;`` as separator, reduced to ``pdu_features`` and
    tagged with its respondent (first value of the file's ``respondent``
    column), its elaboration (file name without extension) and ``variant``.
    ``female`` is overwritten with the file's first ``female`` value cast to
    int, and the columns named in the module-level ``binary_features`` are
    converted to bool ("true" -> True, "false"/"unknown" -> False,
    case-insensitive) unless they already have bool dtype.

    Args:
        dict_of_paths: Mapping of folder name -> list of CSV paths.
        pdu_features: Columns to keep from every file.
        variant: Label stored in the ``variant`` column of every row.

    Returns:
        Concatenation of all per-file frames (per-file indices are kept).

    Raises:
        ValueError: From ``pd.concat`` when no file produced any rows.
    """
    dfs = []
    for files in dict_of_paths.values():
        for file in files:
            df = pd.read_csv(file, sep=";")
            if df.empty:
                # Nothing to contribute; indexing the first row below would fail.
                logging.warning(f"Skipping empty file {file} (variant {variant})")
                continue

            # os.path handles the platform's separator and the extension,
            # instead of splitting on "\\" and cutting four characters.
            elaboration = os.path.splitext(os.path.basename(file))[0]
            respondent = df["respondent"].unique()[0]

            logging.info(f"Processing respondent {respondent} elaboration {elaboration} variant {variant}")

            # Copy so the assignments below write to an independent frame and
            # not to a slice of the freshly read file.
            df = df[pdu_features].copy()

            df["respondent"] = respondent
            df["elaboration"] = elaboration
            df["variant"] = variant

            # Set value of female to the first value of female column
            df["female"] = df["female"].iloc[0].astype(int)

            # Encode 'true' to True and 'false'/'unknown' to False in binary_features
            for feature in binary_features:
                # NOTE(review): missing or non-string values pass through
                # .str.lower() as NaN and are then likely cast to True by
                # astype(bool) — confirm the inputs never contain such values.
                if df[feature].dtype != bool:
                    df[feature] = df[feature].str.lower().replace({"true": True, "false": False, "unknown": False}).astype(bool)

            dfs.append(df)

    return pd.concat(dfs)

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=RuntimeWarning)

In [None]:
pdu_fg = direct_pdu(fg_paths, pdu_features, "FG")
pdu_h = direct_pdu(h_paths, pdu_features, "H")

# Identify NaN values
print("pdu_fg", len(pdu_fg), pdu_fg.isnull().sum().sum())
print("pdu_h", len(pdu_h), pdu_h.isnull().sum().sum())

In [None]:
pd.set_option('future.no_silent_downcasting', False)

In [None]:
# Keep FG units with unit_duration below 110 (unit presumably seconds — TODO confirm),
# then group by respondent and elaboration, count, and show only the
# (respondent, elaboration) pairs that consist of exactly one unit.
pdu_fg_counts = pdu_fg[pdu_fg["unit_duration"] < 110]
# Printed: total rows, rows kept by the duration filter, rows removed by it.
print(len(pdu_fg), len(pdu_fg_counts), len(pdu_fg) - len(pdu_fg_counts))
pdu_fg_counts = pdu_fg_counts.groupby(["respondent", "elaboration"]).count()
pdu_fg_counts = pdu_fg_counts[pdu_fg_counts["variant"] == 1]
pdu_fg_counts

In [None]:
# Keep H units with unit_duration below 110 (unit presumably seconds — TODO confirm),
# then group by respondent and elaboration, count, and show only the
# (respondent, elaboration) pairs that consist of exactly one unit.
pdu_h_counts = pdu_h[pdu_h["unit_duration"] < 110]
# Printed: total rows, rows kept by the duration filter, rows removed by it.
print(len(pdu_h), len(pdu_h_counts), len(pdu_h) - len(pdu_h_counts))
pdu_h_counts = pdu_h_counts.groupby(["respondent", "elaboration"]).count()
pdu_h_counts = pdu_h_counts[pdu_h_counts["variant"] == 1]
pdu_h_counts

In [None]:
# Show NaN values in pdu_fg
pdu_fg[pdu_fg.isnull().any(axis=1)].head()

In [None]:
# Show NaN values in pdu_h
pdu_h[pdu_h.isnull().any(axis=1)].head()

In [None]:
# Drop NaN values
pdu_fg_clean = pdu_fg.dropna()
pdu_h_clean = pdu_h.dropna()

print("pdu_fg_clean", len(pdu_fg_clean), pdu_fg_clean.isnull().sum().sum())
print("pdu_h_clean", len(pdu_h_clean), pdu_h_clean.isnull().sum().sum())

In [None]:
# Remove FG rows that contain +/-inf in any numeric column.
# Isolate numerical columns
numerical_cols = pdu_fg_clean.select_dtypes(include=[np.number])

# Create a mask for rows with any 'inf' values in numerical columns
inf_mask = np.isinf(numerical_cols).any(axis=1)

# Drop these rows from the DataFrame
pdu_fg_clean = pdu_fg_clean[~inf_mask]

# Number of FG rows left after the removal.
print(len(pdu_fg_clean))

In [None]:
# Remove H rows that contain +/-inf in any numeric column.
# Isolate numerical columns
numerical_cols = pdu_h_clean.select_dtypes(include=[np.number])

# Create a mask for rows with any 'inf' values in numerical columns
inf_mask = np.isinf(numerical_cols).any(axis=1)

# Drop these rows from the DataFrame
pdu_h_clean = pdu_h_clean[~inf_mask]

# Number of H rows left after the removal.
print(len(pdu_h_clean))

In [None]:
# Work on copies so the cleaned frames stay available unnormalized.
normalized_pdu_fg = pdu_fg_clean.copy()
normalized_pdu_h = pdu_h_clean.copy()

# Z-normalize the voice features within each respondent, separately for the
# FG and H variants, so values are expressed relative to that respondent's own data.
normalized_pdu_fg[voice_features_to_normalize] = normalized_pdu_fg.groupby('respondent')[voice_features_to_normalize].transform(z_normalize)
normalized_pdu_h[voice_features_to_normalize] = normalized_pdu_h.groupby('respondent')[voice_features_to_normalize].transform(z_normalize)

# Stack both variants; the `variant` column tells them apart.
merged_aggregations = pd.concat([normalized_pdu_fg, normalized_pdu_h])

In [None]:
# Histogram of one feature before and after normalization, side by side
def plot_distributions(df_before, df_after, feature):
    """Draw two histograms of ``feature``: left from ``df_before``, right from ``df_after``.

    Args:
        df_before: DataFrame holding the feature before normalization.
        df_after: DataFrame holding the feature after normalization.
        feature: Name of the column to plot.
    """
    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
    panels = (("Before", df_before), ("After", df_after))
    for ax, (stage, frame) in zip(axs, panels):
        sns.histplot(frame[feature], ax=ax)
        ax.set_title(f"{stage} normalization {feature}")
    plt.show()

In [None]:
len(merged_aggregations)

In [None]:
normalized_pdu_fg[normalized_pdu_fg["respondent"] == "respondent_15"]["apq3Shimmer"]

## Ground Truth

In [None]:
# 1-based positions (within glob_big5_questions) of the questions that were elaborated on.
elaborations_indices = [4, 8, 15, 18, 30, 32, 39, 41, 51, 52]
# Question entries at those positions (i + 1 converts the 0-based enumerate index).
elaborations_questions = [x for i, x in enumerate(glob_big5_questions) if i + 1 in elaborations_indices]
# elaboration_1_1, elaboration_1_2, ..., elaboration_5_2 — ten names, paired by position with the indices.
elaborations_names = [f"elaboration_{x}_{y}" for x in range(1, 6) for y in range(1, 3)]
# Column name per question: "rbfi<n>" when n is in glob_reversed_questions, otherwise "bfi<n>".
elaborations_columns = [f"rbfi{x}" if x in glob_reversed_questions else f"bfi{x}" for x in elaborations_indices]

# elaboration name -> (column name, question, question index)
elaborations = {elaborations_names[i]: (elaborations_columns[i], elaborations_questions[i], elaborations_indices[i]) for i in range(len(elaborations_indices))}
elaborations

In [None]:
# elaboration name -> name of its ground-truth column ("<bfi column>_gt")
elaborations_dict = {
    name: details[0] + "_gt"
    for name, details in elaborations.items()
}

elaborations_dict

In [None]:
elaborations_dict_reversed = {value: key for key, value in elaborations_dict.items()}
elaborations_dict_reversed

In [None]:
# Ground-truth column names, in the insertion order of elaborations_dict
ground_truth_columns = list(elaborations_dict.values())

ground_truth_columns

In [None]:
# Elaboration names (the values of the reversed mapping), in insertion order
ground_truth_columns_reversed = list(elaborations_dict_reversed.values())

ground_truth_columns_reversed

In [None]:
pairing_path = "data\\4_Pair_UXtweak_and_SurveyJS\\4_Pair_UXtweak_and_SurveyJS_data.csv"

In [None]:
# Build the long-format ground-truth table: one row per (variant, respondent, elaboration).
pairing_df = pd.read_csv(pairing_path)
# Keep only the identifying columns and the ground-truth columns
pairing_df = pairing_df[["group_evaluated", "order"] + ground_truth_columns]
# Rename group_evaluated to variant and order to respondent
pairing_df = pairing_df.rename(columns={"group_evaluated": "variant", "order": "respondent"})
# Replace 0.5 with 1 in ground_truth_columns
pairing_df[ground_truth_columns] = pairing_df[ground_truth_columns].replace(0.5, 1)
# Add prefix respondent_ to the respondent values (formerly the order column)
pairing_df["respondent"] = "respondent_" + pairing_df["respondent"].astype(str)
# Rename ground truth columns to the elaboration names used in the aggregated dataframes
pairing_df = pairing_df.rename(columns=elaborations_dict_reversed)
# Each elaboration should be in a separate row; its ground-truth value goes to indicator_fg
pairing_df = pairing_df.melt(id_vars=["variant", "respondent"], value_vars=ground_truth_columns_reversed, var_name="elaboration", value_name="indicator_fg")


In [None]:
pairing_df

In [None]:
pairing_df[pairing_df["indicator_fg"] == 0].count()

In [None]:
pairing_df[["elaboration", "indicator_fg"]].groupby("elaboration").sum()

In [None]:
pairing_df[["elaboration", "indicator_fg"]].groupby("elaboration").sum().sum()

In [None]:
pairing_df["control"] = "control"

In [None]:
# Attach the ground truth to every PDU row. The outer join keeps unmatched rows
# from both sides; rows without a ground-truth value are removed in the next cell.
merged_aggregations_enriched = pd.merge(merged_aggregations, pairing_df, on=["variant", "respondent", "elaboration"], how="outer")

In [None]:
# Drop rows with NaN values in the indicator_fg (ground truth) column
merged_aggregations_enriched = merged_aggregations_enriched.dropna(subset=["indicator_fg"])
# Keep only rows that came from pairing_df (marked with control == "control"), then drop the helper column
merged_aggregations_enriched = merged_aggregations_enriched[merged_aggregations_enriched["control"] == "control"].drop(columns=["control"])
merged_aggregations_enriched

In [None]:
counts_of_indicator = merged_aggregations_enriched[merged_aggregations_enriched["indicator_fg"] > 0].groupby(["variant", "respondent", "elaboration"]).count()["indicator_fg"]
counts_of_indicator

In [None]:
# Count number of elaborations where sum is bigger than 0
counts_of_indicator[counts_of_indicator > 0].count()

In [None]:
check = merged_aggregations_enriched.groupby(["variant", "respondent", "elaboration"]).sum()    
check = check[check["indicator_fg"] == 0]
check.groupby(["variant", "respondent"]).count()["indicator_fg"].count()

In [None]:
merged_aggregations_enriched[merged_aggregations_enriched["variant"] == "FG"]["indicator_fg"].sum()

In [None]:
merged_aggregations_enriched[merged_aggregations_enriched["variant"] == "H"]["indicator_fg"].sum()

In [None]:
# Drop row if any of the values is NaN
columns_to_check = ['duration', 'meanF0Hz', 'medianF0Hz', 'stdevF0Hz', 'HNR', 
                                                'localJitter', 'localabsoluteJitter', 'rapJitter', 
                                                'ppq5Jitter', 'ddpJitter', 'localShimmer', 
                                                'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer', 
                                                'apq11Shimmer', 'ddaShimmer']

# Remove rows with NaN values
# merged_aggregations_enriched = merged_aggregations_enriched.dropna(subset=columns_to_check)
merged_aggregations_enriched = merged_aggregations_enriched.dropna()

In [None]:
len(merged_aggregations_enriched)

In [None]:
# Table of indicator_fg sums per variant
# aggfunc is passed as the string "sum": handing np.sum to pandas aggregation
# emits a FutureWarning in recent pandas versions.
table = pd.pivot_table(merged_aggregations_enriched, values='indicator_fg', index=['variant'], aggfunc="sum")
table

In [None]:
# Table of indicator_fg sums per variant and sex
# aggfunc is passed as the string "sum": handing np.sum to pandas aggregation
# emits a FutureWarning in recent pandas versions.
table = pd.pivot_table(merged_aggregations_enriched, values='indicator_fg', index=['variant', 'female'], aggfunc="sum")
table

In [None]:
# Table of indicator_fg sums per variant and elaboration
# aggfunc is passed as the string "sum": handing np.sum to pandas aggregation
# emits a FutureWarning in recent pandas versions.
table = pd.pivot_table(merged_aggregations_enriched, values='indicator_fg', index=['variant', 'elaboration'], aggfunc="sum")
table

In [None]:
# Drop columns that hold exactly one distinct value, then list which PDU
# features survived; the shape is printed before and after for comparison.
print(merged_aggregations_enriched.shape)
varying_columns = merged_aggregations_enriched.nunique() != 1
merged_aggregations_enriched = merged_aggregations_enriched.loc[:, varying_columns]
possible_features = [col for col in merged_aggregations_enriched.columns if col in pdu_features]
print(merged_aggregations_enriched.shape)

## Random state

In [None]:
# Seed must be between 0 and 2**32 - 1
random_state = random.randint(0, 2**32 - 1)

print(random_state)

logging.info(f"random_state={random_state}")

In [None]:
# Pin the seed for reproducible runs; this replaces the randomly drawn value.
random_state = 181163425

# Log the pinned value too: the seed logged earlier is the random draw that
# this assignment discards, so the log would otherwise record the wrong seed.
logging.info(f"random_state={random_state}")

In [None]:
random.seed(random_state)

## Advanced analytics

In [None]:
merged_aggregations_enriched

In [None]:
merged_aggregations_enriched.groupby("indicator_fg").count()

In [None]:
categorical_cols, continuous_cols = detect_categorical_columns(merged_aggregations_enriched)
categorical_cols

In [None]:
# Columns treated as categorical in the advanced descriptive statistics.
aa_categorical_features = ['female',
                           'hesitation',
                           'disfluency',
                           'tense',
                           'qualifiers',
                           'contradictions',
                           'elaboration',
                           'variant']
# Target variable of the analysis.
aa_target = "indicator_fg"
# Everything that is neither an identifier, the target, nor categorical counts as continuous.
aa_remove = ['respondent', aa_target, *aa_categorical_features]
aa_continuous_features = [f for f in merged_aggregations_enriched.columns if f not in aa_remove]

In [None]:
# Relative to the working directory, like the other data paths in this notebook,
# instead of an absolute path into one user's profile.
aa_path = 'data\\12_PDU_Aggregations_and_Models\\stats\\aa_voice_merged_aggregations_enriched.xlsx'

In [None]:
calculate_advanced_descriptive_stats(aa_target, aa_continuous_features, aa_categorical_features, merged_aggregations_enriched, aa_path)

## Train-Test Split

In [None]:
# Create test and train datasets, but keep all elaborations of the same respondent of the same variant in the same dataset

# Get unique respondents of each variant
unique_fg_respondents = merged_aggregations_enriched[merged_aggregations_enriched["variant"] == "FG"]["respondent"].unique()
unique_h_respondents = merged_aggregations_enriched[merged_aggregations_enriched["variant"] == "H"]["respondent"].unique()

print(len(unique_fg_respondents), len(unique_h_respondents))

# Select 80% of respondents for training (rounded down); the draw uses the
# `random` module, so it depends on the seed set earlier and on the order of these two calls
train_fg_respondents = random.sample(list(unique_fg_respondents), int(0.8 * len(unique_fg_respondents)))
train_h_respondents = random.sample(list(unique_h_respondents), int(0.8 * len(unique_h_respondents)))

print("train_fg_respondents:", train_fg_respondents)
print("train_h_respondents:", train_h_respondents)
logging.info(f"train_fg_respondents: {train_fg_respondents}")
logging.info(f"train_h_respondents: {train_h_respondents}")

# The remaining respondents (about 20%) are used for testing
test_fg_respondents = [x for x in unique_fg_respondents if x not in train_fg_respondents]
test_h_respondents = [x for x in unique_h_respondents if x not in train_h_respondents]

print("test_fg_respondents:", test_fg_respondents)
print("test_h_respondents:", test_h_respondents)
logging.info(f"test_fg_respondents: {test_fg_respondents}")
logging.info(f"test_h_respondents: {test_h_respondents}")

# Save this split to a file named after the run timestamp, as Python assignments
if not os.path.exists("data\\12_PDU_Aggregations_and_Models\\train_test_split"):
    os.makedirs("data\\12_PDU_Aggregations_and_Models\\train_test_split")
with open(f"data\\12_PDU_Aggregations_and_Models\\train_test_split\\{dt_string}.py", "w") as f:
    f.write("\n".join([f"train_fg_respondents = {train_fg_respondents}", f"train_h_respondents = {train_h_respondents}", f"test_fg_respondents = {test_fg_respondents}", f"test_h_respondents = {test_h_respondents}"]))

# Select the rows of each variant that belong to the train / test respondents
train_fg = merged_aggregations_enriched[(merged_aggregations_enriched["variant"] == "FG") & (merged_aggregations_enriched["respondent"].isin(train_fg_respondents))]
train_h = merged_aggregations_enriched[(merged_aggregations_enriched["variant"] == "H") & (merged_aggregations_enriched["respondent"].isin(train_h_respondents))]
test_fg = merged_aggregations_enriched[(merged_aggregations_enriched["variant"] == "FG") & (merged_aggregations_enriched["respondent"].isin(test_fg_respondents))]
test_h = merged_aggregations_enriched[(merged_aggregations_enriched["variant"] == "H") & (merged_aggregations_enriched["respondent"].isin(test_h_respondents))]

# Combine both variants into the final train and test datasets
df_to_train = pd.concat([train_fg, train_h])
df_to_test = pd.concat([test_fg, test_h])

print(len(df_to_train), len(df_to_test))


In [None]:
print(f"{len(df_to_train[df_to_train['indicator_fg'] == 1])}/{len(df_to_train)} {len(df_to_train[df_to_train['indicator_fg'] == 1]) / len(df_to_train)}")
print(f"{len(df_to_test[df_to_test['indicator_fg'] == 1])}/{len(df_to_test)} {len(df_to_test[df_to_test['indicator_fg'] == 1]) / len(df_to_test)}")

## Plots before preprocessing

In [None]:
# Output path is relative to the working directory, like the other data paths
# in this notebook, instead of an absolute path into one user's profile.
calculate_descriptive_stats('indicator_fg', possible_features, df_to_train, 'data\\12_PDU_Aggregations_and_Models\\stats\\voice_before_preprocessing_df_to_train.xlsx')

In [None]:
show_box_boxwithout_hist('indicator_fg', possible_features, df_to_train, True)

In [None]:
fig, ax = plt.subplots(figsize=(120, 96))
df_corr = df_to_train[possible_features + ['indicator_fg']].corr()

sns.heatmap(df_corr, ax=ax, annot=True, fmt=".3f")

## Undersampling / Oversampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Defining the undersampling strategy (seeded for reproducibility)
rus = RandomUnderSampler(random_state=random_state)

# Features are all columns except 'indicator_fg' (the label); the identifier
# columns respondent/variant/elaboration stay in X and are therefore kept in the result
X_train = df_to_train.drop('indicator_fg', axis=1)
y_train = df_to_train['indicator_fg']

# Resampling the training data
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Creating a new DataFrame from the resampled data
df_random_underresampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_random_underresampled['indicator_fg'] = y_resampled

# Compare class counts before and after resampling
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes: ", df_random_underresampled['indicator_fg'].value_counts())

In [None]:
from imblearn.under_sampling import NearMiss

# Defining the NearMiss strategy (version 3)
nm = NearMiss(version=3)

# Features are all columns except the label and the identifier columns.
# NOTE(review): the resulting frame therefore has no respondent/variant/elaboration
# columns, which later cells (e.g. the LMM cell) read — confirm before selecting this sampler.
X_train = df_to_train.drop(['indicator_fg', "respondent", "variant", "elaboration"], axis=1)
y_train = df_to_train['indicator_fg']

# Applying NearMiss
X_resampled, y_resampled = nm.fit_resample(X_train, y_train)

# Creating a new DataFrame from the resampled data
df_nearmiss_undersampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_nearmiss_undersampled['indicator_fg'] = y_resampled

# Compare class counts before and after resampling
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes after NearMiss: ", df_nearmiss_undersampled['indicator_fg'].value_counts())

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Defining the oversampling strategy (seeded for reproducibility)
ros = RandomOverSampler(random_state=random_state)

# Features are all columns except 'indicator_fg' (the label); the identifier
# columns respondent/variant/elaboration stay in X and are therefore kept in the result
X_train = df_to_train.drop('indicator_fg', axis=1)
y_train = df_to_train['indicator_fg']

# Applying the oversampling strategy
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Creating a new DataFrame from the resampled data
df_random_oversampled = pd.DataFrame(X_resampled, columns=X_train.columns)
df_random_oversampled['indicator_fg'] = y_resampled

# Compare class counts before and after resampling
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes: ", df_random_oversampled['indicator_fg'].value_counts())

In [None]:
from imblearn.over_sampling import SMOTE

# Defining the SMOTE strategy (seeded for reproducibility)
smote = SMOTE(random_state=random_state)

# Features are all columns except the label and the identifier columns.
# NOTE(review): the resulting frame therefore has no respondent/variant/elaboration
# columns, which later cells (e.g. the LMM cell) read — confirm before selecting this sampler.
X_train = df_to_train.drop(['indicator_fg', "respondent", "variant", "elaboration"], axis=1)
y_train = df_to_train['indicator_fg']

# Applying SMOTE to the training data
X_smoted, y_smoted = smote.fit_resample(X_train, y_train)

# Create a DataFrame from the SMOTEd data
df_smote_oversampled = pd.DataFrame(X_smoted, columns=X_train.columns)
df_smote_oversampled['indicator_fg'] = y_smoted

# Compare class counts before and after resampling
print("Original Distribution of Classes: ", df_to_train['indicator_fg'].value_counts())
print("New Distribution of Classes with SMOTE: ", df_smote_oversampled['indicator_fg'].value_counts())


In [None]:
# sampling = "RandomUnderSampler" # Accuracy around 0.6, but recall for class 1 is decent, often above 0.6; precision is very low though, below 0.1
# sampling = "NearMiss" # Accuracy below 0.5, but recall for class 1 is decent, often above 0.6; precision is very low though, below 0.2 but above 0.1
sampling = "RandomOverSampler" # High accuracy, even around 0.7-0.8; recall for class 1 is very low, often below 0.2 but above 0.1; precision is very low, below 0.2 but above 0.1
# sampling = "SMOTE" # Probably the best: accuracy around 80, precision and recall for class 1 both around 0.3

In [None]:
# Replace the training frame with the output of the chosen sampler;
# any other value of `sampling` leaves df_to_train unchanged.
if sampling == "RandomUnderSampler":
    df_to_train = df_random_underresampled
elif sampling == "NearMiss":
    df_to_train = df_nearmiss_undersampled
elif sampling == "RandomOverSampler":
    df_to_train = df_random_oversampled
elif sampling == "SMOTE":
    df_to_train = df_smote_oversampled

## Normalize other features

In [None]:
# Standardize the features that were not z-normalized per respondent
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

cols_to_transform = [col for col in pdu_features_not_normalized if col in possible_features]
print(f"Normalizing {len(cols_to_transform)} from {len(possible_features)} features")

# Scale the selected columns, pass every other column through untouched,
# and return pandas DataFrames instead of arrays.
ct = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), cols_to_transform)],
    remainder='passthrough',
)
ct.set_output(transform="pandas")
print(df_to_train.shape, df_to_test.shape)

# Fit the scaler on the training data only, then apply it to the test data
df_to_train = ct.fit_transform(df_to_train)
df_to_test = ct.transform(df_to_test)

# Strip the transformer-name prefixes from the output column names
for frame in (df_to_train, df_to_test):
    for prefix in ('scaler__', 'remainder__'):
        frame.columns = frame.columns.str.replace(prefix, '')

print(df_to_train.shape, df_to_test.shape)


## Save preprocessed datasets

In [None]:
# Save the preprocessed train and test datasets as CSV files named after the
# run timestamp, creating the target directory on first use
if not os.path.exists("data\\12_PDU_Aggregations_and_Models\\datasets"):
    os.makedirs("data\\12_PDU_Aggregations_and_Models\\datasets")
df_to_train.to_csv(f"data\\12_PDU_Aggregations_and_Models\\datasets\\{dt_string}_train.csv", index=False)
df_to_test.to_csv(f"data\\12_PDU_Aggregations_and_Models\\datasets\\{dt_string}_test.csv", index=False)

## Plots after preprocessing

In [None]:
# Output path is relative to the working directory, like the other data paths
# in this notebook, instead of an absolute path into one user's profile.
calculate_descriptive_stats('indicator_fg', possible_features, df_to_train, 'data\\12_PDU_Aggregations_and_Models\\stats\\voice_after_preprocessing_df_to_train.xlsx')

In [None]:
show_box_boxwithout_hist('indicator_fg', possible_features, df_to_train, True)

In [None]:
fig, ax = plt.subplots(figsize=(120, 96))
df_corr = df_to_train[possible_features + ['indicator_fg']].corr()

sns.heatmap(df_corr, ax=ax, annot=True, fmt=".3f")

## T-Test and U-Test

In [None]:
feature_names = [col for col in df_to_train.columns if col not in ["respondent", "elaboration", "variant", "indicator_fg"]]
print(len(feature_names))
print(feature_names)

In [None]:
# Features for which test_feature returned a truthy value
statistical_tests_selected_features = []
# Result rows collected across calls; `results` is passed into test_feature,
# which presumably appends one row per feature — confirm in the helpers module
results = []

for feature_name in feature_names:
    logging.info(f'++++++++++Test for {feature_name}++++++++++')
    if test_feature(df_to_train, feature_name, results, logging, ignore_power=False):
        statistical_tests_selected_features.append(feature_name)

print(statistical_tests_selected_features)

In [None]:
test_results = pd.DataFrame(results, columns = ['Feature', 'T-test statistic', 'T-test p-value', 'U-test statistic', 'U-test p-value', 'Power', 'Selected'])
relevant_test_results = test_results[['Feature', 'T-test statistic', 'T-test p-value', 'U-test statistic', 'U-test p-value', 'Power', 'Selected']]
relevant_test_results.index = np.arange(1, len(relevant_test_results) + 1)
relevant_test_results

In [None]:
relevant_test_results[relevant_test_results["Selected"] == True]

In [None]:
if len(statistical_tests_selected_features) < 2:
    statistical_tests_selected_features = possible_features

## LMM

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import mixedlm

# Work on a deep copy so df_to_train is not modified
stats_df = df_to_train.copy(deep=True)

# Concat respondent and variant to create user_id (the grouping factor of the model).
# NOTE(review): this needs the respondent and variant columns, which the NearMiss
# and SMOTE cells drop from their output — confirm the chosen sampler keeps them.
stats_df['user_id'] = stats_df['respondent'] + '_' + stats_df['variant']

# Cast indicator_fg to int
stats_df['indicator_fg'] = stats_df['indicator_fg'].astype(int)

# Drop columns respondent, variant, and elaboration
stats_df = stats_df.drop(columns=['respondent', 'variant', 'elaboration'])

# Initialize a dictionary to store LMM results
lmm_results = {}

# Fit one mixed-effects model per column (except the label and user ID columns):
# indicator_fg explained by that single column, grouped by user_id
for column in stats_df.columns:
    if column not in ['indicator_fg', 'user_id']:
        formula = f"indicator_fg ~ {column}"
        model = mixedlm(formula, stats_df, groups=stats_df["user_id"])
        result = model.fit()

        # Keep the estimate, t-value and p-value of the column's own term
        lmm_results[column] = {'Coefficient': result.params[column], 't-value': result.tvalues[column], 'p-value': result.pvalues[column]}

# Convert the results to a DataFrame (one row per feature) for better readability
lmm_results_df = pd.DataFrame(lmm_results).T

# Display the results
print(lmm_results_df)


In [None]:
lmm_test_selected_features = lmm_results_df[lmm_results_df["p-value"] < 0.05].index.tolist()
lmm_results_df[lmm_results_df["p-value"] < 0.05]

## Select statistically significant features

In [None]:
# Keep only the identifier columns, the target and the features whose LMM p-value was below 0.05.
# NOTE(review): there is no fallback when lmm_test_selected_features is empty — confirm that cannot happen.
df_to_train = df_to_train[["respondent", "elaboration", "variant", "indicator_fg"] + lmm_test_selected_features]

## Feature selection

The following code is inspired by the official documentation.

In [None]:
lasso = True

In [None]:
if lasso:
    # Split the training frame into the feature matrix and the target.
    # errors="ignore" drops whichever identifier columns are present, which
    # replaces the earlier bare `except:` fallback that hid every kind of error.
    X_train_lasso = df_to_train.drop(columns=["respondent", "elaboration", "variant", "indicator_fg"], errors="ignore")
    y_train_lasso = df_to_train['indicator_fg']

In [None]:
if lasso:
    # Fit an L1-penalized linear SVM and let SelectFromModel pick features
    # from the already fitted estimator (prefit=True)
    lsvc = LinearSVC(C=0.03, penalty="l1", dual=False).fit(X_train_lasso, y_train_lasso)
    model = SelectFromModel(lsvc, prefit=True)
    X_new = model.transform(X_train_lasso)
    # Not displayed: an expression inside an `if` block is not shown as cell output
    X_new.shape

In [None]:
lasso_selected_features = []

In [None]:
if lasso:
    # Names of the columns kept by the selector, as a plain list
    lasso_selected_features = list(X_train_lasso.columns[model.get_support()])
lasso_selected_features

In [None]:
len(lasso_selected_features)

In [None]:
if lasso:
    export_lasso_df = pd.DataFrame({'Feature': list(X_train_lasso.columns), 'Weight': lsvc.coef_.tolist()[0]}) 
    export_lasso_df['Selected'] = export_lasso_df['Feature'].apply(lambda x: x in lasso_selected_features)
    export_lasso_df.index = np.arange(1, len(export_lasso_df) + 1)
    export_lasso_df

## Use selected features only

In [None]:
# Reduce both datasets to the selected features plus the target.
if lasso:
    df_to_test = df_to_test[lasso_selected_features + ["indicator_fg"]]
    df_to_train = df_to_train[lasso_selected_features + ["indicator_fg"]]
else:
    # NOTE(review): df_to_train was already reduced to the LMM-selected features in
    # the "Select statistically significant features" cell, so this branch can raise
    # KeyError for features that are not among them — verify before setting lasso = False.
    df_to_test = df_to_test[statistical_tests_selected_features + ["indicator_fg"]]
    df_to_train = df_to_train[statistical_tests_selected_features + ["indicator_fg"]]

print(len(df_to_train), len(df_to_test))

## Save selected features

In [None]:
# Save the column names of both datasets to a file named after the run
# timestamp, written as Python assignments; the directory is created on first use
if not os.path.exists("data\\12_PDU_Aggregations_and_Models\\selected_columns"):
    os.makedirs("data\\12_PDU_Aggregations_and_Models\\selected_columns")
with open(f"data\\12_PDU_Aggregations_and_Models\\selected_columns\\{dt_string}.py", "w") as f:
    f.write("\n".join([f"df_to_test_cols = {str(df_to_test.columns.to_list())}", f"df_to_train_cols = {str(df_to_train.columns.to_list())}"]))

## Shuffle

In [None]:
# Shuffle the data
# Seeded with the notebook's random_state so the row order is the same on every run;
# an unseeded sample() draws a different permutation each time.
df_to_train = df_to_train.sample(frac=1, random_state=random_state).reset_index(drop=True)
df_to_test = df_to_test.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [None]:
X_train = df_to_train.drop(["indicator_fg"], axis=1).reset_index(drop=True)
X_test = df_to_test.drop(["indicator_fg"], axis=1).reset_index(drop=True)
y_train = df_to_train['indicator_fg'].astype(int).reset_index(drop=True)
y_test = df_to_test['indicator_fg'].astype(int).reset_index(drop=True)

In [None]:
# Number of indicators with value 1 in each dataset
print(y_train.value_counts())
print(y_test.value_counts())

## Controlling

In [None]:
shap_plots = True

In [None]:
def path_generator(model):
    """Return the file path under which a trained model should be saved.

    Ensures the directory ``data/12_PDU_Aggregations_and_Models/models/<model>``
    exists (creating it and any missing parents) and returns the path of a
    ``<dt_string>.joblib`` file inside it, where ``dt_string`` is the
    notebook-level run timestamp.

    Args:
        model: Name of the model; used as the sub-directory name.

    Returns:
        str: Path of the ``.joblib`` file for this model and run.
    """
    # os.path.join keeps the path valid on every OS; hard-coded "\\" separators
    # only form a directory hierarchy on Windows.
    dir_path = os.path.join("data", "12_PDU_Aggregations_and_Models", "models", f"{model}")
    # makedirs creates the intermediate directories itself, and exist_ok=True
    # removes the check-then-create race of a separate os.path.exists test.
    os.makedirs(dir_path, exist_ok=True)

    return os.path.join(dir_path, f"{dt_string}.joblib")

In [None]:
# Accumulator for the per-model reports: starts as None and is reassigned with
# the return value of add_to_global_report after each model is trained.
global_report = None

## Decision Tree

The following function is taken from my project developed for the Intelligent Data Analysis course (2021/2022).

In [None]:
# Candidate hyper-parameter values for the decision tree.
decision_tree_param_grid = dict(
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

clf = DecisionTreeClassifier(random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training; clf1 is the model used by the following cells, and the
# train/test reports are appended to global_report under "decision_tree".
clf1, best_params1, train_report1, test_report1 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("decision_tree"),
    decision_tree_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(
    global_report, train_report1, test_report1, "decision_tree", best_params1
)

In [None]:
# Rank the columns of X_train by the decision tree's feature importances,
# highest importance first.
importances = clf1.feature_importances_
indices = np.flip(np.argsort(importances))
print("Feature ranking:")
for position in range(X_train.shape[1]):
    feature_idx = indices[position]
    print(f"{position + 1}. feature {X_train.columns[feature_idx]} ({importances[feature_idx]})")

In [None]:
# SHAP step for clf1 (the decision tree returned by model_training); skipped
# when shap_plots is falsy.
# NOTE(review): tree=True / pos_class=True are flags of the project helper
# calculate_shap — presumably "use a tree explainer" and "explain the positive
# class"; confirm against helpers.machine_learning.
if shap_plots:
    calculate_shap(clf1, X_train, X_test, tree=True, pos_class=True)

## Random Forest

The following function is taken from my project developed for the Intelligent Data Analysis course (2021/2022).

In [None]:
# Candidate hyper-parameter values for the random forest.
random_forest_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_features=[None, 'sqrt', 'log2'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

clf = RandomForestClassifier(random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training; clf2 is the model used by the following cells, and the
# train/test reports are appended to global_report under "random_forest".
clf2, best_params2, train_report2, test_report2 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("random_forest"),
    random_forest_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(
    global_report, train_report2, test_report2, "random_forest", best_params2
)

In [None]:
# Rank the columns of X_train by the random forest's feature importances,
# highest importance first.
importances = clf2.feature_importances_
indices = np.flip(np.argsort(importances))
print("Feature ranking:")
for position in range(X_train.shape[1]):
    feature_idx = indices[position]
    print(f"{position + 1}. feature {X_train.columns[feature_idx]} ({importances[feature_idx]})")

In [None]:
# SHAP step for clf2 (the random forest returned by model_training); skipped
# when shap_plots is falsy.
# NOTE(review): the meaning of tree=True / pos_class=True is defined by the
# project helper calculate_shap — confirm there.
if shap_plots:
    calculate_shap(clf2, X_train, X_test, tree=True, pos_class=True)

## SVM

In [None]:
# Candidate values of the regularisation parameter C for the linear-kernel SVM.
linear_svm_param_grid = dict(
    C=[0.1, 1, 10, 100],
)

clf = svm.SVC(kernel='linear', random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training (with zero_division=0 forwarded to it); the train/test reports
# are appended to global_report under "linear_svm".
clf3_a, best_params3_a, train_report3_a, test_report3_a = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("linear_svm"),
    linear_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
    zero_division=0,
)
global_report = add_to_global_report(
    global_report, train_report3_a, test_report3_a, "linear_svm", best_params3_a
)

In [None]:
# SHAP step for clf3_a (the linear SVM returned by model_training); skipped
# when shap_plots is falsy. Called without the tree / pos_class flags used for
# the tree-based models.
if shap_plots:
    calculate_shap(clf3_a, X_train, X_test)

In [None]:
# Candidate polynomial degrees and coef0 values for the polynomial-kernel SVM.
poly_svm_param_grid = dict(
    degree=[2, 3, 4],
    coef0=[0, 1, 10],
)

clf = svm.SVC(kernel='poly', random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training (with zero_division=0 forwarded to it); the train/test reports
# are appended to global_report under "poly_svm".
clf3_b, best_params3_b, train_report3_b, test_report3_b = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("poly_svm"),
    poly_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
    zero_division=0,
)
global_report = add_to_global_report(
    global_report, train_report3_b, test_report3_b, "poly_svm", best_params3_b
)

In [None]:
# Candidate gamma settings for the RBF-kernel SVM.
rbf_svm_param_grid = dict(
    gamma=['scale', 'auto', 0.01, 0.1, 1],
)

clf = svm.SVC(kernel='rbf', random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training (with zero_division=0 forwarded to it); the train/test reports
# are appended to global_report under "rbf_svm".
clf3_c, best_params3_c, train_report3_c, test_report3_c = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("rbf_svm"),
    rbf_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
    zero_division=0,
)
global_report = add_to_global_report(
    global_report, train_report3_c, test_report3_c, "rbf_svm", best_params3_c
)

In [None]:
# Candidate gamma and coef0 settings for the sigmoid-kernel SVM.
sigmoid_svm_param_grid = dict(
    gamma=['scale', 'auto', 0.01, 0.1, 1],
    coef0=[0, 1, 10],
)

clf = svm.SVC(kernel='sigmoid', random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training (with zero_division=0 forwarded to it); the train/test reports
# are appended to global_report under "sigmoid_svm".
clf3_d, best_params3_d, train_report3_d, test_report3_d = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("sigmoid_svm"),
    sigmoid_svm_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
    zero_division=0,
)
global_report = add_to_global_report(
    global_report, train_report3_d, test_report3_d, "sigmoid_svm", best_params3_d
)

## Gradient Boosting

In [None]:
# Candidate hyper-parameter values for gradient boosting.
gradient_boosting_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

clf = GradientBoostingClassifier(random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training; clf4 is the model used by the following cells, and the
# train/test reports are appended to global_report under "gradient_boosting".
clf4, best_params4, train_report4, test_report4 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("gradient_boosting"),
    gradient_boosting_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(
    global_report, train_report4, test_report4, "gradient_boosting", best_params4
)

In [None]:
# Rank the columns of X_train by the gradient boosting model's feature
# importances, highest importance first.
importances = clf4.feature_importances_
indices = np.flip(np.argsort(importances))
print("Feature ranking:")
for position in range(X_train.shape[1]):
    feature_idx = indices[position]
    print(f"{position + 1}. feature {X_train.columns[feature_idx]} ({importances[feature_idx]})")

In [None]:
# SHAP step for clf4 (the gradient boosting model returned by model_training);
# skipped when shap_plots is falsy.
# NOTE(review): unlike the decision tree / random forest cells this call does
# not pass pos_class=True — confirm that is intentional for this model.
if shap_plots:
    calculate_shap(clf4, X_train, X_test, tree=True)

## Logistic Regression

In [None]:
# Candidate hyper-parameter values for logistic regression.
# NOTE(review): not every solver supports every penalty (e.g. 'lbfgs' handles
# only 'l2'/None), and 'elasticnet' additionally needs l1_ratio to be set, so
# some sampled combinations will still fail to fit — consider restricting the
# grid to valid solver/penalty pairs.
logistic_regression_param_grid = {
    'C': [0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    # scikit-learn expects the Python object None to disable regularisation;
    # the string 'None' is not a valid penalty and makes every candidate that
    # samples it fail.
    'penalty': [None, 'l2', 'l1', 'elasticnet']
}

clf = LogisticRegression(max_iter=2000000, random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training; clf5 is the model used by the following cell, and the
# train/test reports are appended to global_report under "logistic_regression".
clf5, best_params5, train_report5, test_report5 = model_training(clf, X_train, X_test, y_train, y_test, logging, path_generator("logistic_regression"), logistic_regression_param_grid, driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4)
global_report = add_to_global_report(global_report, train_report5, test_report5, "logistic_regression", best_params5)

In [None]:
# SHAP step for clf5 (the logistic regression returned by model_training);
# skipped when shap_plots is falsy. Called without the tree / pos_class flags
# used for the tree-based models.
if shap_plots:
    calculate_shap(clf5, X_train, X_test)

## XGBoost 

In [None]:
# Candidate hyper-parameter values for XGBoost; the objective is part of the
# candidates, so the value given to the constructor below is only the starting one.
xgboost_param_grid = dict(
    n_estimators=[100, 200, 300, 800],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    objective=['binary:hinge', 'binary:logistic', 'binary:logitraw'],
)

clf = xgb.XGBClassifier(objective='binary:hinge', random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training; clf6 is the model used by the following cell, and the
# train/test reports are appended to global_report under "xgboost".
clf6, best_params6, train_report6, test_report6 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("xgboost"),
    xgboost_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(
    global_report, train_report6, test_report6, "xgboost", best_params6
)

In [None]:
# SHAP step for clf6 (the XGBoost model returned by model_training); skipped
# when shap_plots is falsy.
# NOTE(review): tree=True is a flag of the project helper calculate_shap —
# presumably selects a tree-based explainer; confirm there.
if shap_plots:
    calculate_shap(clf6, X_train, X_test, tree=True)

## Balanced Random Forest

In [None]:
# Candidate hyper-parameter values for the balanced random forest.
balanced_random_forest_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

clf = BalancedRandomForestClassifier(random_state=random_state)

# Hand the estimator and the candidate grid to the project helper
# model_training; clf7 is the model used by the following cells, and the
# train/test reports are appended to global_report under "balanced_random_forest".
clf7, best_params7, train_report7, test_report7 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("balanced_random_forest"),
    balanced_random_forest_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(
    global_report, train_report7, test_report7, "balanced_random_forest", best_params7
)

In [None]:
# Rank the columns of X_train by the balanced random forest's feature
# importances, highest importance first.
importances = clf7.feature_importances_
indices = np.flip(np.argsort(importances))
print("Feature ranking:")
for position in range(X_train.shape[1]):
    feature_idx = indices[position]
    print(f"{position + 1}. feature {X_train.columns[feature_idx]} ({importances[feature_idx]})")

In [None]:
# SHAP step for clf7 (the balanced random forest returned by model_training);
# skipped when shap_plots is falsy.
# NOTE(review): the meaning of tree=True / pos_class=True is defined by the
# project helper calculate_shap — confirm there.
if shap_plots:
    calculate_shap(clf7, X_train, X_test, tree=True, pos_class=True)

## Balanced Bagging Classifier

In [None]:
# Candidate hyper-parameter values for the balanced bagging classifier.
balanced_bagging_classifier_param_grid = dict(
    n_estimators=[10, 50, 100],
    max_samples=[0.5, 0.7, 1.0],
    max_features=[0.5, 0.7, 1.0],
)

# Every constructor argument is spelled out explicitly; n_estimators,
# max_samples and max_features are also part of the candidate grid above.
clf = BalancedBaggingClassifier(
    random_state=random_state,
    estimator=None,
    n_estimators=10,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    warm_start=False,
    sampling_strategy='auto',
    replacement=False,
    n_jobs=None,
    verbose=0,
    sampler=None,
)

# Hand the estimator and the candidate grid to the project helper
# model_training; the train/test reports are appended to global_report under
# "balanced_bagging_classifier".
clf8, best_params8, train_report8, test_report8 = model_training(
    clf, X_train, X_test, y_train, y_test, logging,
    path_generator("balanced_bagging_classifier"),
    balanced_bagging_classifier_param_grid,
    driver_silent=False, random_state=random_state, n_iter=50, cv=3, verbose=4,
)
global_report = add_to_global_report(
    global_report, train_report8, test_report8, "balanced_bagging_classifier", best_params8
)

In [None]:
# Show the feature names stored on clf8 (its feature_names_in_ attribute).
clf8.feature_names_in_

## Report

In [None]:
# Display the report accumulated by the add_to_global_report calls above.
global_report

In [None]:
# Save global report
# The path is assembled with os.path.join so the directory hierarchy is created
# correctly on every OS (hard-coded "\\" separators only work on Windows), and
# exist_ok=True replaces the separate exists() check.
report_dir = os.path.join("data", "12_PDU_Aggregations_and_Models", "report")
os.makedirs(report_dir, exist_ok=True)
path_to_save = os.path.join(report_dir, f"{dt_string}.csv")
# to_csv is called with index=False, so the index is copied into a regular
# "metric" column first to keep it in the exported file.
global_report["metric"] = global_report.index
global_report.to_csv(path_to_save, index=False, sep=";")