In [1]:
# Import libraries
from kan import KAN
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
import torch

import argparse
import pandas as pd
import numpy as np
import scipy
import os
import random
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import (classification_report, balanced_accuracy_score, confusion_matrix, 
                roc_auc_score, accuracy_score, roc_curve, RocCurveDisplay)

from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
def confidence_interval(data, confidence=0.95):
    """Return the two-sided Student-t confidence interval for the mean of ``data``.

    Args:
        data: Sequence (or array) of numeric observations.
        confidence: Confidence level of the interval, e.g. 0.95 for a 95% CI.

    Returns:
        Tuple ``(lower, upper)`` equal to ``mean - h`` and ``mean + h``, where
        ``h`` is the standard error of the mean scaled by the t quantile with
        ``n - 1`` degrees of freedom. With fewer than two observations the
        standard error is undefined and both bounds are NaN.
    """
    # Import the submodule explicitly: a bare `import scipy` does not load
    # `scipy.stats` on older SciPy releases, so `scipy.stats.sem` would only
    # work if some other import had already pulled the submodule in.
    from scipy import stats

    a = 1.0 * np.array(data)  # Promote to a float array
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    # Half-width of the interval: t quantile for a two-sided interval times the SEM
    h = se * stats.t.ppf((1 + confidence) / 2.0, n - 1)
    return m - h, m + h

In [3]:
# ---- Experiment configuration ----
question_numbers = list(range(1, 9))    # Numbers of questions from DASS to run through (1..8)
target = "anxiety_status"               # Label column selected from the labels table
models_to_train = 10                    # Question subsets (models) trained for each number of questions
models_per_question = 50                # Training repetitions per question subset
test_split = 0.1                        # Fraction of rows held out as the test set
seed = 42
random.seed(seed)                       # Seeds Python's `random`, used when sampling question subsets

# ---- Result accumulators (each name gets its own, independent list) ----
ACCS, AUCS, PRES, RECS, F1S = ([] for _ in range(5))
AUC_STDEV, F1_STDEV = [], []
AUC_95CI_U, AUC_95CI_D = [], []
F1_95CI_U, F1_95CI_D = [], []

# ---- Locations on disk ----
data_folder = "./data"
models_folder = "./models"

In [4]:
# Load the feature table and the label table from the data folder (features first, then labels).
feats_df, labels_df = (
    pd.read_csv(os.path.join(data_folder, csv_name))
    for csv_name in ("features.csv", "labels.csv")
)

# Question numbers available for selection; they are later expanded into
# feature column names of the form "Q<number>A_<0..3>".
questions = [20, 9, 30, 11, 19, 2, 36, 28, 4, 23]

In [5]:
def train_acc():
    """Fraction of training samples whose rounded model output equals the label.

    Takes no arguments so it can be passed as a metric callback; it reads the
    module-level ``model`` and ``dataset`` at call time.

    Returns:
        0-d float tensor with the mean of the element-wise matches.
    """
    predictions = torch.round(model(dataset['train_input'])[:, 0])
    targets = dataset['train_label'][:, 0]
    matches = (predictions == targets).float()
    return torch.mean(matches)

def test_acc():
    return torch.mean((torch.round(model(dataset['test_input'])[:,0]) == dataset['test_label'][:,0]).float())

def acc(formula, X, y):
    """Accuracy of a symbolic formula evaluated row by row on ``X``.

    For every row, the variables ``x_1 .. x_d`` (``d`` = number of columns of
    ``X``) are substituted with that row's values, the result is rounded and
    compared with ``y[i, 0]``. Substituting every column, rather than only
    ``x_1`` and ``x_2``, lets formulas over more than two inputs be evaluated;
    for two-column input the result is the same as before.

    Args:
        formula: Expression object exposing ``subs(name, value)`` whose fully
            substituted result can be converted to a float.
        X: 2-D indexable of inputs, shape ``(batch, d)``.
        y: 2-D indexable of labels; only column 0 is read.

    Returns:
        Number of matching rows divided by ``batch``.

    Raises:
        ZeroDivisionError: If ``X`` has no rows.
    """
    batch = X.shape[0]
    n_features = X.shape[1]
    correct = 0
    for i in range(batch):
        expr = formula
        # Variables are 1-indexed in the formula: column j maps to x_{j+1}
        for j in range(n_features):
            expr = expr.subs('x_{}'.format(j + 1), X[i, j])
        correct += np.round(np.array(expr).astype(np.float64)) == y[i, 0]
    return correct / batch

In [None]:
# For different numbers of questions from DASS-42
for num_questions in question_numbers:  # For question numbers in [1, 2, 3, 4, 5, 6, 7, 8]
    models = {}

    accs = []
    aucs = []
    pres = []
    recs = []
    f1s = []
    auc_stdev = []
    f1_stdev = []
    auc_95ci_u = []
    auc_95ci_d = []
    f1_95ci_u = []
    f1_95ci_d = []
    lst_comb =[]    

    model_num = 0
    for a in range(models_to_train):  # For a in range(10)
        model = {}

        print("Training model", a)
        cols = ["gender_m", "gender_f", "region_other", 
                    "region_east", "region_west", "age_norm"] # With demographic features

        if num_questions == 1:  # Only has 1 question
            if a >= len(questions):
                break
            question_nums = [questions[a]]  # a=0, question[0]=20, question_nums contains a list of question numbers
            
        else:  # More than 1 question
            # Draw num_questions distinct questions and sort them so that every
            # combination has one canonical (ascending) representation.
            question_nums = sorted(random.sample(questions, num_questions))
            # Resample while this combination was already used for the current
            # question count. Each redraw is sorted as well: lst_comb holds sorted
            # lists, so an unsorted redraw could slip past the membership test
            # even though the same set of questions had been used before.
            while question_nums in lst_comb:
                question_nums = sorted(random.sample(questions, num_questions))
            lst_comb.append(question_nums)
        # Finish sampling questions

        for q in question_nums:  # q is one of the selected questions
            for j in range(4):
                cols.append("Q{0}A_{1}".format(q, j))  # Generate the question numbers
        features = feats_df[cols]  # Get the features for the selected questions

        labels = labels_df[[target]].copy()

        # Re-seed with the fixed global seed so the same permutation, and therefore
        # the same held-out test rows, is produced for every model.
        np.random.seed(seed)
        shufId = np.random.permutation(int(len(labels)))
        index = int(test_split * len(labels.index))  # Number of rows held out for testing

        # The first `index` shuffled rows form the test set; all remaining rows go
        # to training/validation. The slice is open-ended (`index:`) because
        # `index:-1` would silently discard the last shuffled row.
        df_prist = features.iloc[shufId[:index]]        # Data for testing
        df_trainvalid = features.iloc[shufId[index:]]   # Data for training and validation

        gt_prist = labels.iloc[shufId[:index]]          # Labels for testing
        gt_trainvalid = labels.iloc[shufId[index:]]     # Labels for training and validation

        df_prist.to_csv(os.path.join(data_folder, "prist_features.csv"), index=False)
        gt_prist.to_csv(os.path.join(data_folder, "prist_labels.csv"), index=False)

        accs1 = []
        aucs1 = []
        pres1 = []
        recs1 = []
        f1s1 = []
        ensemble_models = []
        
        train_rmse = []
        test_rmse = []
        

        for b in range(models_per_question):  # b from 0 to 49
            if b % 10 == 0:
                print("Training iteration", b)

            # Seed with the repetition index so each repetition gets its own,
            # reproducible train/validation shuffle.
            np.random.seed(b)
            shufId = np.random.permutation(int(len(gt_trainvalid)))
            index = int((1/9) * len(gt_trainvalid.index))  # Number of rows used for validation

            # The first `index` shuffled rows are the validation set, the rest the
            # training set. The slice is open-ended (`index:`) because `index:-1`
            # would silently discard the last shuffled row.
            df_valid = df_trainvalid.iloc[shufId[:index]]
            df_train = df_trainvalid.iloc[shufId[index:]]

            gt_valid = gt_trainvalid.iloc[shufId[:index]]
            gt_train = gt_trainvalid.iloc[shufId[index:]]

            df_valid = df_valid.reset_index(drop=True)
            df_train = df_train.reset_index(drop=True)

            gt_valid = gt_valid.reset_index(drop=True)
            gt_train = gt_train.reset_index(drop=True)

            dataset = {}
            # Convert data to PyTorch tensors.
            # Labels are reshaped to (n_samples, 1) to line up with the model output.
            # The label frames are single-column DataFrames, so `.values` is already
            # 2-D and indexing it with `[:, None]` would give (n_samples, 1, 1),
            # which broadcasts against the predictions in the loss and in the
            # accuracy metrics instead of being compared element-wise.
            dataset['train_input'] = torch.from_numpy(df_train.values).float()
            dataset['train_label'] = torch.from_numpy(gt_train.values.reshape(-1, 1)).float()
            dataset['test_input'] = torch.from_numpy(df_prist.values).float()
            dataset['test_label'] = torch.from_numpy(gt_prist.values.reshape(-1, 1)).float()
            
            
            X = dataset['train_input']
            y = dataset['train_label']
            
            # Determine the number of features (input dimensions) and output dimensions
            input_dim = df_train.shape[1]  # Number of features
            output_dim = 1  # Assuming binary classification or regression for simplicity

            # Define the width of the network layers
            # Example: input layer, one hidden layer with 5 neurons, and output layer
            width = [input_dim, 5, output_dim] 
            
            # Initialize and train the KAN model
            model = KAN(
                        width=width,
                        grid=3, 
                        k=3   
                    )
            def train_acc():
                return torch.mean((torch.round(model(dataset['train_input'])[:,0]) == dataset['train_label'][:,0]).float())

            def test_acc():
                return torch.mean((torch.round(model(dataset['test_input'])[:,0]) == dataset['test_label'][:,0]).float())

            results = model.train(dataset, opt="LBFGS", steps=20, metrics=(train_acc, test_acc))
            lib = ['x','x^2','x^3','x^4','exp','log','sqrt','tanh','sin','tan','abs']
            model.auto_symbolic(lib=lib)
            formula = model.symbolic_formula()[0][0]
            print(formula)
            train_rmse.append(results['train_loss'][-1].item())
            test_rmse.append(results['test_loss'][-1].item())
            
            
