In [1]:
import pandas as pd
import pycaret
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from pycaret.classification import *
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_pickle("../data/depresjon/depresjon_preprocessed.pkl")
df

Unnamed: 0,timestamp,user_id,activity,days,gender,age,afftype,melanch,inpatient,edu,...,month_sin,weekday_sin,week_sin,day_sin,month_cos,weekday_cos,week_cos,day_cos,activity_quantile,user_activity_quantile
0,1052265600000000000,1,174.144444,11,0,0,2.0,2.0,2.0,1,...,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428,1,1
1,1052352000000000000,1,156.247222,11,0,0,2.0,2.0,2.0,1,...,-0.974928,0.433884,-0.998027,0.998717,-0.222521,-0.900969,0.062791,-0.050649,1,1
2,1052438400000000000,1,124.135417,11,0,0,2.0,2.0,2.0,1,...,-0.974928,-0.433884,-0.998027,0.968077,-0.222521,-0.900969,0.062791,-0.250653,1,1
3,1052524800000000000,1,134.961806,11,0,0,2.0,2.0,2.0,1,...,-0.974928,-0.974928,-0.998027,0.897805,-0.222521,-0.222521,0.062791,-0.440394,1,1
4,1052611200000000000,1,99.439583,11,0,0,2.0,2.0,2.0,1,...,-0.974928,-0.781831,-0.998027,0.790776,-0.222521,0.623490,0.062791,-0.612106,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,1086480000000000000,9,162.295833,13,0,1,1.0,2.0,2.0,1,...,-0.781831,-0.781831,-0.481754,0.937752,0.623490,0.623490,0.876307,0.347305,1,0
401,1086566400000000000,9,224.508333,13,0,1,1.0,2.0,2.0,1,...,-0.781831,0.000000,-0.24869,0.988468,0.623490,1.000000,0.968583,0.151428,1,2
402,1086652800000000000,9,202.056250,13,0,1,1.0,2.0,2.0,1,...,-0.781831,0.781831,-0.24869,0.998717,0.623490,0.623490,0.968583,-0.050649,1,2
403,1086739200000000000,9,168.656250,13,0,1,1.0,2.0,2.0,1,...,-0.781831,0.974928,-0.24869,0.968077,0.623490,-0.222521,0.968583,-0.250653,1,1


In [3]:
df['category_madrs'].value_counts()

0    201
1    188
Name: category_madrs, dtype: int64

In [4]:
def train_test_split_per_user(data, train_size=0.7):
    users = list(set(data.user_id))
    users = sorted(users, reverse=True)  # fix randomness
    total_users = len(users)
    slice = int(train_size * total_users)
    users_train = users[:slice]
    users_test = users[slice:]
    return data[data.user_id.isin(users_train)], data[data.user_id.isin(users_test)]

In [5]:
train_data, test_data = train_test_split_per_user(df)

fold_groups = train_data.user_id
train_data = train_data.drop(columns=['user_id'])
test_data = test_data.drop(columns=['user_id'])

In [6]:
X = train_data.drop(columns=['category_madrs'])
y = train_data['category_madrs']

# Apply SMOTE
smote = SMOTE(random_state=123)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine resampled data
train_data_resampled = pd.concat([X_resampled, y_resampled], axis=1)

In [7]:
s = setup(data=train_data_resampled, 
          target='category_madrs', 
          session_id=123, 
          fold_strategy='groupkfold', 
          fold=5,  
          fold_groups=fold_groups, 
          test_data=test_data, 
          normalize=True)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,category_madrs
2,Target type,Binary
3,Original data shape,"(431, 23)"
4,Transformed data shape,"(431, 23)"
5,Transformed train set shape,"(306, 23)"
6,Transformed test set shape,"(125, 23)"
7,Numeric features,22
8,Preprocess,True
9,Imputation type,simple


In [8]:
best = compare_models(sort = 'F1')

Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [9]:
predictions = predict_model(best, data=test_data)

AttributeError: 'list' object has no attribute 'predict'

In [None]:
from sklearn.metrics import f1_score

f1 = f1_score(predictions_df['category_madrs'], predictions_df['Label'])

print(f'F1 Score: {f1:.4f}')

In [None]:
# Extracting X_test (features) - this includes all columns except the label and prediction columns
X_test = predictions.drop(columns=['Label', 'Score'])

# Extracting y_test (true labels)
y_test = predictions['category_madrs']  # Adjust this to your actual target column name

# Extracting y_pred (predicted labels)
y_pred = predictions['Label']

In [None]:
print(y_pred.value_counts())
print(y_test.value_counts())

In [None]:
train_data.columns

In [None]:
test_data.columns

In [None]:
from sklearn.model_selection import cross_val_score

f1_scores = cross_val_score(best, X_train, y_train, cv=5, scoring='f1')
print(f1_scores)

In [None]:
y_pred.to_csv('../data/depresjon/predictions/depresjon_pred_generic.csv', index=False)
X_test.to_csv('../data/depresjon/predictions/depresjon_X_test_generic.csv', index=False)
y_test.to_csv('../data/depresjon/predictions/depresjon_y_test_generic.csv', index=False)

In [None]:
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

In [None]:
algo = create_model('gbc')

In [None]:
algo

In [None]:
#algo.get_all_params()

In [None]:
tuned_algo = tune_model(algo)

In [None]:
#tuned_algo.get_all_params()

In [None]:
tuned_algo

In [None]:
plot_model(tuned_algo, plot = 'auc')

In [None]:
plot_model(tuned_algo, plot='feature')

In [None]:
plot_model(tuned_algo, plot = 'confusion_matrix')

In [None]:
# Predict on test / hold-out Sample
test_predictions = predict_model(tuned_algo, data=test_data)