In [None]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from joblib import dump

INFO_CSV  = "OBF-Psychiatric-Dataset/schizophrenia-info.csv"
TS_FOLDER = "OBF-Psychiatric-Dataset/schizophrenia"

df_info = pd.read_csv(INFO_CSV).rename(columns={'number': 'patient_id'})
df_info['bprs'] = pd.to_numeric(df_info['bprs'], errors='coerce').fillna(0)
df_info = df_info.drop(columns=['acc_time'])


categorical_cols = ['gender', 'age', 'schtype', 'migraine', 'cloz', 'antipsych', 'moodst']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_info[col] = le.fit_transform(df_info[col].astype(str))
    encoders[col] = dict(zip(le.classes_, le.transform(le.classes_)))
dump(encoders, 'encoders.joblib')


records = []
for _, meta in df_info.set_index('patient_id').iterrows():
    pid = meta.name
    ts = pd.read_csv(
        os.path.join(TS_FOLDER, f"{pid}.csv"),
        parse_dates=['timestamp']
    )[['timestamp', 'activity']].copy()
    ts['prev_act'] = ts['activity'].shift(1).fillna(0)

    for _, row in ts.iterrows():
        rec = meta.to_dict()
        rec.update({
            'patient_id': pid,
            'timestamp':    row['timestamp'],
            'prev_act':     row['prev_act'],
            'activity':     row['activity']
        })
        records.append(rec)

df_model = pd.DataFrame(records)


numeric_cols = ['days', 'bprs', 'prev_act']
scaler = MinMaxScaler()
df_model[numeric_cols] = scaler.fit_transform(df_model[numeric_cols])
dump(scaler, 'scaler.joblib')


df_model = df_model.sort_values(['patient_id', 'timestamp'])
train_parts, val_parts, test_parts = [], [], []

for pid, group in df_model.groupby('patient_id', sort=False):
    n = len(group)
    n_train = int(n * 0.70)
    n_val   = int(n * 0.15)
    train_parts.append( group.iloc[:n_train] )
    val_parts.append(   group.iloc[n_train:n_train + n_val] )
    test_parts.append(  group.iloc[n_train + n_val:] )

train_df = pd.concat(train_parts)
val_df   = pd.concat(val_parts)
test_df  = pd.concat(test_parts)


def split_xy_ts(df):
    ts = df['timestamp']
    y  = df['activity']
    X  = df.drop(columns=['activity', 'timestamp'])
    return X, y, ts

X_train, y_train, t_train = split_xy_ts(train_df)
X_val,   y_val,   t_val   = split_xy_ts(val_df)
X_test,  y_test,  t_test  = split_xy_ts(test_df)

dump((X_train, y_train, t_train), 'train_set.joblib')
dump((X_val,   y_val,   t_val),   'val_set.joblib')
dump((X_test,  y_test,  t_test),  'test_set.joblib')



['test_set.joblib']

In [27]:
X_train

Unnamed: 0,gender,age,days,schtype,migraine,bprs,cloz,antipsych,moodst,patient_id,prev_act
0,1.0,4.0,0.5,1.0,1.0,0.813559,0.0,0.0,1.0,schizophrenia_1,0.000000
1,1.0,4.0,0.5,1.0,1.0,0.813559,0.0,0.0,1.0,schizophrenia_1,0.083125
2,1.0,4.0,0.5,1.0,1.0,0.813559,0.0,0.0,1.0,schizophrenia_1,0.020750
3,1.0,4.0,0.5,1.0,1.0,0.813559,0.0,0.0,1.0,schizophrenia_1,0.014500
4,1.0,4.0,0.5,1.0,1.0,0.813559,0.0,0.0,1.0,schizophrenia_1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
209705,1.0,8.0,0.5,0.0,1.0,0.983051,1.0,0.0,0.0,schizophrenia_9,0.058000
209706,1.0,8.0,0.5,0.0,1.0,0.983051,1.0,0.0,0.0,schizophrenia_9,0.049250
209707,1.0,8.0,0.5,0.0,1.0,0.983051,1.0,0.0,0.0,schizophrenia_9,0.037875
209708,1.0,8.0,0.5,0.0,1.0,0.983051,1.0,0.0,0.0,schizophrenia_9,0.061875
