In [49]:
import pandas as pd
from dataprep.eda import create_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime

In [50]:
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f']
complementary_colors = ['#a9d0f5', '#f3ba8c', '#98e698', '#f5a9a9', '#d8b5d8', '#d2a6a6', '#f5a9f2', '#d8d8d8']

full_palette = custom_colors + complementary_colors
sns.set_palette(full_palette)

plt.rcParams['axes.prop_cycle'] = plt.cycler(color=custom_colors)
plt.rcParams['axes.titlesize'] = 30
plt.rcParams['axes.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['legend.fontsize'] = 15

In [51]:
df = pd.read_pickle('..\data\depresjon\df_preprocessed.pkl')
df

Unnamed: 0,timestamp,Dates,Time,user_id,activity,days,gender,afftype,melanch,inpatient,marriage,work,madrs1,madrs2,madrs_mean
0,2003-05-07 12:00:00,2003-05-07,12:00:00,condition_1,346.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0
1,2003-05-07 13:00:00,2003-05-07,13:00:00,condition_1,284.566667,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0
2,2003-05-07 14:00:00,2003-05-07,14:00:00,condition_1,279.183333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0
3,2003-05-07 15:00:00,2003-05-07,15:00:00,condition_1,218.783333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0
4,2003-05-07 16:00:00,2003-05-07,16:00:00,condition_1,238.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,2004-06-10 11:00:00,2004-06-10,11:00:00,condition_9,0.000000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0
336,2004-06-10 12:00:00,2004-06-10,12:00:00,condition_9,14.600000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0
337,2004-06-10 13:00:00,2004-06-10,13:00:00,condition_9,511.316667,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0
338,2004-06-10 14:00:00,2004-06-10,14:00:00,condition_9,7.733333,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0


In [52]:
df['Dates'] = pd.to_datetime(df['Dates'])
df['DayName'] = pd.Series(df['Dates'].dt.day_name(), index=df.index)
df

Unnamed: 0,timestamp,Dates,Time,user_id,activity,days,gender,afftype,melanch,inpatient,marriage,work,madrs1,madrs2,madrs_mean,DayName
0,2003-05-07 12:00:00,2003-05-07,12:00:00,condition_1,346.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
1,2003-05-07 13:00:00,2003-05-07,13:00:00,condition_1,284.566667,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
2,2003-05-07 14:00:00,2003-05-07,14:00:00,condition_1,279.183333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
3,2003-05-07 15:00:00,2003-05-07,15:00:00,condition_1,218.783333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
4,2003-05-07 16:00:00,2003-05-07,16:00:00,condition_1,238.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,2004-06-10 11:00:00,2004-06-10,11:00:00,condition_9,0.000000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday
336,2004-06-10 12:00:00,2004-06-10,12:00:00,condition_9,14.600000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday
337,2004-06-10 13:00:00,2004-06-10,13:00:00,condition_9,511.316667,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday
338,2004-06-10 14:00:00,2004-06-10,14:00:00,condition_9,7.733333,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday


In [53]:
df['user_id'] = df['user_id'].str.replace('condition_', '')
df

Unnamed: 0,timestamp,Dates,Time,user_id,activity,days,gender,afftype,melanch,inpatient,marriage,work,madrs1,madrs2,madrs_mean,DayName
0,2003-05-07 12:00:00,2003-05-07,12:00:00,1,346.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
1,2003-05-07 13:00:00,2003-05-07,13:00:00,1,284.566667,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
2,2003-05-07 14:00:00,2003-05-07,14:00:00,1,279.183333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
3,2003-05-07 15:00:00,2003-05-07,15:00:00,1,218.783333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
4,2003-05-07 16:00:00,2003-05-07,16:00:00,1,238.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,19.0,19.0,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,2004-06-10 11:00:00,2004-06-10,11:00:00,9,0.000000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday
336,2004-06-10 12:00:00,2004-06-10,12:00:00,9,14.600000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday
337,2004-06-10 13:00:00,2004-06-10,13:00:00,9,511.316667,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday
338,2004-06-10 14:00:00,2004-06-10,14:00:00,9,7.733333,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,26.0,26.0,Thursday


In [54]:
def sin_transform(values):
    """
    Applies SIN transform to a series value.
    Args:
        values (pd.Series): A series to apply SIN transform on.
    Returns
        (pd.Series): The transformed series.
    """

    return np.sin(2 * np.pi * values / len(set(values)))


def cos_transform(values):
    """
    Applies COS transform to a series value.
    Args:
        values (pd.Series): A series to apply SIN transform on.
    Returns
        (pd.Series): The transformed series.
    """
    return np.cos(2 * np.pi * values / len(set(values)))


def date_engineering(data):
    # Ensure timestamp is in datetime format
    data['timestamp'] = pd.to_datetime(data['timestamp'])

    # Extract date components
    data["year"] = data["timestamp"].dt.year
    data["month"] = data["timestamp"].dt.month
    data["weekday"] = data["timestamp"].dt.weekday
    data["week"] = data["timestamp"].dt.isocalendar().week
    data["day"] = data["timestamp"].dt.day

    # Apply sin and cos transforms
    data["month_sin"] = sin_transform(data["month"])
    data["weekday_sin"] = sin_transform(data["weekday"])
    data["week_sin"] = sin_transform(data["week"])
    data["day_sin"] = sin_transform(data["day"])

    data["month_cos"] = cos_transform(data["month"])
    data["weekday_cos"] = cos_transform(data["weekday"])
    data["week_cos"] = cos_transform(data["week"])
    data["day_cos"] = cos_transform(data["day"])

    # Drop original date components
    data = data.drop(columns=['year', 'month', 'weekday', 'week', 'day'])

    return data

In [55]:
df = date_engineering(df)
df

Unnamed: 0,timestamp,Dates,Time,user_id,activity,days,gender,afftype,melanch,inpatient,...,madrs_mean,DayName,month_sin,weekday_sin,week_sin,day_sin,month_cos,weekday_cos,week_cos,day_cos
0,2003-05-07 12:00:00,2003-05-07,12:00:00,1,346.550000,11.0,2.0,2.0,2.0,2.0,...,19.0,Wednesday,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
1,2003-05-07 13:00:00,2003-05-07,13:00:00,1,284.566667,11.0,2.0,2.0,2.0,2.0,...,19.0,Wednesday,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
2,2003-05-07 14:00:00,2003-05-07,14:00:00,1,279.183333,11.0,2.0,2.0,2.0,2.0,...,19.0,Wednesday,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
3,2003-05-07 15:00:00,2003-05-07,15:00:00,1,218.783333,11.0,2.0,2.0,2.0,2.0,...,19.0,Wednesday,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
4,2003-05-07 16:00:00,2003-05-07,16:00:00,1,238.550000,11.0,2.0,2.0,2.0,2.0,...,19.0,Wednesday,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,2004-06-10 11:00:00,2004-06-10,11:00:00,9,0.000000,13.0,2.0,1.0,2.0,2.0,...,26.0,Thursday,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394
336,2004-06-10 12:00:00,2004-06-10,12:00:00,9,14.600000,13.0,2.0,1.0,2.0,2.0,...,26.0,Thursday,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394
337,2004-06-10 13:00:00,2004-06-10,13:00:00,9,511.316667,13.0,2.0,1.0,2.0,2.0,...,26.0,Thursday,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394
338,2004-06-10 14:00:00,2004-06-10,14:00:00,9,7.733333,13.0,2.0,1.0,2.0,2.0,...,26.0,Thursday,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394


In [56]:
df = df.drop(columns=['timestamp', 'Dates', 'Time', 'madrs1', 'madrs2', 'DayName'])
df

Unnamed: 0,user_id,activity,days,gender,afftype,melanch,inpatient,marriage,work,madrs_mean,month_sin,weekday_sin,week_sin,day_sin,month_cos,weekday_cos,week_cos,day_cos
0,1,346.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
1,1,284.566667,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
2,1,279.183333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
3,1,218.783333,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
4,1,238.550000,11.0,2.0,2.0,2.0,2.0,1.0,2.0,19.0,-0.974928,0.974928,-0.998027,0.988468,-0.222521,-0.222521,0.062791,0.151428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335,9,0.000000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394
336,9,14.600000,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394
337,9,511.316667,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394
338,9,7.733333,13.0,2.0,1.0,2.0,2.0,1.0,2.0,26.0,-0.781831,0.433884,-0.24869,0.897805,0.623490,-0.900969,0.968583,-0.440394


In [58]:
non_numeric_cols = df.select_dtypes(include=['object']).columns
print(non_numeric_cols)

Index(['user_id'], dtype='object')


In [59]:
unique_ids = df['user_id'].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.3, random_state=42)

train_df = df[df['user_id'].isin(train_ids)]
test_df = df[df['user_id'].isin(test_ids)]

train_df = train_df.drop(columns=['user_id'])
test_df = test_df.drop(columns=['user_id'])

In [61]:
X_train = train_df.drop('madrs_mean', axis=1)
y_train = train_df['madrs_mean']

X_test = test_df.drop('madrs_mean', axis=1)
y_test = test_df['madrs_mean']

random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred = random_forest_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Random Forest:\n MSE: {mse}, R²: {r2}')

Random Forest:
 MSE: 73.57992595220985, R²: -3.1665468699267025
