# Stress Prediction Model

### Library Import

In [1]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.sparse import hstack
from scipy.optimize import minimize
from matplotlib import pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook, tqdm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, r2_score
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn import model_selection
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import pickle

#from catboost import CatBoostRegressor
#import category_encoders as ce
#import xgboost as xgb
#import lightgbm as lgb
#from lightgbm import LGBMRegressor
warnings.filterwarnings('ignore')

-----

## 1. LifeSnaps

### Data Import

In [7]:
train = pd.read_csv('lifesnaps.csv')
print('taining data shape:',train.shape)

taining data shape: (7410, 63)


-----

### Data Preprocess

In [8]:
train_clean = train[['id', 'date', 'stress_score', 'daily_temperature_variation', 'calories', 'distance', 'steps', 'gender']]

In [9]:
train_clean['gender'] = train_clean['gender'].apply(lambda x: 0 if x == 'MALE' else 1)

In [10]:
train_clean=train_clean[train_clean['stress_score'].isnull()==False]

In [11]:
train_clean['stress_score'] = train_clean['stress_score'].apply(lambda x: x/10)

In [12]:
train_clean['stress_score'] = train_clean['stress_score'].apply(lambda x: round(x))

In [13]:
df_train=train_clean.copy()

# ['id', 'date', 'stress_score', 'daily_temperature_variation', 'calories', 'distance', 'steps', 'gender']
df_train=df_train[df_train['date'].isnull()==False]  # removing null values
df_train=df_train[df_train['stress_score'].isnull()==False]
df_train=df_train[df_train['daily_temperature_variation'].isnull()==False]
df_train=df_train[df_train['calories'].isnull()==False]
df_train=df_train[df_train['distance'].isnull()==False]
df_train=df_train[df_train['steps'].isnull()==False]
df_train=df_train[df_train['gender'].isnull()==False]

In [14]:
cols = ['stress_score', 'daily_temperature_variation', 'calories', 'distance', 'steps', 'gender']
df =df_train[cols]
xtrain, xtest, ytrain, ytest = train_test_split(df.loc[:, df.columns != "stress_score"],
                                                df.loc[:, df.columns == "stress_score"],
                                                test_size=0.3, random_state=88)

-----

### SVR Model + Cross-Validation

In [15]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.svm import SVR  # Use SVR for regression
from sklearn.metrics import mean_absolute_error

svr_model = SVR(kernel='rbf')

svr_model.fit(xtrain, ytrain)

train_predictions = cross_val_predict(svr_model, xtrain, ytrain, cv=5)
test_predictions = cross_val_predict(svr_model, xtest, ytest, cv=5)

train_mae = mean_absolute_error(ytrain, train_predictions)
test_mae = mean_absolute_error(ytest, test_predictions)

print("SVR - Mean Absolute Error for train (5-fold CV):", round(train_mae, 2))
print("SVR - Mean Absolute Error for test (5-fold CV):", round(test_mae, 2))

SVR - Mean Absolute Error for train (5-fold CV): 1.51
SVR - Mean Absolute Error for test (5-fold CV): 1.3


-----

## 2. Data Collection + LifeSnaps

### Data Import

In [42]:
train2 = pd.read_csv('data_collection.csv')
print('taining data shape:',train2.shape)

taining data shape: (393, 15)


-----

### Data Preprocess

In [43]:
train_clean2 = train2[['id', 'date', 'stress_score', 'daily_temperature_variation', 'calories', 'distance', 'steps', 'gender']]

In [46]:
df_train2=train_clean2.copy()

# ['id', 'date', 'stress_score', 'daily_temperature_variation', 'calories', 'distance', 'steps', 'gender']
df_train2=df_train2[df_train2['stress_score'].isnull()==False]
df_train2=df_train2[df_train2['daily_temperature_variation'].isnull()==False]
df_train2=df_train2[df_train2['calories'].isnull()==False]
df_train2=df_train2[df_train2['distance'].isnull()==False]
df_train2=df_train2[df_train2['steps'].isnull()==False]
df_train2=df_train2[df_train2['gender'].isnull()==False]
df_train2.drop(df_train2[df_train2['distance'] == '#VALUE!'].index, inplace=True)
df_train2['steps'] = df_train2['steps'].replace({',': ''}, regex=True).astype(float)

------

### Mix Two(LifeSnaps, Data Collection) DataFrame & Shuffle

In [47]:
mixed_df = pd.concat([df_train, df_train2], axis=0, ignore_index=False)
result_df = mixed_df.sample(frac=1, random_state=42)

In [48]:
result_df

Unnamed: 0,id,date,stress_score,daily_temperature_variation,calories,distance,steps,gender
7208,621e36f967b776a240e5e7c9,2021-07-16,8,-2.674622,3352.75,19139.6,27772.0,1.0
4452,621e33b067b776a240f39e56,2021-07-06,8,-1.269686,2820.23,5407.1,7146.0,0.0
6089,621e351a67b776a240f6204b,2021-06-28,0,-1.508792,2999.75,4361.4,7654.0,0.0
825,621e2f5767b776a240d8f9d6,2021-11-21,9,-2.383186,2374.47,11769.9,17413.0,1.0
1227,621e2f9167b776a240011ccb,2022-01-16,8,-1.696735,1614.49,2184.3,3120.0,1.0
...,...,...,...,...,...,...,...,...
4354,621e339967b776a240e502de,2021-11-26,8,-0.986472,1803.91,8336.7,10049.0,1.0
4546,621e33cf67b776a240087de9,2022-01-05,7,-2.007019,3482.40,8183.6,11041.0,0.0
3412,621e32af67b776a24045b4cf,2021-07-25,8,-0.464637,2672.12,5345.0,7483.0,0.0
6088,621e351a67b776a240f6204b,2021-06-27,8,-1.128925,1912.76,1744.7,2424.0,0.0


In [49]:
cols = ['stress_score', 'daily_temperature_variation', 'calories', 'distance', 'steps', 'gender']
df = result_df[cols]
xtrain, xtest, ytrain, ytest = train_test_split(df.loc[:, df.columns != "stress_score"],
                                                df.loc[:, df.columns == "stress_score"],
                                                test_size=0.3, random_state=88)

-----

### SVR Model + Cross-Validation

In [50]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

svr_model = SVR(kernel='rbf')

svr_model.fit(xtrain, ytrain)

train_predictions = cross_val_predict(svr_model, xtrain, ytrain, cv=5)
test_predictions = cross_val_predict(svr_model, xtest, ytest, cv=5)

train_mae = mean_absolute_error(ytrain, train_predictions)
test_mae = mean_absolute_error(ytest, test_predictions)

print("SVR - Mean Absolute Error for train (5-fold CV):", round(train_mae, 2))
print("SVR - Mean Absolute Error for test (5-fold CV):", round(test_mae, 2))

SVR - Mean Absolute Error for train (5-fold CV): 1.57
SVR - Mean Absolute Error for test (5-fold CV): 1.5
