# 1. Imports and data

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy
from scipy import stats
import seaborn as sns
import numpy as np
import warnings  # Supress warnings
from scipy.stats import shapiro
from sklearn.impute import KNNImputer
import math
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [8]:
daily_data_imputed = pd.read_csv("data_for_FE.csv")
daily_data_imputed, daily_data_imputed.columns

(      Unnamed: 0       id        date        screen  call  sms      mood  \
 0              0  AS14.01  2014-03-15   4873.140800   6.0  1.0  7.270000   
 1              1  AS14.01  2014-03-17   4873.140800   6.0  1.0  7.270000   
 2              2  AS14.01  2014-03-18   4535.069000   1.0  2.0  7.150000   
 3              3  AS14.01  2014-03-19   2879.106800   1.0  2.8  6.553333   
 4              4  AS14.01  2014-03-20   2275.944000   1.0  1.6  7.190000   
 ...          ...      ...         ...           ...   ...  ...       ...   
 1267        1267  AS14.33  2014-05-11   5336.354001   3.2  2.8  6.000000   
 1268        1268  AS14.33  2014-05-12  15521.871000   1.0  2.0  6.000000   
 1269        1269  AS14.33  2014-05-13  18801.167999   1.0  2.0  5.000000   
 1270        1270  AS14.33  2014-05-14   4149.179000   1.6  3.0  6.833333   
 1271        1271  AS14.33  2014-05-15  15883.185998   7.0  2.0  6.000000   
 
       circumplex.arousal  circumplex.valence  activity  appCat.builtin  \

In [10]:
daily_data_imputed = daily_data_imputed.drop('Unnamed: 0', axis=1)
daily_data_imputed

Unnamed: 0,id,date,screen,call,sms,mood,circumplex.arousal,circumplex.valence,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.other,appCat.social
0,AS14.01,2014-03-15,4873.140800,6.0,1.0,7.270000,-0.400000,0.620000,0.049001,441.5332,1728.0884,1855.3920,192.0246,876.7360
1,AS14.01,2014-03-17,4873.140800,6.0,1.0,7.270000,-0.400000,0.620000,0.049001,441.5332,1728.0884,1855.3920,192.0246,876.7360
2,AS14.01,2014-03-18,4535.069000,1.0,2.0,7.150000,-0.123333,0.716667,0.131613,1770.4486,3918.0898,692.5764,430.8046,703.4910
3,AS14.01,2014-03-19,2879.106800,1.0,2.8,6.553333,-0.020000,0.633333,0.150912,3301.2994,3603.9096,602.7210,69.0396,333.4178
4,AS14.01,2014-03-20,2275.944000,1.0,1.6,7.190000,0.036667,0.403333,0.081548,248.9790,2168.2290,350.8560,11.3450,807.7310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,AS14.33,2014-05-11,5336.354001,3.2,2.8,6.000000,0.000000,0.400000,0.011163,259.8930,504.5490,357.6370,53.5510,4154.9440
1268,AS14.33,2014-05-12,15521.871000,1.0,2.0,6.000000,-0.750000,0.500000,0.024017,1196.5320,1065.8850,474.8360,293.2330,4037.5670
1269,AS14.33,2014-05-13,18801.167999,1.0,2.0,5.000000,0.500000,0.000000,0.139964,2505.5300,4340.7720,1232.1030,39.5480,3313.2740
1270,AS14.33,2014-05-14,4149.179000,1.6,3.0,6.833333,-0.333333,0.400000,0.003986,268.9800,160.8820,1093.6140,100.9610,3754.7720


# 2. Convert 'date' to datetime and extract useful features

In [13]:
daily_data_imputed['date'] = pd.to_datetime(daily_data_imputed['date'])
daily_data_imputed['day_of_week'] = daily_data_imputed['date'].dt.dayofweek
daily_data_imputed['week_of_year'] = daily_data_imputed['date'].dt.isocalendar().week
daily_data_imputed['month'] = daily_data_imputed['date'].dt.month

In [15]:
daily_data_imputed

Unnamed: 0,id,date,screen,call,sms,mood,circumplex.arousal,circumplex.valence,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.other,appCat.social,day_of_week,week_of_year,month
0,AS14.01,2014-03-15,4873.140800,6.0,1.0,7.270000,-0.400000,0.620000,0.049001,441.5332,1728.0884,1855.3920,192.0246,876.7360,5,11,3
1,AS14.01,2014-03-17,4873.140800,6.0,1.0,7.270000,-0.400000,0.620000,0.049001,441.5332,1728.0884,1855.3920,192.0246,876.7360,0,12,3
2,AS14.01,2014-03-18,4535.069000,1.0,2.0,7.150000,-0.123333,0.716667,0.131613,1770.4486,3918.0898,692.5764,430.8046,703.4910,1,12,3
3,AS14.01,2014-03-19,2879.106800,1.0,2.8,6.553333,-0.020000,0.633333,0.150912,3301.2994,3603.9096,602.7210,69.0396,333.4178,2,12,3
4,AS14.01,2014-03-20,2275.944000,1.0,1.6,7.190000,0.036667,0.403333,0.081548,248.9790,2168.2290,350.8560,11.3450,807.7310,3,12,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,AS14.33,2014-05-11,5336.354001,3.2,2.8,6.000000,0.000000,0.400000,0.011163,259.8930,504.5490,357.6370,53.5510,4154.9440,6,19,5
1268,AS14.33,2014-05-12,15521.871000,1.0,2.0,6.000000,-0.750000,0.500000,0.024017,1196.5320,1065.8850,474.8360,293.2330,4037.5670,0,20,5
1269,AS14.33,2014-05-13,18801.167999,1.0,2.0,5.000000,0.500000,0.000000,0.139964,2505.5300,4340.7720,1232.1030,39.5480,3313.2740,1,20,5
1270,AS14.33,2014-05-14,4149.179000,1.6,3.0,6.833333,-0.333333,0.400000,0.003986,268.9800,160.8820,1093.6140,100.9610,3754.7720,2,20,5


# 3. Creating multiple lagged features for mood, arousal, and valence

In [18]:
lags = [1, 2, 3, 7]  # defining the lags we want to create
for lag in lags:
    daily_data_imputed[f'mood_lag{lag}'] = daily_data_imputed.groupby('id')['mood'].shift(lag)
    daily_data_imputed[f'arousal_lag{lag}'] = daily_data_imputed.groupby('id')['circumplex.arousal'].shift(lag)
    daily_data_imputed[f'valence_lag{lag}'] = daily_data_imputed.groupby('id')['circumplex.valence'].shift(lag)

# Display the dataset with new lagged features to verify
daily_data_imputed[['date', 'mood', 'mood_lag1', 'mood_lag2', 'mood_lag3', 'mood_lag7',
      'arousal_lag1', 'arousal_lag2', 'arousal_lag3', 'arousal_lag7',
      'valence_lag1', 'valence_lag2', 'valence_lag3', 'valence_lag7']].head(10)

Unnamed: 0,date,mood,mood_lag1,mood_lag2,mood_lag3,mood_lag7,arousal_lag1,arousal_lag2,arousal_lag3,arousal_lag7,valence_lag1,valence_lag2,valence_lag3,valence_lag7
0,2014-03-15,7.27,,,,,,,,,,,,
1,2014-03-17,7.27,7.27,,,,-0.4,,,,0.62,,,
2,2014-03-18,7.15,7.27,7.27,,,-0.4,-0.4,,,0.62,0.62,,
3,2014-03-19,6.553333,7.15,7.27,7.27,,-0.123333,-0.4,-0.4,,0.716667,0.62,0.62,
4,2014-03-20,7.19,6.553333,7.15,7.27,,-0.02,-0.123333,-0.4,,0.633333,0.716667,0.62,
5,2014-03-21,6.2,7.19,6.553333,7.15,,0.036667,-0.02,-0.123333,,0.403333,0.633333,0.716667,
6,2014-03-22,6.4,6.2,7.19,6.553333,,0.2,0.036667,-0.02,,0.2,0.403333,0.633333,
7,2014-03-23,6.8,6.4,6.2,7.19,7.27,0.6,0.2,0.036667,-0.4,0.5,0.2,0.403333,0.62
8,2014-03-24,6.0,6.8,6.4,6.2,7.27,0.2,0.6,0.2,-0.4,0.8,0.5,0.2,0.62
9,2014-03-26,6.6,6.0,6.8,6.4,7.15,0.8,0.2,0.6,-0.123333,0.0,0.8,0.5,0.716667


In [20]:
daily_data_imputed = daily_data_imputed.dropna()

In [22]:
daily_data_imputed

Unnamed: 0,id,date,screen,call,sms,mood,circumplex.arousal,circumplex.valence,activity,appCat.builtin,...,valence_lag1,mood_lag2,arousal_lag2,valence_lag2,mood_lag3,arousal_lag3,valence_lag3,mood_lag7,arousal_lag7,valence_lag7
7,AS14.01,2014-03-23,6773.832001,2.4,3.6,6.800000,0.200000,0.8,0.142741,1286.246,...,0.5,6.2,0.20,0.2,7.19,0.036667,0.403333,7.270000,-0.400000,0.620000
8,AS14.01,2014-03-24,15047.351001,10.0,3.2,6.000000,0.800000,0.0,0.078961,866.956,...,0.8,6.4,0.60,0.5,6.20,0.200000,0.200000,7.270000,-0.400000,0.620000
9,AS14.01,2014-03-26,16423.801000,2.0,4.0,6.600000,-0.200000,0.6,0.101308,1167.497,...,0.0,6.8,0.20,0.8,6.40,0.600000,0.500000,7.150000,-0.123333,0.716667
10,AS14.01,2014-03-27,17442.149999,2.0,1.0,7.000000,0.200000,0.8,0.159511,1229.327,...,0.6,6.0,0.80,0.0,6.80,0.200000,0.800000,6.553333,-0.020000,0.633333
11,AS14.01,2014-03-28,4923.489000,5.0,2.0,6.400000,-0.600000,0.6,0.095698,10062.595,...,0.8,6.6,-0.20,0.6,6.00,0.800000,0.000000,7.190000,0.036667,0.403333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1267,AS14.33,2014-05-11,5336.354001,3.2,2.8,6.000000,0.000000,0.4,0.011163,259.893,...,1.0,6.6,-0.80,0.4,5.00,-0.250000,-0.500000,6.970000,0.290000,0.300000
1268,AS14.33,2014-05-12,15521.871000,1.0,2.0,6.000000,-0.750000,0.5,0.024017,1196.532,...,0.4,7.6,-0.40,1.0,6.60,-0.800000,0.400000,6.900000,0.206667,0.400000
1269,AS14.33,2014-05-13,18801.167999,1.0,2.0,5.000000,0.500000,0.0,0.139964,2505.530,...,0.5,6.0,0.00,0.4,7.60,-0.400000,1.000000,8.333333,0.000000,1.000000
1270,AS14.33,2014-05-14,4149.179000,1.6,3.0,6.833333,-0.333333,0.4,0.003986,268.980,...,0.0,6.0,-0.75,0.5,6.00,0.000000,0.400000,7.333333,-1.666667,0.666667


# 4. Calculating rolling window features for mood: mean, median, std, min, and max over a 3-day and 7-day period

In [25]:
daily_data_imputed['mood_mean_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).mean())
daily_data_imputed['mood_median_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).median())
daily_data_imputed['mood_std_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).std())
daily_data_imputed['mood_min_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).min())
daily_data_imputed['mood_max_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).max())

daily_data_imputed['mood_mean_7d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(7).mean())
daily_data_imputed['mood_median_7d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(7).median())
daily_data_imputed['mood_std_7d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(7).std())
daily_data_imputed['mood_min_7d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(7).min())
daily_data_imputed['mood_max_7d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(7).max())
# The first 2 entries for each id will have NaN values because there's not enough data to calculate rolling statistics.
data_cleaned = daily_data_imputed.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_data_imputed['mood_mean_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_data_imputed['mood_median_3d'] = daily_data_imputed.groupby('id')['mood'].transform(lambda x: x.rolling(3).median())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Unnamed: 0,id,date,screen,call,sms,mood,circumplex.arousal,circumplex.valence,activity,appCat.builtin,...,mood_mean_3d,mood_median_3d,mood_std_3d,mood_min_3d,mood_max_3d,mood_mean_7d,mood_median_7d,mood_std_7d,mood_min_7d,mood_max_7d
13,AS14.01,2014-03-30,4523.214001,4.2,3.4,7.5,-0.5,0.75,0.049093,414.365,...,7.3,7.5,0.818535,6.4,8.0,6.9,6.8,0.675771,6.0,8.0
14,AS14.01,2014-03-31,11836.834,6.0,2.0,7.4,0.0,0.6,0.070505,2961.405,...,7.633333,7.5,0.321455,7.4,8.0,6.985714,7.0,0.698638,6.0,8.0
15,AS14.01,2014-04-01,17173.906002,1.0,1.6,6.0,0.2,0.0,0.118443,1385.517,...,6.966667,7.4,0.83865,6.0,7.5,6.985714,7.0,0.698638,6.0,8.0
16,AS14.01,2014-04-04,12379.411,2.0,1.0,6.2,0.2,0.4,0.126689,1957.829,...,6.533333,6.2,0.757188,6.0,7.4,6.928571,7.0,0.749921,6.0,8.0
17,AS14.01,2014-04-05,3340.431,2.0,3.2,6.8,-0.4,0.6,0.119002,15159.19,...,6.333333,6.2,0.416333,6.0,6.8,6.9,6.8,0.750555,6.0,8.0


In [29]:
missing_values_count = data_cleaned.isnull().sum()
total_missing_values = data_cleaned.isnull().sum().sum()
missing_values_count, total_missing_values

(id                      0
 date                    0
 screen                  0
 call                    0
 sms                     0
 mood                    0
 circumplex.arousal      0
 circumplex.valence      0
 activity                0
 appCat.builtin          0
 appCat.communication    0
 appCat.entertainment    0
 appCat.other            0
 appCat.social           0
 day_of_week             0
 week_of_year            0
 month                   0
 mood_lag1               0
 arousal_lag1            0
 valence_lag1            0
 mood_lag2               0
 arousal_lag2            0
 valence_lag2            0
 mood_lag3               0
 arousal_lag3            0
 valence_lag3            0
 mood_lag7               0
 arousal_lag7            0
 valence_lag7            0
 mood_mean_3d            0
 mood_median_3d          0
 mood_std_3d             0
 mood_min_3d             0
 mood_max_3d             0
 mood_mean_7d            0
 mood_median_7d          0
 mood_std_7d             0
 

In [31]:
data_cleaned_updated = data_cleaned

# 5. Normalization & Scale

In [34]:
# Normalizing continuous features using MinMaxScaler
min_max_scaler = MinMaxScaler()
data_cleaned_updated[['screen', 'activity']] = min_max_scaler.fit_transform(
    data_cleaned_updated[['screen', 'activity']])

# App usage columns to be normalized
app_usage_columns = ['appCat.builtin', 'appCat.communication', 'appCat.entertainment', 
                     'appCat.other', 'appCat.social']

# Calculating the sum for normalization
data_cleaned_updated['app_usage_total'] = data_cleaned_updated[app_usage_columns].sum(axis=1)

# Normalizing app usage data
for col in app_usage_columns:
    data_cleaned_updated[col + '_normalized'] = data_cleaned_updated[col] / data_cleaned_updated['app_usage_total']

# Standardizing mood lag features
mood_lag_columns = ['mood_lag1', 'mood_lag2', 'mood_lag3', 'mood_lag7']
scaler = StandardScaler()
data_cleaned_updated[mood_lag_columns] = scaler.fit_transform(data_cleaned_updated[mood_lag_columns])

# Creating cyclical features for day of the week and month
data_cleaned_updated['day_of_week_sin'] = np.sin(2 * np.pi * data_cleaned_updated['day_of_week']/7)
data_cleaned_updated['day_of_week_cos'] = np.cos(2 * np.pi * data_cleaned_updated['day_of_week']/7)
data_cleaned_updated['month_sin'] = np.sin(2 * np.pi * data_cleaned_updated['month']/12)
data_cleaned_updated['month_cos'] = np.cos(2 * np.pi * data_cleaned_updated['month']/12)

# Define high activity as being the top quartile of activity data
high_activity_threshold = data_cleaned_updated['activity'].quantile(0.75)
data_cleaned_updated['high_activity'] = (data_cleaned_updated['activity'] >= high_activity_threshold).astype(int)

# Displaying the head of the dataset to check the transformations
data_cleaned_updated.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned_updated[['screen', 'activity']] = min_max_scaler.fit_transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned_updated['app_usage_total'] = data_cleaned_updated[app_usage_columns].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned_updated[col + '_norma

Unnamed: 0,id,date,screen,call,sms,mood,circumplex.arousal,circumplex.valence,activity,appCat.builtin,...,appCat.builtin_normalized,appCat.communication_normalized,appCat.entertainment_normalized,appCat.other_normalized,appCat.social_normalized,day_of_week_sin,day_of_week_cos,month_sin,month_cos,high_activity
13,AS14.01,2014-03-30,0.233209,4.2,3.4,7.5,-0.5,0.75,0.110931,414.365,...,0.118721,0.372039,0.439992,0.021203,0.048044,-0.781831,0.62349,1.0,6.123234000000001e-17,0
14,AS14.01,2014-03-31,0.610632,6.0,2.0,7.4,0.0,0.6,0.159315,2961.405,...,0.253715,0.456868,0.150693,0.023008,0.115716,0.0,1.0,1.0,6.123234000000001e-17,0
15,AS14.01,2014-04-01,0.886055,1.0,1.6,6.0,0.2,0.0,0.267636,1385.517,...,0.101333,0.44581,0.075378,0.008863,0.368616,0.781831,0.62349,0.866025,-0.5,0
16,AS14.01,2014-04-04,0.638632,2.0,1.0,6.2,0.2,0.4,0.286269,1957.829,...,0.174065,0.487356,0.137382,0.015719,0.185478,-0.433884,-0.900969,0.866025,-0.5,0
17,AS14.01,2014-04-05,0.172171,2.0,3.2,6.8,-0.4,0.6,0.2689,15159.19,...,0.863019,0.107337,0.004348,0.004619,0.020678,-0.974928,-0.222521,0.866025,-0.5,0


# 6. Calculate the quantile values to split the data into three equal parts

In [37]:
quantiles = data_cleaned_updated['mood'].quantile([1/3, 2/3]).values

data_cleaned_updated['mood_category'] = pd.cut(data_cleaned_updated['mood'], bins=[data_cleaned_updated['mood'].min(), quantiles[0], quantiles[1], data_cleaned_updated['mood'].max()], 
                                 labels=['low', 'medium', 'high'], include_lowest=True)

data_cleaned_updated[['mood', 'mood_category']].head(), data_cleaned_updated['mood_category'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned_updated['mood_category'] = pd.cut(data_cleaned_updated['mood'], bins=[data_cleaned_updated['mood'].min(), quantiles[0], quantiles[1], data_cleaned_updated['mood'].max()],


(    mood mood_category
 13   7.5          high
 14   7.4          high
 15   6.0           low
 16   6.2           low
 17   6.8           low,
 low       329
 medium    300
 high      292
 Name: mood_category, dtype: int64)

In [39]:
data_cleaned_updated.to_csv('data_for_train.csv')