In [None]:
# Import libraries
from dateutil.parser import parse 
import matplotlib as mpl
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset into a DataFrame
df = pd.read_csv('./Data/dataset_mood_smartphone.csv')

# Drop the leftover 'Unnamed: 0' column; `df` itself is left untouched
data = df.drop(columns=['Unnamed: 0'])

# Make sure the 'time' column is of type datetime
data['time'] = pd.to_datetime(data['time'])

# The preview is the cell's last expression so that it is actually rendered:
# Jupyter only displays the final expression of a cell, so a bare `.head()`
# placed mid-cell is evaluated and silently discarded.
data.head()

In [None]:
from Preprocessing.datacleaning import remove_incorrect_values, convert_to_wide, DetectAnomalies

In [None]:
# Split the records into the rows that pass validation and the rows that were
# rejected (the criteria live in Preprocessing.datacleaning).
cleaning_result = remove_incorrect_values(data)
valid_df, removed_df = cleaning_result

# Reshape the validated records with convert_to_wide
# (presumably one column per measured variable — confirm in the helper).
new_df = convert_to_wide(valid_df)

In [None]:
# Detector configured with contamination=0.005
# (presumably the expected share of anomalies — confirm in DetectAnomalies).
anomaly_detector = DetectAnomalies(contamination=0.005)

# Columns screened for anomalies: activity, the appCat.* columns, and screen
columns = [
    'activity',
    'appCat.builtin',
    'appCat.communication',
    'appCat.entertainment',
    'appCat.finance',
    'appCat.game',
    'appCat.office',
    'appCat.other',
    'appCat.social',
    'appCat.travel',
    'appCat.unknown',
    'appCat.utilities',
    'appCat.weather',
    'screen',
]

# detect_anomalies yields three results; `clean_data` is the one used by the
# following cells.
anomaly_df, clean_data, anomaly_dict = anomaly_detector.detect_anomalies(new_df, columns)

# Visualise what the detector found
anomaly_detector.plot()

In [None]:
# Clear the name attached to the row index
clean_data.index.name = None

# Keep only the rows that have a value for the target ('mood'), then renumber
# the rows from 0.
has_target = clean_data['mood'].notna()
clean_data = clean_data[has_target].reset_index(drop=True)

# Count the remaining missing values per column (no imputation has run yet)
clean_data.isna().sum()


In [None]:
from Preprocessing.datacleaning import impute_with0, ImputeKNN, ImputeIterative

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Splitter configured for 5 folds
tscv = TimeSeriesSplit(n_splits=5)

# Only the final fold is kept as the train/test partition; the earlier folds
# are generated but not used.
# NOTE(review): the split is positional, so it is chronological only if the
# rows of clean_data are sorted by date — confirm the row order.
train_index, test_index = list(tscv.split(clean_data))[-1]
train_data = clean_data.iloc[train_index]
test_data = clean_data.iloc[test_index]

print(train_data.shape) # (1015, 21)
print(test_data.shape) # (202, 21)

In [None]:
# Everything except the key columns ('id', 'date') and the target ('mood')
# is eligible for imputation.
not_imputed = {'id', 'date', 'mood'}
cols_to_impute = [name for name in train_data.columns if name not in not_imputed]

# Zero imputation, applied to the train split first and then the test split
zero_train, zero_test = (
    impute_with0(split, cols_to_impute) for split in (train_data, test_data)
)

In [None]:
# Create one KNN imputer per split over the columns selected for imputation.
# NOTE(review): the test split gets its own imputer instead of reusing the one
# built from train_data; if ImputeKNN fits on the frame passed to it, test
# values are imputed from test-set statistics — confirm this is intended.
KNNimputer_train = ImputeKNN(train_data, cols_to_impute)
KNNimputer_test = ImputeKNN(test_data, cols_to_impute)

# Run the imputation for each split (must happen before join2full below)
KNN_train = KNNimputer_train.impute()
KNN_test = KNNimputer_test.impute()

# Join the imputed columns back onto the corresponding original split
KNN_train_df = KNNimputer_train.join2full(train_data)
KNN_test_df = KNNimputer_test.join2full(test_data)

In [None]:
# Create one iterative imputer per split over the columns selected for imputation.
# NOTE(review): the test split gets its own imputer instead of reusing the one
# built from train_data; if ImputeIterative fits on the frame passed to it,
# test values are imputed from test-set statistics — confirm this is intended.
ITimputer_train = ImputeIterative(train_data, cols_to_impute)
ITimputer_test = ImputeIterative(test_data, cols_to_impute)

# Run the imputation for each split (must happen before join2full below)
IT_train = ITimputer_train.impute()
IT_test = ITimputer_test.impute()

# Join the imputed columns back onto the corresponding original split
IT_train_df = ITimputer_train.join2full(train_data)
IT_test_df = ITimputer_test.join2full(test_data)

In [None]:
from Preprocessing.featureengineering import feature_engineering

In [None]:
# Run the same feature-engineering step on every imputation variant,
# train split first and test split second in each pair.

# Zero-imputed data
zero_train_fe, zero_test_fe = (
    feature_engineering(frame) for frame in (zero_train, zero_test)
)

# KNN-imputed data
KNN_train_fe, KNN_test_fe = (
    feature_engineering(frame) for frame in (KNN_train_df, KNN_test_df)
)

# Iteratively imputed data
IT_train_fe, IT_test_fe = (
    feature_engineering(frame) for frame in (IT_train_df, IT_test_df)
)

In [None]:
# Split the data into features (X) and target (y).
#
# Two candidate sources exist: the zero-imputed, feature-engineered frames and
# the splits without imputation. Exactly one is selected here; assigning both
# in sequence would make the second silently overwrite the first and leave the
# first set of assignments as dead code.
#
# False keeps the splits without imputation, which is what the X_*/y_*
# variables ended up holding before. Set to True to use the zero-imputed,
# feature-engineered frames instead.
# NOTE(review): with the default, the `_zero` suffix does not match the
# contents — confirm which variant downstream cells expect.
USE_ZERO_IMPUTED = False

if USE_ZERO_IMPUTED:
    train_source, test_source = zero_train_fe, zero_test_fe
else:
    # Or without imputation
    train_source, test_source = train_data, test_data

# Key columns and the target are not model features
non_feature_cols = ['id', 'date', 'mood']

X_train_zero = train_source.drop(columns=non_feature_cols)
y_train_zero = train_source['mood']
X_test_zero = test_source.drop(columns=non_feature_cols)
y_test_zero = test_source['mood']