## Final model 

In [12]:
# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the combined dataset from disk.
df = pd.read_csv('../data/data_combined_final.csv')

# Strip the columns that are not used further: first the leading column
# (the saved index), then 'country'. Each drop returns a new frame rather
# than mutating in place.
df = df.drop(columns=df.columns[0]).drop(columns='country')

In [14]:
# Show the column labels that remain after the index and 'country' columns were dropped.
df.columns

Index(['Steps', 'Socialize (min)', 'Leave house', 'People contact',
       'Alcohol (bev)', 'Exercise', 'Stress', 'Worry (scale)',
       'Worry (finances)', 'Worry (health)', 'PHQ9', 'PANAS (NA)',
       'PANAS (PA)', 'Isolation', 'Latency', 'Wakes', 'Sleeptime (h)',
       'student', 'Age Group', 'Restfulness', 'Wakes (scale)', 'Nervousness',
       'Depression', 'Anxiety', 'Insomnia', 'Exercise (scale)',
       'Food Healthiness', 'Alcohol'],
      dtype='object')

In [18]:
# Inspect the 'Worry (scale)' column; the rows displayed at the tail are NaN,
# so this column contains missing values.
df['Worry (scale)']

0       4.000000
1       4.400000
2       5.100000
3       3.058824
4       3.485714
          ...   
2143         NaN
2144         NaN
2145         NaN
2146         NaN
2147         NaN
Name: Worry (scale), Length: 2148, dtype: float64

Steps = Number of steps taken in the day on average <br>
Socialize (min) = Number of minutes spent socializing on average  <br>
Leave house = Categorical value of whether the person left the house or not 1: Yes, 0: No  <br>
People contact = Number of people the person had contact with on average in a day  <br>
Alcohol (bev) = Number of alcoholic beverages consumed on average in a day  <br>
Exercise = Categorical value of whether the person exercised or not 1: Yes, 0: No  <br>
Stress = Categorical value of stress from 1 to 7 with increasing stress  <br>

## These can be excluded since they are COVID-related <br>
Worry (scale) = Categorical value of worry from 1 to 7 with increasing worry  <br>
Worry (finances) = Categorical value of worry about finances from 1 to 7 with increasing worry  <br>
       Q: How worried are you about the personal financial 
Worry (health) = Categorical value of worry about health from 1 to 7 with increasing worry  <br>

#check these: <br>
PHQ9 = Categorical value of depression from 1 to 7 with increasing depression <br>
PANAS (NA) = Categorical value of negative affect from 1 to 7 with increasing negative affect <br>
PANAS (PA) = Categorical value of positive affect from 1 to 7 with increasing positive affect <br>
Isolation = Categorical value of isolation (how isolated the person felt) from 1 to 7 with increasing isolation <br>
Latency = Minutes it took to fall asleep on average<br>
Wakes = Categorical value from 0 to 5, where 5 means more than 5 times <br>
Sleeptime (h) = Hours of sleep on average <br>
student = Categorical value of whether the person is a student or not 1: Yes, 0: No <br>
Age Group = Categorical value of age group <br>
       1: under 22 <br>
       2: 22-24 <br>
       3: 25-29 <br>
       4: 30-34 <br>
Restfulness = Categorical value of restfulness from 1 to 4: <br>
       Q : Do you feel you sleep enough? <br>
       1: Yes, nearly always <br>
       2: yes, often<br>
       3: Seldom or hardly ever<br>
       4: Can't say <br>
Wakes (scale) = Categorical value of how often the person has woken up during the night or had difficulty falling asleep <br>
       Q: Have you experienced any of the following during the past month: Waking up during the night or difficulty in falling asleep? <br>
       0: No <br>
       1: Occasionally <br>
       2: Weekly <br>
       3: Daily or almost daily <br>


Nervousness = Categorical value of nervousness from 0 to 3 with increasing nervousness <br>
       Q: Have you experienced any of the following during the past month: Nervous tension/nervousness <br>
       0: No <br>
       1: Occasionally <br>
       2: Weekly <br>
       3: Daily or almost daily <br>

Depression = Categorical value of depression from 0 to 3 with increasing depression <br>
       Q: Have you experienced any of the following during the past month: Depression or feeling low <br>
       0: No <br>
       1: Occasionally <br>
       2: Weekly <br>
       3: Daily or almost daily <br>

Anxiety = Categorical value of anxiety from 0 to 3 with increasing anxiety <br>
       Q: Have you experienced any of the following during the past month: Anxiety <br>
       0: No <br>
       1: Occasionally <br>
       2: Weekly <br>
       3: Daily or almost daily <br>

Insomnia = Categorical value of insomnia from 1 to 4 with increasing insomnia <br>
       Q: Have you recently lost sleep over worry? <br>
       1: No, not at all <br>
       2: No more than usual<br>
       3: Rather more than usual<br>
       4: Much more than usual<br>

Exercise (scale) = Categorical value of exercise from 0 to 5 with increasing exercise <br>
       Q: How often do you take more vigorous physical exercise at the minimum for 30 minutes at a time and becoming at least a bit out of breath and sweaty (e.g. jogging/fast walking, cycling, swimming, gymnastics, ball games)? <br>
       0: Not at all or very seldom <br>
       1: 1 - 3 times a month <br>
       2: Roughly once a week<br>
       3: 2 - 3 times a week<br>
       4: 4 - 6 times a week<br>
       5: Daily<br>
	
Food Healthiness = Categorical value of food healthiness from 0 to 2 with increasing food healthiness <br>
       Q: When buying/acquiring food, do you take health factors into account? <br>
       0: Never or very seldom <br>
       1: Occasionally <br>
       2: Often <br>




Alcohol = Categorical value of alcohol consumption from 0 to 5 with increasing alcohol consumption <br>
       Q: How often do you drink alcohol? <br>
       0: Never <br>
       1: Monthly or less often <br>
       2: 2 - 3 times a month <br>
       3: Once a week <br>
       4: 2 - 3 times a week <br>
       5: Four times a week or more often <br>




In [None]:
# Baseline model: the outcome is a categorical variable, so fit a logistic
# regression classifier and report its performance on the test split.
#
# LogisticRegression, confusion_matrix and metrics are imported in the
# import cell at the top of the notebook (no per-cell imports needed here).
#
# NOTE(review): X_train, X_test, y_train and y_test are not defined in any
# cell visible above this one. The target column must be chosen and the
# train/test split created before this cell can run on a fresh kernel.
# NOTE(review): 'Worry (scale)' shows NaN rows in its preview above;
# LogisticRegression rejects missing values, so if such columns are part of
# the features they presumably need to be imputed or dropped first - confirm.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# predict class labels for the held-out test set
y_pred = classifier.predict(X_test)

# evaluate: confusion matrix (rows = true class, columns = predicted class)
# followed by overall accuracy
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy:', metrics.accuracy_score(y_test, y_pred))

