# Erste Analyse

In [1]:
# imports
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# export options
from IPython.display import HTML

# Lift pandas' display limits so rendered frames show every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Stylesheet injected into the notebook output: horizontally scrollable
# dataframes plus a forced white background / black text for text elements.
_EXPORT_CSS = """
<style>
    /* Make dataframe scrollable */
    .dataframe {
        display: block;
        overflow-x: auto;
        white-space: nowrap;
    }
    /* Set text background to white and text color to black */
    body {
        background-color: white !important;
        color: black !important;
    }
    /* Ensure all text elements have a white background and black text */
    p, li, h1, h2, h3, h4, h5, h6, span, div {
        background-color: white !important;
        color: black !important;
    }
</style>
"""

# Last expression of the cell, so the HTML object is rendered as cell output.
HTML(_EXPORT_CSS)

In [3]:
# Important! The project root must be the current working directory:
# later cells open files through paths relative to it.
import os

from src.utils import get_project_root

# Keep the resolved root in a variable so it stays available after the chdir.
root = get_project_root()
os.chdir(root)

In [4]:
# get file_id to read dataset
json_file_path = 'credentials/file_ids.json'

# Open and parse the JSON file a single time, then take both ids from the
# same parsed mapping instead of re-reading the file once per key.
with open(json_file_path, 'r') as j:
    file_ids = json.load(j)

file_id_bl = file_ids['baseline']    # id of the baseline dataset file
file_id_fu = file_ids['follow_up']   # id of the follow-up dataset file

In [None]:
# read file
dwn_url='https://drive.google.com/uc?id='

# Both CSV files are parsed the same way: 'created_at' as a date column and
# 'answer_id' as the index.
csv_options = dict(parse_dates=['created_at'], index_col='answer_id')

bl = pd.read_csv(dwn_url + file_id_bl, **csv_options)  # baseline answers
fu = pd.read_csv(dwn_url + file_id_fu, **csv_options)  # follow-up answers

## Get some baseline facts

In [None]:
# n participants: number of distinct user ids in the baseline frame
bl['user_id'].nunique()

In [None]:
# n follow-up assessments: row count of the follow-up frame
len(fu)

## How many follow-up assessments over time?

In [None]:
print('Number of monthly active users in baseline')
# Year-month label derived from the creation timestamp; used as grouping key.
bl['YYYY-MM'] = bl.created_at.dt.strftime('%Y-%m')
# Select 'user_id' before aggregating so distinct values are counted for that
# one column only, rather than for every column of the frame.
bl.groupby('YYYY-MM')['user_id'].nunique()

In [None]:
print('n assessments per month in follow up')
# Year-month label derived from the creation timestamp of each follow-up row.
fu['YYYY-MM'] = fu['created_at'].dt.strftime('%Y-%m')
# Rows per month, ordered chronologically by the month label.
assessments_per_month = fu['YYYY-MM'].value_counts()
assessments_per_month.sort_index()

In [None]:
print('Number of monthly active users in follow up')
# Groups by the 'YYYY-MM' column that must already exist on `fu`.
# Select 'user_id' before aggregating so distinct values are counted for that
# one column only, rather than for every column of the frame.
fu.groupby('YYYY-MM')['user_id'].nunique()

## Wie häufig wurden die einzelnen Domänen (alle im Fragebogen) im Zeitverlauf ausgefüllt?

**Zwischen den Domänen gibt es keine großen Unterschiede im Zeitverlauf. Wenn es in einem Monat viele aktive Nutzer gab, dann wurden auch die jeweiligen Domänen entsprechend häufig ausgefüllt. Die unten stehende Tabelle gibt für jede Variable die Anzahl der vorhandenen (nicht fehlenden) Werte pro Jahr und Monat (YYYY-MM) an.**

In [None]:
# Year-month label used as the grouping key.
bl['YYYY-MM'] = bl['created_at'].dt.strftime('%Y-%m')


def _non_null_counts(month_frame):
    """Return, per column, how many values in ``month_frame`` are not null."""
    return month_frame.notnull().sum()


# One row per month, one column per variable, values = non-null counts.
res = bl.groupby('YYYY-MM').apply(_non_null_counts)
# Optional export of the table (disabled):
# res.to_excel('results/first_glance/non_null_count_per_month_and_variable.xlsx')
res

## Height distribution

In [None]:
# Kernel density estimate of the 'heigh' column of the baseline frame.
sns.displot(data=bl, x="heigh", kind="kde")
plt.show()

# Summary statistics of 'heigh' per 'geschlecht' group. The column is selected
# before describe() so statistics are computed for this column only, not for
# the whole frame.
bl.groupby('geschlecht')['heigh'].describe()

## Weight distribution

In [None]:
# Kernel density estimate of the 'weigh1' column of the baseline frame.
sns.displot(data=bl, x="weigh1", kind="kde")
plt.show()

# Summary statistics of 'weigh1' per 'geschlecht' group. The column is
# selected before describe() so statistics are computed for this column only,
# not for the whole frame.
bl.groupby('geschlecht')['weigh1'].describe()

## Sensor and app data

In [None]:
# Baseline: count of rows with a non-null altitude reading and its share of
# all baseline rows (reported as the "tracking permission ratio").
bl_altitude_count = bl['sensordata_altitude'].notnull().sum()
print('n Geographical data in baseline:\t', bl_altitude_count)
print()
print('tracking permission ratio:', bl_altitude_count/len(bl))
print('------------------------------------------------')
# Follow-up: the same two figures for the follow-up frame.
fu_altitude_count = fu['sensordata_altitude'].notnull().sum()
print('n Geographical data in followup:\t', fu_altitude_count)
print()
print('tracking permission ratio: \t', fu_altitude_count/len(fu))

In [None]:
# Baseline: count of rows with non-null app sensor data and its share of all
# baseline rows (reported as the "tracking permission ratio").
bl_apps_count = bl['sensordata_apps'].notnull().sum()
print('n permissions apps tracking in baseline:\t', bl_apps_count)
print()
print('tracking permission ratio', bl_apps_count/len(bl))
print('------------------------------------------------')
# Follow-up: the same two figures for the follow-up frame.
fu_apps_count = fu['sensordata_apps'].notnull().sum()
print('n permissions apps tracking in followup:\t', fu_apps_count)
print()
print('tracking permission ratio: \t', fu_apps_count/len(fu))

In [None]:
# Shell call: convert the notebook at src/d01_analyse/RQ1/Analyse.ipynb to
# HTML with nbconvert; --no-input excludes the cell inputs from the export.
# The notebook path is relative, so the result depends on the current working
# directory (presumably the project root set earlier — confirm).
!jupyter nbconvert --to html --no-input src/d01_analyse/RQ1/Analyse.ipynb