 # Can we observe different worst symptoms between countries?

In [1]:
# add src to path so the noteboook can import utilities.py
import sys
sys.path.append('C:\\Users\\joa24jm\\Documents\\tinnitus-country\\src\\d00_utils')

In [2]:
# project location
p_loc = 'C:/Users/joa24jm/Documents/tinnitus-country/'

In [3]:
# imports
import pandas as pd
import utilities as u

In [4]:
# read in df
df = pd.read_csv(p_loc + 'data/02_intermediate/baseline.csv', index_col = 'Unnamed: 0',
                parse_dates = ['4', '9', 'created_at'],na_values = ['??.??.????', '27.02.2522']
                )
# meta = pd.read_csv(p_loc + 'data/01_raw/users_metadata.csv', delimiter = ';')


In [5]:
# vlookup for country
# df = pd.merge(meta[['user_id', 'country']], df, left_on='user_id', right_index=True, how = 'right')

In [6]:
# add season of answer
df['season'] = df.created_at.apply(u.get_season)

In [7]:
df.columns

Index(['4', '5', '6', '7', '9', '10', '11', '13', '35', '36', '37', '38', '40',
       '24', '31', '3', '32', '26', '34', '30', '29', '25', '33', '27', '2',
       '28', '39', '12', '8', '65', '17', '66', '67', '68', '69', '70', '71',
       '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83',
       '84', '85', 'created_at', 'user_id', 'country', 'season'],
      dtype='object')

##### Now we want to get the distribution of the worst symptom for countries with more than 100 users


In [8]:
# get all countries with more than 100 users
u_treshold = 100

s = df.groupby('country').nunique()['user_id'] > u_treshold
u_countries = s[s].index.tolist()

In [9]:
u_countries

['CA', 'CH', 'DE', 'GB', 'NL', 'US']

In [10]:
symptoms = df['24'].value_counts()[:-1].index.tolist()

In [11]:
# rename col for better readabiliy
df.rename(columns={'24':'worst_symptom'}, inplace = True)

In [12]:
# reduce df to only these countries and symptoms
sub_df = df[df.country.isin(u_countries) & df['worst_symptom'].isin(symptoms)]

In [13]:
# crosstab for statistics
ct = pd.crosstab(sub_df['worst_symptom'], sub_df.country, normalize = 'columns')
# format ct for better readability
ct.style.format('{:,.2%}')

country,CA,CH,DE,GB,NL,US
worst_symptom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",0.00%,2.63%,6.49%,2.99%,5.14%,5.96%
Because of the tinnitus I am more sensitive to environmental noises.,11.57%,12.28%,9.54%,4.48%,6.86%,8.01%
Because of the tinnitus it is difficult to concentrate.,11.57%,6.14%,12.60%,6.97%,13.71%,15.46%
"Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.",10.74%,11.40%,14.05%,14.43%,9.71%,16.01%
Because of the tinnitus it is hard for me to get to sleep.,9.92%,10.53%,10.61%,16.42%,8.57%,13.78%
I am feeling depressed because of the tinnitus.,8.26%,10.53%,7.79%,18.41%,14.29%,9.68%
I don't have any of these symptoms.,12.40%,20.18%,10.46%,4.98%,6.29%,8.19%
I find it harder to relax because of the tinnitus.,25.62%,19.30%,17.48%,21.89%,21.71%,14.15%
I have strong worries because of the tinnitus.,9.92%,7.02%,10.99%,9.45%,13.71%,8.75%


In [14]:
# reformat ct in a scientific way
# ct = u.format_ct(ct)
ct

country,CA,CH,DE,GB,NL,US
worst_symptom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",0.0,0.026316,0.064885,0.029851,0.051429,0.05959
Because of the tinnitus I am more sensitive to environmental noises.,0.115702,0.122807,0.09542,0.044776,0.068571,0.080074
Because of the tinnitus it is difficult to concentrate.,0.115702,0.061404,0.125954,0.069652,0.137143,0.154562
"Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.",0.107438,0.114035,0.140458,0.144279,0.097143,0.160149
Because of the tinnitus it is hard for me to get to sleep.,0.099174,0.105263,0.106107,0.164179,0.085714,0.137803
I am feeling depressed because of the tinnitus.,0.082645,0.105263,0.077863,0.18408,0.142857,0.096834
I don't have any of these symptoms.,0.123967,0.201754,0.10458,0.049751,0.062857,0.081937
I find it harder to relax because of the tinnitus.,0.256198,0.192982,0.174809,0.218905,0.217143,0.141527
I have strong worries because of the tinnitus.,0.099174,0.070175,0.109924,0.094527,0.137143,0.087523


In [15]:
# export crosstab to excel-csv
# ct.to_csv(p_loc + 'results/01_tables/ct_worst_symptom_country_unformatted.csv')

### Further split up this crosstab using season

In [16]:
# crosstab for statistics
ct = pd.crosstab([sub_df['worst_symptom'],sub_df['season']], sub_df.country, normalize = 'columns')
# reindex in a logical order
ct = ct.reindex(['spring', 'summer', 'autumn', 'winter'], level = 'season')
# format ct for better readability
ct[['CH','DE','GB','NL','US']].style.format('{:,.2%}')
# ct = u.format_ct(ct)


Unnamed: 0_level_0,country,CH,DE,GB,NL,US
worst_symptom,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",spring,0.00%,1.91%,0.50%,1.14%,0.93%
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",summer,0.00%,1.37%,1.00%,1.14%,1.86%
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",autumn,1.75%,1.68%,0.50%,2.29%,2.23%
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",winter,0.88%,1.53%,1.00%,0.57%,0.93%
Because of the tinnitus I am more sensitive to environmental noises.,spring,4.39%,2.67%,1.99%,1.14%,1.49%
Because of the tinnitus I am more sensitive to environmental noises.,summer,0.88%,1.83%,1.00%,1.71%,2.23%
Because of the tinnitus I am more sensitive to environmental noises.,autumn,4.39%,2.90%,0.50%,4.00%,2.61%
Because of the tinnitus I am more sensitive to environmental noises.,winter,2.63%,2.14%,1.00%,0.00%,1.68%
Because of the tinnitus it is difficult to concentrate.,spring,0.88%,3.44%,2.49%,2.86%,4.66%
Because of the tinnitus it is difficult to concentrate.,summer,2.63%,2.90%,1.99%,1.14%,2.79%


In [17]:
# export crosstab to excel-csv
# ct.to_csv(p_loc + 'results/01_tables/ct_worst_symptom_season_country_unformatted.csv')

In [18]:
# helper cell to get the number of users per country
ct = pd.crosstab([sub_df['worst_symptom'],sub_df['season']], sub_df.country, normalize = False)
for i,s in zip(ct.columns, ct.sum()):
    print(f'{i} (n={s})')

CA (n=121)
CH (n=114)
DE (n=1310)
GB (n=201)
NL (n=175)
US (n=537)


### Worst symptom by Season

In [19]:
# drop na worst symptom
worst_nan = "Because of the tinnitus I am more irritable with my family, friends and colleagues.I find it harder to relax because of the tinnitus.I have strong worries because of the tinnitus.Because of the tinnitus it is hard for me to get to sleep.Because of the tinnitus it is difficult to concentrate.Because of the tinnitus I am more sensitive to environmental noises.Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.I am feeling depressed because of the tinnitus.I don't have any of these symptoms."
df = df[df.worst_symptom != worst_nan]

In [20]:
# crosstab for statistics
ct = pd.crosstab(df['worst_symptom'], df.season, normalize = 'columns')

In [21]:
ct.style.format('{:.1%}')

season,autumn,spring,summer,winter
worst_symptom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",6.5%,6.3%,6.5%,4.4%
Because of the tinnitus I am more sensitive to environmental noises.,8.3%,8.7%,7.9%,7.8%
Because of the tinnitus it is difficult to concentrate.,12.4%,12.9%,11.5%,10.5%
"Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.",11.2%,11.3%,14.6%,12.7%
Because of the tinnitus it is hard for me to get to sleep.,11.8%,11.3%,10.6%,14.5%
I am feeling depressed because of the tinnitus.,12.3%,9.9%,9.9%,11.8%
I don't have any of these symptoms.,8.3%,9.0%,9.6%,11.4%
I find it harder to relax because of the tinnitus.,17.8%,20.3%,16.8%,16.0%
I have strong worries because of the tinnitus.,11.5%,10.3%,12.6%,10.9%


In [25]:
ct.transpose().describe().style.format('{:.1%}')

worst_symptom,"Because of the tinnitus I am more irritable with my family, friends and colleagues.",Because of the tinnitus I am more sensitive to environmental noises.,Because of the tinnitus it is difficult to concentrate.,"Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.",Because of the tinnitus it is hard for me to get to sleep.,I am feeling depressed because of the tinnitus.,I don't have any of these symptoms.,I find it harder to relax because of the tinnitus.,I have strong worries because of the tinnitus.
count,400.0%,400.0%,400.0%,400.0%,400.0%,400.0%,400.0%,400.0%,400.0%
mean,5.9%,8.2%,11.8%,12.4%,12.1%,11.0%,9.6%,17.7%,11.3%
std,1.0%,0.4%,1.0%,1.6%,1.7%,1.2%,1.3%,1.9%,1.0%
min,4.4%,7.8%,10.5%,11.2%,10.6%,9.9%,8.3%,16.0%,10.3%
25%,5.8%,7.9%,11.2%,11.3%,11.1%,9.9%,8.8%,16.6%,10.7%
50%,6.4%,8.1%,11.9%,12.0%,11.6%,10.9%,9.3%,17.3%,11.2%
75%,6.5%,8.4%,12.5%,13.1%,12.5%,11.9%,10.0%,18.4%,11.8%
max,6.5%,8.7%,12.9%,14.6%,14.5%,12.3%,11.4%,20.3%,12.6%


### Is there a difference between the worst symptoms?

In [38]:
ct = ct[['spring', 'summer', 'autumn', 'winter']]

In [41]:
from scipy.stats import chi2_contingency, f_oneway
import numpy as np
table = np.array(ct)
stat, p, dof, expected = chi2_contingency(table)
N = table.sum().sum()
print(f'X²({dof}, N={N}) = {round(stat,2)}, p={round(p,2)}')

X²(24, N=4.0) = 0.04, p=1.0
