 # Can we observe different worst symptoms between countries?

In [1]:
# add the utils source folder to the import path so the notebook can import utilities.py
# NOTE(review): this is an absolute, machine-specific Windows path; it must be adapted
# when the notebook is run from another checkout location.
import sys
sys.path.append('C:\\Users\\joa24jm\\Documents\\tinnitus-country\\src\\d00_utils')

In [2]:
# project location (root folder of the repository on this machine)
# The data/ and results/ paths in this notebook are built by string concatenation
# with this prefix, so the trailing slash is required.
p_loc = 'C:/Users/joa24jm/Documents/tinnitus-country/'

In [3]:
# imports
import pandas as pd
import utilities as u

In [4]:
# load the baseline answers; the unnamed first CSV column becomes the index
# (it is matched against meta.user_id in the merge further down)
df = pd.read_csv(
    p_loc + 'data/02_intermediate/baseline.csv',
    index_col='Unnamed: 0',
    parse_dates=['4', '9', 'created_at'],
    # these two placeholder strings are treated as missing values
    na_values=['??.??.????', '27.02.2522'],
)
# load the user metadata, which is stored semicolon-separated
meta = pd.read_csv(p_loc + 'data/01_raw/users_metadata.csv', sep=';')


In [5]:
# look up each user's country: match meta.user_id against the index of df.
# how='right' keeps every row of df, even when no metadata entry exists.
df = meta[['user_id', 'country']].merge(
    df,
    how='right',
    left_on='user_id',
    right_index=True,
)

In [6]:
# label every answer with a season, computed from its created_at value
# by the project helper utilities.get_season
df['season'] = df['created_at'].apply(u.get_season)

##### Now we want to get the distribution of the worst symptom for the top ten countries


In [7]:
# the ten most frequent countries (value_counts sorts by descending frequency)
countries = df['country'].value_counts().head(10).index.tolist()

In [8]:
# all answer categories of question '24' except the least frequent one
# (value_counts sorts descending, so the last entry is the rarest).
# NOTE(review): the dropped category is selected purely by position -
# presumably an answer that should be excluded; confirm against the data.
symptoms = df['24'].value_counts().iloc[:-1].index.tolist()

In [9]:
# give question column '24' a descriptive name for better readability
df = df.rename(columns={'24': 'worst_symptom'})

In [10]:
# keep only the rows whose country is one of the selected countries
# and whose worst symptom is one of the selected symptoms
sub_df = df.loc[
    df['country'].isin(countries)
    & df['worst_symptom'].isin(symptoms)
]

In [11]:
# share of each worst symptom within every country:
# normalize='columns' makes each country column sum to 1
ct = pd.crosstab(
    index=sub_df['worst_symptom'],
    columns=sub_df['country'],
    normalize='columns',
)
# render the proportions as percentages (display only, ct itself stays numeric)
ct.style.format('{:,.2%}')

country,AU,CA,CH,DE,ES,FR,GB,IT,NL,US
worst_symptom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",8.45%,0.00%,2.63%,6.49%,9.76%,9.38%,2.99%,8.97%,5.14%,5.96%
Because of the tinnitus I am more sensitive to environmental noises.,7.04%,11.57%,12.28%,9.54%,4.88%,9.38%,4.48%,8.97%,6.86%,8.01%
Because of the tinnitus it is difficult to concentrate.,8.45%,11.57%,6.14%,12.60%,13.41%,3.12%,6.97%,14.10%,13.71%,15.46%
"Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.",12.68%,10.74%,11.40%,14.05%,3.66%,9.38%,14.43%,6.41%,9.71%,16.01%
Because of the tinnitus it is hard for me to get to sleep.,16.90%,9.92%,10.53%,10.61%,12.20%,14.06%,16.42%,11.54%,8.57%,13.78%
I am feeling depressed because of the tinnitus.,19.72%,8.26%,10.53%,7.79%,13.41%,18.75%,18.41%,12.82%,14.29%,9.68%
I don't have any of these symptoms.,9.86%,12.40%,20.18%,10.46%,7.32%,6.25%,4.98%,11.54%,6.29%,8.19%
I find it harder to relax because of the tinnitus.,14.08%,25.62%,19.30%,17.48%,18.29%,17.19%,21.89%,12.82%,21.71%,14.15%
I have strong worries because of the tinnitus.,2.82%,9.92%,7.02%,10.99%,17.07%,12.50%,9.45%,12.82%,13.71%,8.75%


In [12]:
# display the raw column-normalized proportions
# The "scientific" reformatting via u.format_ct is left disabled here,
# presumably so that the table exported afterwards stays unformatted - TODO confirm.
# ct = u.format_ct(ct)
ct

country,AU,CA,CH,DE,ES,FR,GB,IT,NL,US
worst_symptom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",0.084507,0.0,0.026316,0.064885,0.097561,0.09375,0.029851,0.089744,0.051429,0.05959
Because of the tinnitus I am more sensitive to environmental noises.,0.070423,0.115702,0.122807,0.09542,0.04878,0.09375,0.044776,0.089744,0.068571,0.080074
Because of the tinnitus it is difficult to concentrate.,0.084507,0.115702,0.061404,0.125954,0.134146,0.03125,0.069652,0.141026,0.137143,0.154562
"Because of the tinnitus it is difficult to follow a conversation, a piece of music or a film.",0.126761,0.107438,0.114035,0.140458,0.036585,0.09375,0.144279,0.064103,0.097143,0.160149
Because of the tinnitus it is hard for me to get to sleep.,0.169014,0.099174,0.105263,0.106107,0.121951,0.140625,0.164179,0.115385,0.085714,0.137803
I am feeling depressed because of the tinnitus.,0.197183,0.082645,0.105263,0.077863,0.134146,0.1875,0.18408,0.128205,0.142857,0.096834
I don't have any of these symptoms.,0.098592,0.123967,0.201754,0.10458,0.073171,0.0625,0.049751,0.115385,0.062857,0.081937
I find it harder to relax because of the tinnitus.,0.140845,0.256198,0.192982,0.174809,0.182927,0.171875,0.218905,0.128205,0.217143,0.141527
I have strong worries because of the tinnitus.,0.028169,0.099174,0.070175,0.109924,0.170732,0.125,0.094527,0.128205,0.137143,0.087523


In [13]:
# export the symptom-by-country crosstab as a CSV file (readable by Excel);
# the values written are whatever `ct` currently holds
ct.to_csv(p_loc + 'results/01_tables/ct_worst_symptom_country_unformatted.csv')

### Further split up this crosstab using season

In [14]:
# share of every (worst symptom, season) combination within each country.
# normalize='columns' divides by the country total, so the four season rows of
# one symptom add up to that symptom's overall share in the country.
# The seasons are then put into calendar order instead of alphabetical order.
ct = (
    pd.crosstab(
        index=[sub_df['worst_symptom'], sub_df['season']],
        columns=sub_df['country'],
        normalize='columns',
    )
    .reindex(['spring', 'summer', 'autumn', 'winter'], level='season')
)
# render the proportions as percentages (display only, ct itself stays numeric)
ct.style.format('{:,.2%}')
# ct = u.format_ct(ct)


Unnamed: 0_level_0,country,AU,CA,CH,DE,ES,FR,GB,IT,NL,US
worst_symptom,season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",spring,1.41%,0.00%,0.00%,1.91%,4.88%,1.56%,0.50%,5.13%,1.14%,0.93%
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",summer,2.82%,0.00%,0.00%,1.37%,0.00%,3.12%,1.00%,1.28%,1.14%,1.86%
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",autumn,4.23%,0.00%,1.75%,1.68%,4.88%,1.56%,0.50%,1.28%,2.29%,2.23%
"Because of the tinnitus I am more irritable with my family, friends and colleagues.",winter,0.00%,0.00%,0.88%,1.53%,0.00%,3.12%,1.00%,1.28%,0.57%,0.93%
Because of the tinnitus I am more sensitive to environmental noises.,spring,1.41%,2.48%,4.39%,2.67%,0.00%,1.56%,1.99%,2.56%,1.14%,1.49%
Because of the tinnitus I am more sensitive to environmental noises.,summer,2.82%,3.31%,0.88%,1.83%,1.22%,6.25%,1.00%,1.28%,1.71%,2.23%
Because of the tinnitus I am more sensitive to environmental noises.,autumn,1.41%,4.13%,4.39%,2.90%,2.44%,1.56%,0.50%,2.56%,4.00%,2.61%
Because of the tinnitus I am more sensitive to environmental noises.,winter,1.41%,1.65%,2.63%,2.14%,1.22%,0.00%,1.00%,2.56%,0.00%,1.68%
Because of the tinnitus it is difficult to concentrate.,spring,4.23%,2.48%,0.88%,3.44%,4.88%,0.00%,2.49%,2.56%,2.86%,4.66%
Because of the tinnitus it is difficult to concentrate.,summer,1.41%,1.65%,2.63%,2.90%,4.88%,1.56%,1.99%,2.56%,1.14%,2.79%


In [15]:
# export the symptom-by-season-by-country crosstab as a CSV file (readable by Excel);
# the values written are whatever `ct` currently holds
ct.to_csv(p_loc + 'results/01_tables/ct_worst_symptom_season_country_unformatted.csv')

In [26]:
# helper cell to get the number of users per country
# The absolute counts are kept in their own variable instead of rebinding `ct`:
# `ct` holds the normalized crosstab, and overwriting it here would make a
# later re-run of the export cells write raw counts into the proportion tables.
# The same crosstab layout as the seasonal table is used so that n matches the
# rows that actually entered that table.
counts = pd.crosstab(
    [sub_df['worst_symptom'], sub_df['season']],
    sub_df['country'],
    normalize=False,
)
# summing each country column over all (symptom, season) rows gives its sample size
for country, n_users in counts.sum().items():
    print(f'{country} (n={n_users})')

AU (n=71)
CA (n=121)
CH (n=114)
DE (n=1310)
ES (n=82)
FR (n=64)
GB (n=201)
IT (n=78)
NL (n=175)
US (n=537)
