In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.stats.anova as anova
from scipy import stats

# Configuration

Specify the file containing summary discrimination information (i.e. `discriminated` column). Running the `EMAprep` notebook will generate such a file for the relevant surveys (weekly and evening). You can also aggregate them using `EMAaggregation` notebook to get a combined file and scale grouping.

Note: weekly reports of discrimination ask about the incidents that occurred the day before. So affect ratings and stressors of weekly responses reflect the status the day after the discrimination. Substance use questions ask about the day before, so they represent same-day status. On the other hand, evening reports of discrimination ask about the incidents of the same day. Therefore, affect ratings and stressors represent the same-day status, and the substance use reports from the morning of the next day are the relevant ones.

**TO-DO**: this notebook should be rewritten to read from the aggregated data.

In [None]:
# Ask the user for the JSON configuration file that drives the rest of the notebook.
# The text below is passed verbatim to input().
prompt = """\
Specify the **absolute path** of the configuration file containing information about: 

 - the file containing summary discrimination information (i.e. `discriminated` column). \
 Running the `EMAprep` notebook will generate such a file for the relevant surveys (weekly and evening).\
 You can also aggregate them using `EMAaggregation` notebook to get a combined file and scale grouping.
 - output files where various results (tables and figures) are stored.

Note: weekly reports of discimination ask about the incidents that occured the day before.\
 So affect ratings and stressors of weekly responses reflect the status the day after the\
 discrimination. Substance use questions are asking about the day before so they represent\
 same day status. On the other hand, evening reports of discrimination ask about the\
 incidens of the same day. Therefore, affect ratings and stressors represent the same day\
 status. Subsance use of the morning of the next day are relevant.

  
"""
# example of a previously used value: 'emadiscrimination-config-evening.json'
config_file = input(prompt)
print('using configurations specified in', config_file)

In [None]:
# Parse the JSON configuration chosen above.
with open(config_file, 'r') as config_handle:
    config = json.load(config_handle)

# --- inputs ---
survey = config['survey']
data_file = config['data_file']
column_grouping_file = config['column_grouping_file']
_additional = config['additional_files']
sleep_file = _additional['sleep']
step_file = _additional['step']

# --- outputs: tables ---
discrimination_id_file = config['discrimination_id_file']
discrimination_file = config['discrimination_file']

# --- outputs: figures ---
discrimination_frq_figure = config['discrimination_frq_figure']
discrimination_btw_box = config['discrimination_btw_box']
discrimination_dayof_within_box = config['discrimination_dayof_within_box']
discrimination_dayafter_within_box = config['discrimination_dayafter_within_box']

In [None]:
# Echo the configuration so the reader can verify which files are read and written.
print('reading data with summary discrimination information from', data_file, 'for survey', survey)
# Join the additional file paths in their configured order; the previous
# prepend-in-a-loop construction reversed the order and left a trailing ", ".
additional_str = ", ".join(str(path) for path in config['additional_files'].values())
print('the additional files to use for analysis are', additional_str)
print('   participant IDs who reported discrimination are stored in', discrimination_id_file)
print('   reports of discrimination by PID and date are stored in', discrimination_file)
print('   the frequency of different types of discrimination are stored in', discrimination_frq_figure)
print('   the box plot of affect ratings between people who reported discrimination and people who did not is stored in', discrimination_btw_box)
print('   the box plot of affect ratings of people who reported discrimination on days they reported discrimination vs. days they did not is stored in', discrimination_dayof_within_box)
print('   the box plot of affect ratings of people who reported discrimination on days after reports of dicrimination vs. days with no reports of discrimination the prior day is stored in', discrimination_dayafter_within_box)

# Setup

In [None]:
# Load the JSON file that maps each scale name (e.g. 'affect', 'discrimination')
# to the list of response columns belonging to it.
with open(column_grouping_file, 'r') as grouping_handle:
    scale_grouping = json.load(grouping_handle)

In [None]:
# Load the EMA responses (including the summary `discriminated` column) from the CSV
# named in the configuration. Call responses.info() to inspect the columns.
responses = pd.read_csv(filepath_or_buffer=data_file)

# Discrimination Analysis

In [None]:
# TO-DO test of normality

In [None]:
# TO-DO test of homogeneity of variance

In [None]:
def between_discrimination(dependent_column_name, resps):
    """One-way ANOVA of one column between the 'YES' and 'NO' discrimination groups.

    Compares participants who reported discrimination at least once with those
    who did not report discrimination at all.

    Parameters
    ----------
    dependent_column_name : str
        Name of the column in `resps` holding the dependent variable.
    resps : pandas.DataFrame
        Frame with a 'discriminated' column whose values are 'YES' or 'NO'
        and a column named `dependent_column_name`. Rows with any other value
        in 'discriminated' are ignored.

    Returns
    -------
    tuple
        (F, p) as returned by `scipy.stats.f_oneway`. The result is also printed.
    """
    group_labels = resps['discriminated']
    # Missing values are dropped per group: a single NaN would otherwise make
    # f_oneway return NaN for both the statistic and the p-value.
    reported = resps.loc[group_labels == 'YES', dependent_column_name].dropna()
    not_reported = resps.loc[group_labels == 'NO', dependent_column_name].dropna()
    F, p = stats.f_oneway(reported, not_reported)
    # TO-DO evaluate stats.mannwhitneyu(reported, not_reported) as a non-parametric alternative
    print('one-way ANOVA on {}: F = {:.2f}, p = {:.3f}'.format(dependent_column_name, F, p))
    print('*****************')
    return (F, p)

In [None]:
def within_discrimination(dependent_column_name, resps):
    """Repeated-measures ANOVA of one column across the levels of 'discriminated'.

    Compares, for participants who reported discrimination, the days they
    reported discrimination with the days they did not.

    The model uses 'pid' as the subject column and 'discriminated' as the only
    within-subject factor; multiple observations per subject and level are
    aggregated with `np.mean`. The fitted result is printed and returned.
    """
    rm_anova = anova.AnovaRM(
        data=resps,
        depvar=dependent_column_name,
        subject='pid',
        within=['discriminated'],
        aggregate_func=np.mean,
    )
    fitted = rm_anova.fit()
    print('ratings of {}'.format(dependent_column_name))
    print(fitted)
    print('*****************')
    return fitted

In [None]:
def set_box_color(bp, color):
    """Apply a single color to every artist group of a box plot.

    `bp` is indexed by the keys 'boxes', 'whiskers', 'caps', 'fliers' and
    'medians' (in this notebook it is the value returned by `plt.boxplot`).
    Fliers are additionally drawn with a '.' marker.
    """
    for part in ('boxes', 'whiskers', 'caps'):
        plt.setp(bp[part], color=color)
    plt.setp(bp['fliers'], color=color, marker=".")
    plt.setp(bp['medians'], color=color)

In [None]:
# TO-DO effect size calculation

## discrimination statistics

In [None]:
# Split the responses by whether discrimination was reported in that response.
# Explicit copies are taken because a later cell adds a 'discrimination_num'
# column to `unfair_subset`; assigning to a slice of `responses` would
# trigger pandas' SettingWithCopyWarning.
unfair_subset = responses[responses['discriminated'] == 'YES'].copy()
no_unfair_subset = responses[responses['discriminated'] == 'NO'].copy()

In [None]:
# Sorted, unique participant IDs with at least one 'YES' / at least one 'NO' response.
# NOTE(review): this cell reads the 'PID' column while other cells group by 'pid' —
# confirm both columns exist in the responses file and carry the same IDs.
unfair_report_pids = unfair_subset['PID'].sort_values().unique()
no_unfair_report_pids = no_unfair_subset['PID'].sort_values().unique()

# participants with both kinds of responses, only 'NO' responses, and only 'YES' responses
pids_reported_both = np.intersect1d(no_unfair_report_pids, unfair_report_pids)
pids_reported_no_unfair = set(no_unfair_report_pids).difference(unfair_report_pids)
pids_reported_only_unfair = set(unfair_report_pids).difference(pids_reported_both)

print('number of discrimination reports: ', len(unfair_subset))
print('people reporting discrimination', unfair_report_pids)
print('number of people discriminated against: ', len(unfair_report_pids))
print('number of people with no reports of discrimination', len(pids_reported_no_unfair))
print('people who reported discrimination in every survey they responded to', pids_reported_only_unfair)

In [None]:
# Count discrimination reports flagged as long, late, or both.
# assumes 'long' and 'late' are boolean columns — TODO confirm
long_count = unfair_subset['long'].sum()
late_count = unfair_subset['late'].sum()
late_and_long_count = unfair_subset.loc[unfair_subset['late'], 'long'].sum()
print('{} of unfair treatment reports are long'.format(long_count))
print('{} of unfair treatment reports are late'.format(late_count))
print('{} of unfair treatment reports are late and long'.format(late_and_long_count))

In [None]:
# Show the discrimination reports flagged as long for manual inspection.
unfair_subset.loc[unfair_subset['long']]

In [None]:
# Show the discrimination reports flagged as late for manual inspection.
unfair_subset.loc[unfair_subset['late']]

NOTE: take a look at the late and long reports of unfair treatment and decide whether it is fine to include them in further analysis.

- YSS: UW phase I weekly EMA: all are fine
- YSS: UW phase I evening EMA: row 1057 is too long and late; everything else is fine

In [None]:
# Columns describing the kinds of discrimination, without the summary columns
# 'unfair_not' and 'unfair_yesno'. Filtering (instead of list.remove) treats both
# columns the same way and does not raise when either is missing from the grouping.
_summary_columns = ('unfair_not', 'unfair_yesno')
columns_unfair = [column for column in scale_grouping['discrimination']
                  if column not in _summary_columns]


In [None]:
# Number of reports of each discrimination type per participant.
# The discrimination columns are selected BEFORE summing so that unrelated
# (possibly non-numeric) columns are never aggregated; summing the whole frame
# first is slower and fails or concatenates strings on recent pandas versions.
per_pid_unfair_report = unfair_subset.groupby('pid')[columns_unfair].sum()
per_pid_unfair_report['total'] = per_pid_unfair_report[columns_unfair].sum(axis=1)
# TO-DO obtain the exact dates of discrimination for every report and store that alongside this information
per_pid_unfair_report.to_csv(discrimination_id_file)
per_pid_unfair_report.head(10)

In [None]:
# Histogram of discrimination reports over participant ids, using PARTICIPANT_NUM bins.
PARTICIPANT_NUM = 209
plotObj = unfair_subset['pid'].plot.hist(bins=PARTICIPANT_NUM)
plotObj.set_xlabel('participant ids')
print('some people have reported discrimination once but some have reported it multiple times')

In [None]:
# Bar chart of the total number of reports per discrimination type, most frequent first.
report_totals = per_pid_unfair_report[columns_unfair].sum()
ax = report_totals.sort_values(ascending=False).plot(kind='bar')
# drop the first 7 characters of each tick label (the length of an 'unfair_' prefix)
xlabels = [tick.get_text()[7:] for tick in ax.get_xticklabels()]
ax.set_xticklabels(xlabels)
ax.yaxis.grid()
ax.set_ylabel('Number of Reports')
plt.tight_layout()
plt.savefig(discrimination_frq_figure, format='pdf', bbox_inches='tight')
plt.show()

In [None]:
# Number of discrimination types ticked in each report.
# `assign` returns a new frame, so no column is written into a slice of
# `responses` (which would raise pandas' SettingWithCopyWarning).
unfair_subset = unfair_subset.assign(
    discrimination_num=unfair_subset[columns_unfair].sum(axis=1)
)

In [None]:
# Write one row per discrimination report (indexed by pid) to the configured CSV file.
output_columns = ['pid', 'recorded_date', 'discrimination_num', *columns_unfair]
discrimination_reports = unfair_subset[output_columns].set_index('pid')
discrimination_reports.to_csv(discrimination_file)

In [None]:
# TO-DO create plots similar to those in response characteristics for the following slices
# - people who have reported discrimination at least once vs. people who have not reported discrimination
# - for people who have reported discrimination on days with reports of discrimination vs. not

In [None]:
# using aggregated data
#haggregated_file = '/Users/yasaman/UWEXP/analysis-scripts/surveys/results/emaaggregation/aggregated-horizontal-numVal-internalID.csv'
haggregated_file = '/Users/yasaman/UWEXP/analysis-scripts/combined/results/bigtable_UWonly.csv'
data = pd.read_csv(haggregated_file)
unfair_cols = ['unfair_age', 'unfair_appearance', 'unfair_disability', 'unfair_gender', 'unfair_height', 
               'unfair_income', 'unfair_intelligence', 'unfair_learning', 'unfair_major', 'unfair_national', 
               'unfair_orientation', 'unfair_religion', 'unfair_weight']
discrimination_category_breakdown_fig = '/Users/yasaman/UWEXP/analysis-scripts/surveys/results/discrimination/discrimination_category_breakdown.png'

In [None]:
# Exclude the participants listed in `no_post_pids` from the aggregated data.
# NOTE(review): presumably these are participants without a post survey — confirm.
no_post_pids = [
    18, 26, 27, 41, 53, 56, 69, 71, 81, 83, 85, 89, 100, 101, 107, 112, 114,
    119, 121, 129, 131, 133, 135, 139, 141, 147, 152, 164, 182, 192, 197, 200, 208,
]
is_excluded = data['PID'].isin(no_post_pids)
data = data[~is_excluded]

In [None]:
# Bar chart of the total number of reports per discrimination category, most frequent first.
category_totals = data[unfair_cols].sum().sort_values(ascending=False)
ax = category_totals.plot(kind='bar', color=['b'] * len(unfair_cols))
# drop the first 7 characters of each tick label (the length of the 'unfair_' prefix)
xlabels = [tick.get_text()[7:] for tick in ax.get_xticklabels()]
ax.set_xticklabels(xlabels, rotation=45, ha='right')
ax.yaxis.grid()
ax.set_ylabel('Number of Reports')
plt.tight_layout()
plt.savefig(discrimination_category_breakdown_fig, format='png', bbox_inches='tight')
plt.show()

<span style="color:red">TO-DO the following sections should be refactored </span>

## discrimination comparisons: affect

compare affect ratings for:

- individuals who reported discrimination vs. not (btw comparison over avg ratings of each individual in both groups)
- days with and without discrimination for individuals who have experienced discrimination (within comparison over avg ratings for individuals who reported discrimination on days they reported discrimination and on days they didn't)
- days after reports of discrimination and days with no discrimination reported the prior day (within comparisons over avg ratings for individuals who reported discrimination, depending on whether discrimination happened the day before or not)

Between comparisons are indicative of likely chronic correlates of exposure to discrimination, while the within comparisons identify short-term correlations.

In [None]:
# Mean affect ratings per participant, labelled by group:
# - 'YES': participants who reported discrimination at least once
# - 'NO' : participants who did not report any discrimination
between = responses[responses['discriminated'].notnull()]
between_affect_avg = between.groupby(['pid'])[scale_grouping['affect']].mean()
# Label through boolean index masks rather than `.loc[<set>, ...]`: pandas no longer
# accepts a set as an indexer, and masks also tolerate ids missing from the index.
reported_mask = between_affect_avg.index.isin(list(unfair_report_pids))
never_reported_mask = between_affect_avg.index.isin(list(pids_reported_no_unfair))
between_affect_avg.loc[reported_mask, 'discriminated'] = 'YES'
between_affect_avg.loc[never_reported_mask, 'discriminated'] = 'NO'
group_means = between_affect_avg.groupby(['discriminated'])[scale_grouping['affect']].mean()
print(group_means.T)
mean_diff = group_means.loc['YES'] - group_means.loc['NO']
print('the difference in affect ratings in people who experienced discrimination vs. those who did not')
print('NOTE: positive difference means the rating is larger in the group who experienced discrimination')
print('NOTE: ratings range from 1 (not at all) to 5 (extremely) for each item below')
print(mean_diff)

In [None]:
# Run the between-group one-way ANOVA once per affect item.
# TO-DO test assumptions for the applicability of one-way ANOVA
result_between = [
    between_discrimination(affect_item, between_affect_avg)
    for affect_item in scale_grouping["affect"]
]

# NOTE: discrimination reports are not ranked samples so I'm not sure if Mann-Whitney U test is applicable here
# TO-DO search further and consult with a statistician

In [None]:
# One box plot per affect item, split by discrimination group, laid out in two columns.
affect_items = scale_grouping['affect']
grid_rows = len(affect_items) // 2 + 1
axes = between_affect_avg.boxplot(
    column=affect_items,
    by='discriminated',
    figsize=(12, 16),
    layout=(grid_rows, 2),
)

In [None]:
# Side-by-side box plots of per-participant mean affect ratings:
# participants with at least one discrimination report (red) vs. none (blue).
# Rows are selected through boolean index masks rather than `.loc[<set>, ...]`,
# because pandas no longer accepts a set as an indexer.
reported_mask = between_affect_avg.index.isin(list(unfair_report_pids))
never_reported_mask = between_affect_avg.index.isin(list(pids_reported_no_unfair))
discriminated_yes = [between_affect_avg.loc[reported_mask, scale] for scale in scale_grouping["affect"]]
discriminated_no = [between_affect_avg.loc[never_reported_mask, scale] for scale in scale_grouping["affect"]]

plt.figure(figsize=(8,4))

# each affect item owns a slot of width 2; the two groups sit 0.4 left/right of the slot center
bpl = plt.boxplot(discriminated_yes, 
                  positions=np.array(range(len(scale_grouping['affect'])))*2.0-0.4, 
                  sym='', widths=0.6)
bpr = plt.boxplot(discriminated_no, 
                  positions=np.array(range(len(scale_grouping['affect'])))*2.0+0.4, 
                  sym='', widths=0.6)
set_box_color(bpl, '#D7191C') # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')

# draw temporary red and blue lines and use them to create a legend
plt.plot([], c='#D7191C', label='Reported >= 1 Discrimination')
plt.plot([], c='#2C7BB6', label='Reported No Discrimination')
plt.legend(ncol = 2)
plt.ylabel('Affect Ratings')
# tick labels drop the first 5 characters of each affect column name
plt.xticks(range(0, len(scale_grouping['affect']) * 2, 2), 
           [scale[5: ] for scale in scale_grouping['affect']])
plt.xlim(-1, len(scale_grouping['affect'])*2-1)
plt.ylim(0, 5.5)
plt.tight_layout()
plt.savefig(discrimination_btw_box)

<span style="color:red">TO-DO same-day analysis is only applicable to daily data. day-after analysis is applicable to both daily and weekly. proper filtering should be in place to ensure only daily data is used for same-day analysis. also, proper calculations should ensure the next-day values are used from daily data for day-after analysis. </span>

In [None]:
# Mean affect ratings of participants who have both kinds of responses,
# compared between responses with and without a discrimination report.
has_discrimination_answer = responses['discriminated'].notnull()
has_both_kinds = responses['pid'].isin(pids_reported_both)
within = responses[has_discrimination_answer & has_both_kinds]

# per-participant means for each (pid, discriminated) pair, with 'discriminated' moved to the columns
per_pid_condition_means = within.groupby(['pid', 'discriminated'])[scale_grouping['affect']].mean()
within_unstacked = per_pid_condition_means.unstack()

# average over participants: rows are affect items, columns are the 'discriminated' levels
group_means = within_unstacked.mean().unstack(level=-1)
print(group_means)
mean_diff = group_means['YES'] - group_means['NO']
print('the difference in affect ratings in the presence and absence of discrimination')
print('NOTE: positive difference means the rating is larger when discrimination is reported')
print('NOTE: ratings range from 1 (not at all) to 5 (extremely) for each item below')
print(mean_diff)

In [None]:
# Run the repeated-measures ANOVA once per affect item.
# TO-DO test assumptions for the applicability of repeated measure ANOVA
result_within = [
    within_discrimination(affect_item, within)
    for affect_item in scale_grouping['affect']
]

In [None]:
# Side-by-side box plots of per-participant mean affect ratings on responses with a
# discrimination report (red) vs. responses without one (blue).
affect_items = scale_grouping['affect']
discrimination_reported = [within_unstacked[item]['YES'] for item in affect_items]
discrimination_not_reported = [within_unstacked[item]['NO'] for item in affect_items]

plt.figure(figsize=(8, 4))

# each affect item owns a slot of width 2; the two conditions sit 0.4 left/right of the slot center
slot_centers = np.arange(len(affect_items)) * 2.0
bpl = plt.boxplot(discrimination_reported, positions=slot_centers - 0.4, sym='', widths=0.6)
bpr = plt.boxplot(discrimination_not_reported, positions=slot_centers + 0.4, sym='', widths=0.6)
set_box_color(bpl, '#D7191C')  # colors are from http://colorbrewer2.org/
set_box_color(bpr, '#2C7BB6')

# empty line plots exist only to provide legend entries in the two colors
plt.plot([], c='#D7191C', label='Discrimination Reported')
plt.plot([], c='#2C7BB6', label='Discrimination NOT Reported')
plt.legend(ncol=2)
plt.ylabel('Affect Ratings')
# tick labels drop the first 5 characters of each affect column name
tick_labels = [item[5:] for item in affect_items]
plt.xticks(range(0, len(affect_items) * 2, 2), tick_labels)
plt.xlim(-1, len(affect_items) * 2 - 1)
plt.ylim(0, 5.5)
plt.tight_layout()
plt.savefig(discrimination_dayof_within_box)

## discrimination comparisons: alcohol consumption

compare alcohol consumption for:

- individuals who reported discrimination vs. not (btw comparisons of number of reports of alcohol consumption by individuals in each group)
- after reports of discrimination vs. not (within comparison of the number of reports of alcohol consumptions for individuals who reported discrimination on days they reported discrimination and days they didn't)

Between comparisons are indicative of likely chronic correlates of exposure to discrimination, while the within comparisons identify short-term correlations.

In [None]:
# between comparisons
columns_substance = ['stimulant_yesno', 'alcohol_yesno', 'any_drug', 'any_substance']
between = responses[(((responses['survey'] == 'weekly') | (responses['survey'] == 'morning'))
                      & (responses['discriminated'].notnull()))]
between['stimulant_yesno'] = between['stimulant_yesno'].map({2:0, 1:1})
between['alcohol_yesno'] = between['alcohol_yesno'].map({2:0, 1:1})
between_substance_frq = between.groupby(['pid'])[columns_substance].sum()
between_substance_frq.loc[unfair_report_pids, 'discriminated'] = 'YES'
between_substance_frq.loc[pids_reported_no_unfair, 'discriminated'] = 'NO'
group_means = between_substance_frq.groupby(['discriminated'])[columns_substance].mean()
group_means
mean_diff = group_means.loc['YES'] - group_means.loc['NO']
print('the difference in average frequency of substance use in people who experienced discrimination vs. those who did not')
print('NOTE: positive difference means the average frequency is larger in the group who experienced discrimination')
print(mean_diff)

In [None]:
# Run the between-group one-way ANOVA once per substance-use column.
# TO-DO test assumptions for the applicability of one-way ANOVA
result_between = [
    between_discrimination(substance_column, between_substance_frq)
    for substance_column in columns_substance
]

In [None]:
# TO-DO plots for alcohol consumption between groups
# TO-DO alcohol consumption (within comparisons)

## discrimination comparisons: reports of stressors

In [None]:
# TO-DO

## discrimination comparisons: sleep

In [None]:
# Load the sleep data from the additional file named in the configuration.
sleep = pd.read_csv(filepath_or_buffer=sleep_file)

In [None]:
# Label each sleep record with the discrimination group of its participant.
in_reported_group = sleep['PID'].isin(unfair_report_pids)
in_never_reported_group = sleep['PID'].isin(pids_reported_no_unfair)
sleep.loc[in_reported_group, 'discriminated'] = 'YES'
sleep.loc[in_never_reported_group, 'discriminated'] = 'NO'

# list the participants of each group that have no sleep records at all
pids_with_sleep = set(sleep['PID'].unique())
missing_reported = set(unfair_report_pids) - pids_with_sleep
missing_never_reported = pids_reported_no_unfair - pids_with_sleep
print('people with reports of discrimination whose sleep data is unavailable: {}'.format(missing_reported))
print('people with no reports of discrimination whose sleep data is unavailable: {}'.format(missing_never_reported))

In [None]:
# Mean sleep metrics per participant over main-sleep records, labelled by group.
sleep_columns = ['totalTimeInBed', 'totalMinutesAsleep', 'minutesAwake', 'minutesAsleep', 'efficiency']
between_sleep_avg = sleep[sleep['isMainSleep'].eq(True)].groupby(['PID'])[sleep_columns].mean()
# Label through boolean index masks rather than `.loc[<set>, ...]`: pandas no longer
# accepts a set as an indexer, and a participant whose only sleep records are not
# main sleep is absent from this index, which would raise a KeyError with labels.
reported_mask = between_sleep_avg.index.isin(list(unfair_report_pids))
never_reported_mask = between_sleep_avg.index.isin(list(pids_reported_no_unfair))
between_sleep_avg.loc[reported_mask, 'discriminated'] = 'YES'
between_sleep_avg.loc[never_reported_mask, 'discriminated'] = 'NO'
group_means = between_sleep_avg.groupby(['discriminated'])[sleep_columns].mean()
print(group_means.T)
mean_diff = group_means.loc['YES'] - group_means.loc['NO']
print('the difference in sleep metrics in people who experienced discrimination vs. those who did not')
print('NOTE: positive difference means metris are larger in the group who experienced discrimination')
print('all time measures are in minutes')
print(mean_diff)

In [None]:
# Run the between-group one-way ANOVA once per sleep metric.
result_between = [
    between_discrimination(sleep_metric, between_sleep_avg)
    for sleep_metric in sleep_columns
]

## discrimination comparisons: step

In [None]:
# Load the step data from the additional file named in the configuration.
step = pd.read_csv(filepath_or_buffer=step_file)

In [None]:
# Mean step count per participant, labelled by discrimination group.
step_columns = ['steps']
between_step_avg = step.groupby(['PID'])[step_columns].mean()
# Label through boolean index masks rather than `.loc[<set>, ...]`:
# pandas no longer accepts a set as an indexer.
reported_mask = between_step_avg.index.isin(list(unfair_report_pids))
never_reported_mask = between_step_avg.index.isin(list(pids_reported_no_unfair))
between_step_avg.loc[reported_mask, 'discriminated'] = 'YES'
between_step_avg.loc[never_reported_mask, 'discriminated'] = 'NO'
group_means = between_step_avg.groupby(['discriminated'])[step_columns].mean()
print(group_means.T)
mean_diff = group_means.loc['YES'] - group_means.loc['NO']
print('the difference in step metrics in people who experienced discrimination vs. those who did not')
print('NOTE: positive difference means metris are larger in the group who experienced discrimination')
# the note 'all time measures are in minutes' was copied from the sleep section and
# does not apply to step counts, so it is no longer printed here
print(mean_diff)

In [None]:
# Run the between-group one-way ANOVA once per step metric.
result_between = [
    between_discrimination(step_metric, between_step_avg)
    for step_metric in step_columns
]