In [None]:
import pandas as pd
import numpy as np
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell
from IPython.core.display import HTML
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import copy
import json
import csv
import ast
import statsmodels.api as sm
import statsmodels.formula.api as smf

get_ipython().magic(u'matplotlib inline')
%matplotlib inline

# Prepare for the data

In [None]:
# Loads the helpers notebook, and the config files
#%run Helpers.ipynb
#%run LoadData.ipynb

# Per-user settings: where the config files and the survey files live.
# (The with-statement closes each file on exit, so no explicit close() is needed.)
with open("userconfig.json", 'r') as file_obj:
    config = json.load(file_obj)
filedir = config["configdir"]
surveydir = config["surveydir"]

# Load the json files that have the data loading and cleaning info.
# This is currently v1 and that can change if we make tweaks to things
# like the cutoff for non-response.
with open(filedir+config["dataconfig"], 'r') as file_obj:
    dataconfig = json.load(file_obj)
# The data config names the merge config that is used when combining surveys.
with open(filedir+dataconfig["mergeconfig"], 'r') as file_obj:
    mergeconfig = json.load(file_obj)
        
# file = config["surveydir"]+dataconfig["mergedfile"]
# merged = pd.read_csv(file) 

# display(HTML(merged.head(5).to_html()))  

In [None]:
# combine the new and the old ipython
%run LoadData.ipynb
%run LoadData_old.ipynb
%run Helpers.ipynb

In [None]:
# Build the cleaned surveys from the user and data configs.
# load_surveys is not defined in this notebook; it is expected to come from one
# of the notebooks pulled in with %run.
clean_surveys = load_surveys(config, dataconfig)

In [None]:
# Compute the scales on a deep copy, so create_scales never receives the
# clean_surveys object itself.
scaled_surveys = create_scales(config, dataconfig, copy.deepcopy(clean_surveys))

In [None]:
# Shorthand for the "uw_baseline1" entry of the scaled surveys.
a = scaled_surveys["uw_baseline1"]

In [None]:
###### Create an exportable or analyzable summary file that merges baseline 1 and baseline 3.

# Combine the scaled surveys column-wise as described by the merge config.
merged = merge_surveys_by_column(config, dataconfig, mergeconfig, scaled_surveys)

# Preview the first ten merged rows as an HTML table.
preview_html = merged.head(10).to_html()
display(HTML(preview_html))

# Name the index "PID" before the scales are merged in.
merged.index.name = "PID"
merged = merge_scales(config, dataconfig, merged)

# Write the merged table into the survey directory.
merged_path = surveydir + dataconfig["mergedfile"]
merged.to_csv(merged_path)

# XX I had this note: default for discrimination columns should be nan => 0
# XX but I am currently setting it to -1.
print("TODO: determine whether discrimination nans need to be set to 0 and how to do that")


In [None]:
# Reload the data loading/cleaning config. This is currently v1 and that can
# change if we make tweaks to things like the cutoff for non-response.
# NOTE(review): this reads a hard-coded "dataconfig.json" whereas the config cell
# reads config["dataconfig"] -- confirm both are meant to name the same file.
dataconfig_path = filedir + "dataconfig.json"
with open(dataconfig_path, 'r') as file_obj:
    dataconfig = json.load(file_obj)

# Split the merged table on LOC_ALL (1 is treated as CMU and 0 as UW below).
is_cmu = merged["LOC_ALL"] == 1
is_uw = merged["LOC_ALL"] == 0
#merged_cmu = copy.deepcopy(merged)
merged_cmu = merged[is_cmu]
#merged_uw = copy.deepcopy(merged)
merged_uw = merged[is_uw]

df_all, df_cmu, df_uw = merged, merged_cmu, merged_uw

# Derive the named analysis subsets for everyone and for each site.
print("------------generating datasets for all")
datasets = generate_datasets(dataconfig, df_all)
print("------------generating datasets for CMU")
cmu_datasets = generate_datasets(dataconfig, df_cmu)
print("------------generating datasets for UW")
uw_datasets = generate_datasets(dataconfig, df_uw)

In [None]:
# Size (rows, columns) of the UW "CSE" dataset.
uw_datasets["CSE"].shape

In [None]:
# Print the key of every UW dataset whose name contains "CSE".
for dataset_name in uw_datasets:
    if "CSE" not in dataset_name:
        continue
    print(dataset_name)

In [None]:
# Shapes of several UW subsets; most are first restricted to rows with CSE_POST == 1.
uw_all = uw_datasets["all"]
cse_gender2_mask = (uw_all["CSE_POST"] == 1) & (uw_all["Gender_MYU_ALL"] == 2)

subset_shapes = []
for subset_key in ("urm", "firstgen"):
    subset = uw_datasets[subset_key]
    subset_shapes.append(subset[(subset["CSE_POST"] == 1).tolist()].shape)
# direct_CSE is reported as-is, without the CSE_POST filter.
subset_shapes.append(uw_datasets["direct_CSE"].shape)
stars = uw_datasets["stars"]
subset_shapes.append(stars[(stars["CSE_POST"] == 1).tolist()].shape)
subset_shapes.append(uw_all[cse_gender2_mask.tolist()].shape)

print(*subset_shapes)

In [None]:
# Print every column of the UW "all" dataset whose name mentions "cse" (case-insensitive).
cse_columns = [name for name in uw_datasets["all"].columns if "cse" in name.lower()]
for column_name in cse_columns:
    print(column_name)

# Data Analysis

## Discrimination

In [None]:
%run Helpers.ipynb
print(uw_datasets["all"].Gender_MYU_ALL.value_counts())

display(HTML(uw_datasets["all"].head(30).to_html()))

# --- Discrimination reported during EMAs (DISC_ALL), broken down by demographic group ---
split = 'DISC_ALL'
# Column combinations to break the reports down by. Each combination is listed
# exactly once so that no plot is generated twice.
ema_column_groups = [
    ['Gender_MYU_ALL'], ['Sexuality_MID'], ['URM_MYU_ALL'], ['Minority_MYU_ALL'],
    ['Gender_MYU_ALL','Sexuality_MID'],
    ['Gender_MYU_ALL','URM_MYU_ALL'], ['Gender_MYU_ALL','Minority_MYU_ALL'],
    ['Sexuality_MID','URM_MYU_ALL'], ['Sexuality_MID','Minority_MYU_ALL'],
    ['URM_MYU_ALL','FirstGen_MYU_ALL'], ['SpecialClass_International_MID'],
    ['Gender_MYU_ALL','FirstGen_B1'],
    ['Gender_MYU_ALL','Sexuality_MID','URM_MYU_ALL'],
    ['Gender_MYU_ALL','Sexuality_MID','Minority_MYU_ALL'],
]
for cols in ema_column_groups:
    print("divisor: group")
    title = "Percentage of each category in " + describe(mergeconfig, cols,", ") + " reporting discrimination during EMAs at UW"
    print(title)
    labels = get_labels(mergeconfig, cols)
    print(labels)

    # Operate on the frame prep_for_plot hands to the callback (like every other
    # call in this notebook) instead of reaching back to the full dataset.
    table = prep_for_plot(cols+[split],
                          lambda data: percentage_groupby(data, cols, split, labels),
                          uw_datasets["all"])
    if table.empty:
        print("no results")
    else:
        print(table)
        create_bar_plot(table, title, xticks=['Unknown','False','True'],
                        ylabel="percentage", xlabel="Discrimination reported", ylim=(0,1))

# --- Discrimination reported at the end (MLE_DISCRIMINATION_POST), divided by the split total ---
display(HTML(uw_datasets["all"].head(3).to_html()))
split = 'MLE_DISCRIMINATION_POST'
for cols in [['Gender_MYU_ALL','Sexuality_MID'],['FirstGen_MYU_ALL','MYU_Services_STARS_01_POST']]:
    print("divisor: split")
    # The third argument of describe() is used as a short separator everywhere
    # else, so the title suffix is concatenated rather than passed as that argument.
    title = "Percentage of each category in " + describe(mergeconfig, cols,", ") + " reporting discrimination at end at UW"
    print(title)
    labels = get_labels(mergeconfig, cols)
    table = prep_for_plot(cols+[split], lambda data: percentage_groupby(data, cols, split, labels, divisor='split'),
                          uw_datasets["all"])
    create_bar_plot(table, title,  xticks=['Unknown','False','True'],
                    ylabel="percentage", xlabel="Discrimination reported", ylim=(0,1))


In [None]:
#cols = ['Race_EastAsian_B1','Race_SouthAsian_B1','URM_MYU_ALL','FirstGen_B1','Engineer_MID',
#        'TOTAL_ASSAULT_ALL','MYU_Services_STARS_01_POST','DirectAdmit_POST','Sexuality_MID']
cols = ['Gender_MYU_ALL','Sexuality_MID','URM_MYU_ALL']

# One entry per discrimination measure: (split column, title suffix, tick labels).
# NOTE(review): DISC_ALL is labelled ['True','False'] here but
# ['Unknown','False','True'] in other cells -- confirm which order matches the bars.
discrimination_measures = [
    ('DISC_ALL', "reporting discrimination during EMAs", ['True','False']),
    ('MLE_DISCRIMINATION_POST', "reporting discrimination in June", ['Unknown','False','True']),
]
# Each measure is plotted twice per column: once with the default divisor and
# once with divisor='split'.
divisor_variants = [
    ({}, "Discrimination Reported (by cat)"),
    ({'divisor': 'split'}, "Discrimination Reported (by tot)"),
]

for split, title_suffix, tick_labels in discrimination_measures:
    print("doing columns")
    for col in cols:
        labels = get_labels(mergeconfig, [col])
        title = "Fraction" + describe(mergeconfig, [col],",") + title_suffix
        print(title)
        for divisor_kwargs, axis_label in divisor_variants:
            table = prep_for_plot(
                [col, split],
                lambda data: percentage_groupby(data, [col], split, labels, **divisor_kwargs),
                uw_datasets["all"],
            )
            create_bar_plot(table, title, xticks=list(tick_labels),
                            ylabel="fraction", xlabel=axis_label, ylim=(0,1))
    

In [None]:
# %run Helpers.ipynb

# EMA-reported discrimination (DISC_ALL) by gender and sexuality, computed on the
# "engineer_simplegender" UW dataset and exported as a PDF.
split = 'DISC_ALL'
cols = ['Gender_MYU_ALL','Sexuality_MID']
title = ("Percentage of each category in " + describe(mergeconfig, cols,", ")
         + " reporting discrimination during EMAs")
labels = get_labels(mergeconfig, cols)
print(labels)

table = prep_for_plot(
    cols + [split],
    lambda data: percentage_groupby(data, cols, split, labels),
    uw_datasets["engineer_simplegender"],
)
print(table)
ax = create_bar_plot(
    table, title,
    xticks=['Unknown','False','True'],
    ylabel="percentage", xlabel="Discrimination reported", ylim=(0,1),
)

# Resize the figure to 5 x 3 inches (width x height) before saving.
ax.figure.set_size_inches(5, 3)
plt.savefig('gender_sexuality.pdf', format="pdf", bbox_inches="tight")
plt.show()

In [None]:
# EMA-reported discrimination (DISC_ALL) by gender and URM status, using the
# split total as the divisor, exported as a PDF.
cols = ['Gender_MYU_ALL','URM_MYU_ALL']
title = "Percentage of each category in " + describe(mergeconfig, cols,", ") + " reporting discrimination during EMAs"
split = 'DISC_ALL'

labels = get_labels(mergeconfig, cols)
table = prep_for_plot(cols+[split], lambda data: percentage_groupby(data, cols, split, labels, divisor='split'),
                      uw_datasets["all"])
ax = create_bar_plot(table, title, xticks=['True','False'],
                    ylabel="Fraction",
                     xlabel="Discrimination breakdown (by total number reporting)", ylim=(0,1))


ax.figure.set_figheight(3)
ax.figure.set_figwidth(5)
# Save under its own name: 'gender_sexuality.pdf' belongs to the gender x sexuality
# figure and would be overwritten if this figure reused it.
plt.savefig('gender_urm.pdf',format = "pdf", bbox_inches = "tight")
plt.show()

In [None]:
%run Helpers.ipynb
#only engineers
# NOTE(review): tmp_data is the same object as uw_datasets["engineer_simplegender"],
# so the columns added below are written into that shared dataset.
tmp_data = uw_datasets["engineer_simplegender"]

# Combine the MID and POST indicators into one column per event type.
tmp_data['Discrimination'] = tmp_data['MLE_DISCRIMINATION_MID'] + tmp_data['MLE_DISCRIMINATION_POST']
tmp_data['No_Discrimination'] = tmp_data['Discrimination'] == 0
tmp_data['Rel_Violence'] = tmp_data['MLE_GENDER_VIOLENCE_MID'] + tmp_data['MLE_GENDER_VIOLENCE_POST']
tmp_data['Assault'] = tmp_data['MLE_ASSAULT_MID'] + tmp_data['MLE_ASSAULT_POST']

# Grouping columns for the percentage plot.
cols = ['Gender_MFO_MID','URM_B1']
# Level names for the grouping columns; only referenced by the disabled `levels=` argument.
levels = [["Male", "Female"],["Majority","URM"]]

table = prep_for_plot(cols + ['Discrimination'], lambda frame: frame, tmp_data)
plot = plot_percentages(table, cols, "Percentage reporting discrimination", ['Discrimination'])  #, levels=levels)




In [None]:
# Engineers only: discrimination reported during EMAs (DISC_ALL) by gender, URM
# and first-generation status.
cols = ['Gender_MYU_ALL','URM_MYU_ALL','FirstGen_MYU_ALL']

title = "Engineers only: Percentage of each category in " + describe(mergeconfig,cols,", ") + " reporting discrimination during EMAs"
split = 'DISC_ALL'
labels = get_labels(mergeconfig, cols)
table = prep_for_plot(cols+[split], lambda data: percentage_groupby(data, cols, split, labels),
                      uw_datasets["engineer_simplegender"])
ax = create_bar_plot(table, title, xticks=['False','True'],
                    ylabel="percentage", xlabel="Discrimination reported", ylim=(0,1))


# Engineers only: discrimination reported on the POST survey
# (MLE_DISCRIMINATION_POST) by gender and URM status. The title says "in June",
# matching the other MLE_DISCRIMINATION_POST plots, rather than repeating the
# EMA wording of the plot above.
cols = ['Gender_MYU_ALL','URM_MYU_ALL']

title = "Engineers only: Percentage of each category in " + describe(mergeconfig,cols,", ") + " reporting discrimination in June"
split = 'MLE_DISCRIMINATION_POST'
#labels = get_labels(mergeconfig, cols)
table = prep_for_plot(cols+[split], lambda data: percentage_groupby(data, cols, split),#, labels),
                      uw_datasets["engineer_simplegender"])
ax = create_bar_plot(table, title, #xticks=['False','True'],
                    ylabel="percentage", xlabel="Discrimination reported", ylim=(0,1))
    

In [None]:
np.array(per) * 91 / 100

In [None]:
x = np.arange(4)
per = [5,20,8.3,16.7]
plt.bar(x, per, color=["white", "white", "white", "white"], edgecolor = ["black","black"], linewidth = 2)
plt.grid(axis = "y")
plt.ylim((0,24))
# plt.title("% of discrimination in the last year of engineers\nby under-represented minority status and gender")
plt.xticks([0,1,2,3],[""])
# plt.ylabel("% among engeineers")
plt.show()

### Demographics

In [None]:
%run Helpers.ipynb
# Summary statistics of the UW dataset. A bare expression in the middle of a
# cell is discarded, so it is displayed explicitly.
display(uw_datasets["all"].describe())

#cols = ['Race_EastAsian_B1','Race_SouthAsian_B1','URM_MYU_ALL','FirstGen_B1','Engineer_MID',
#        'TOTAL_ASSAULT_ALL','MYU_Services_STARS_01_POST','DirectAdmit_POST','Sexuality_MID']
cols = ['Gender_MYU_ALL','Minority_MYU_ALL','URM_MYU_ALL','FirstGen_MYU_ALL','Engineer_MID',
        'College_Engineering_MYU_ALL',
       'MYU_Services_STARS_01_POST','DirectAdmit_POST','Sexuality_MID']

# One bar plot per demographic column showing the share of each answer at UW.
print("doing columns")
for col in cols:
    labels = get_labels(mergeconfig, [col])
    print(labels)
    title = "Percentage of each answer in " + describe(mergeconfig, [col]," ")
    print(title)
    table = prep_for_plot([col], lambda data: percentage_for_values(data, [col], order=labels[col]),
                          uw_datasets["all"])
    #table.index = table.index.values.map(lambda x: labels[x])
    print("result: ")
    print(table)
    xticks = list(labels[col].values())
    # Bind the result to `ax`, not `plt`: rebinding `plt` would shadow
    # matplotlib.pyplot for every later cell.
    ax = create_bar_plot(table, title, xticks=xticks)
 

In [None]:
%run Helpers.ipynb
# Summary statistics of the CMU dataset. A bare expression in the middle of a
# cell is discarded, so it is displayed explicitly.
display(cmu_datasets["all"].describe())

#cols = ['Race_EastAsian_B1','Race_SouthAsian_B1','URM_MYU_ALL','FirstGen_B1','Engineer_MID',
#        'TOTAL_ASSAULT_ALL','MYU_Services_STARS_01_POST','DirectAdmit_POST','Sexuality_MID']
cols = ['Gender_MYU_ALL','Minority_MYU_ALL','URM_MYU_ALL','FirstGen_MYU_ALL','College_Engineering_MYU_ALL',
        'College_Business_MYU_ALL', 'College_AS_MYU_ALL']

# One bar plot per demographic column showing the share of each answer at CMU.
print("doing columns")
for col in cols:
    labels = get_labels(mergeconfig, [col])
    title = "Percentage of each answer in " + describe(mergeconfig, [col]," ")
    print(title)
    table = prep_for_plot([col], lambda data: percentage_for_values(data, [col], order=labels[col]),
                         cmu_datasets["all"])
    #table.index = table.index.values.map(lambda x: labels[x])
    print("result: ")
    print(table)
    xticks = list(labels[col].values())
    # Bind the result to `ax`, not `plt`: rebinding `plt` would shadow
    # matplotlib.pyplot for every later cell.
    ax = create_bar_plot(table, title, xticks=xticks)

#### Comparing UW and CMU

In [None]:
%run Helpers.ipynb


# Re-read the merge config so edits to the file are picked up without a restart.
# (The with-statement closes the file; no explicit close() is needed.)
configdir = config["configdir"]
with open(configdir+dataconfig["mergeconfig"], 'r') as file_obj:
    print(configdir)
    mergeconfig = json.load(file_obj)

#cols = ['Race_EastAsian_B1','Race_SouthAsian_B1','URM_MYU_ALL','FirstGen_B1','Engineer_MID',
#        'TOTAL_ASSAULT_ALL','MYU_Services_STARS_01_POST','DirectAdmit_POST','Sexuality_MID']
cols = ['Gender_MYU_ALL','Minority_MYU_ALL','URM_MYU_ALL','FirstGen_MYU_ALL',
       'College_Engineering_MYU_ALL', 'College_Business_MYU_ALL', 'College_AS_MYU_ALL']
display(HTML(datasets["all"].head(2).to_html()))

# For each demographic column, compare the two universities (LOC_ALL) twice:
# once with the default divisor and once with divisor='split'.
split = 'LOC_ALL'
print("doing columns")
for col in cols:
    labels = get_labels(mergeconfig, [col])
    # The third argument of describe() is used as a short separator everywhere
    # else, so the title phrase is concatenated rather than passed as that argument.
    title = "Percentage of each category in " + describe(mergeconfig, [col]," ") + " at each university"
    print(title)
    table = prep_for_plot([col,split], lambda data: percentage_groupby(data, [col], split, labels, min_pple=0),
                          datasets["all"])
    create_bar_plot(table, title,  xticks=["UW","CMU"],
                    ylabel="percentage", xlabel="University", ylim=(0,1))
    table = prep_for_plot([col,split], lambda data: percentage_groupby(data, [col], split, labels, min_pple=0, divisor='split'),
                          datasets["all"])
    create_bar_plot(table, title,  xticks=["UW","CMU"],
                    ylabel="percentage", xlabel="University", ylim=(0,1))

#### Sexual Assault

In [None]:
%run 'Helpers.ipynb'

# Re-read the merge config so edits to the file are picked up without a restart.
# (The with-statement closes the file; no explicit close() is needed.)
configdir = config["configdir"]
with open(configdir+dataconfig["mergeconfig"], 'r') as file_obj:
    print(configdir)
    mergeconfig = json.load(file_obj)

# Report the same three figures for both institutions together and for UW alone.
assault_scopes = [
    ("total sexual assaults at both institutions: ", datasets["all"]),
    ("total sexual assaults at UW: ", uw_datasets["all"]),
]
for heading, frame in assault_scopes:
    print(heading)
    # Series.sum() is what .agg(sum) resolved to; passing the builtin is deprecated.
    print(frame['TOTAL_ASSAULT_ALL'].sum())
    # Count rows equal to 1 directly: value_counts()[1] raises KeyError when no
    # row has the value 1, whereas this yields 0.
    print("Pre Study sexual assaults: ")
    print(int((frame['ASSAULT_PRE_ALL'] == 1).sum()))
    print("During Study sexual assaults:")
    print(int((frame['ASSAULT_POST_ALL'] == 1).sum()))


#### Percentage in each college

In [None]:

# Share of UW respondents in each college, collected into one table.
print("doing colleges")
cols = ['College_AS_MYU_ALL',
        'College_Business_MYU_ALL',
        'College_Engineering_MYU_ALL']

xticks = ['AS','Business', 'Engineering']
# Rows are the possible answer values (-1 and 1); one column per college.
fullTable = pd.DataFrame(columns = cols, index=[-1,1])
for col in cols:
    res = prep_for_plot([col,'LOC_ALL'], lambda data: percentage_for_values(data, [col]),
                        uw_datasets["all"])
    fullTable[col] = res
    print("Res")
    print(res)

# Keep only the row for answer value 1 and plot one bar per college.
fullTable = fullTable.loc[[1], :]
print(fullTable)
# Bind the result to `ax`, not `plt`: rebinding `plt` would shadow
# matplotlib.pyplot for every later cell.
ax = create_bar_plot(fullTable.T, "Colleges", xticks=xticks)


### UW Experience

#### STRESS: Phase I CMU Slide 15: How Stressful is being a student at UW?

In [None]:
# Grouping columns whose labels are looked up in the merge config.
cols = ['Minority_MYU_ALL', 'College_Engineering_MYU_ALL','FirstGen_MYU_ALL', 
        'MYU_Services_STARS_01_POST','DirectAdmit_POST','DISC_ALL','LOC_ALL']

# Fetch the labels for those columns via get_labels and print them for reference.
value_category_map = get_labels(mergeconfig, cols)
print(value_category_map)

#### Slide 8 (High Levels of Stress for All…but better for engineers than other students!)

In [None]:
# Box plot of the PRE and POST stress items for UW.
cols = ['MYU_STRESS_PRE_ALL', 'MYU_STRESS_POST_ALL']
table = prep_for_plot(cols, lambda frame: frame, uw_datasets["all"])
plot_box(table, "How stressful is being a student at UW?", xticks=['Jan','June'])

# break this out by status
# index_cols = ['Minority_B1','Engineer_MID','FirstGen_B1',
#               'MYU_Services_STARS_01_POST','DirectAdmit_POST','DISC']
value_cols = ['MYU_STRESS_POST_ALL']

# Keyword arguments shared by every grouped stress bar chart in this cell.
stress_bar_options = dict(plot_type="bar", remove_neg_value=True,
                          ymin=1, ymax=7.9, ylabel="Stress Level")

# Re-import so `plt` names matplotlib.pyplot again in case an earlier cell rebound it.
import matplotlib.pyplot as plt

# Both universities together, additionally grouped by LOC_ALL.
index_cols = ["College_Engineering_MYU_ALL", "DirectAdmit_POST", "MYU_Services_STARS_01_POST", 'LOC_ALL']
plot_group(df_all, value_cols, index_cols, mergeconfig,
           title="Stress Level at CMU and UW", **stress_bar_options)

# UW only.
index_cols = ["College_Engineering_MYU_ALL", "DirectAdmit_POST", "MYU_Services_STARS_01_POST"]
table = prep_for_plot(index_cols + value_cols, lambda frame: frame, uw_datasets["all"])
print("**************** UW ONLY ****************")
plot_group(table, value_cols, index_cols, mergeconfig,
           title="Stress Level at UW", **stress_bar_options)

# CMU only, grouped by the engineering-college column alone.
index_cols = ["College_Engineering_MYU_ALL"]
table = prep_for_plot(index_cols + value_cols, lambda frame: frame, cmu_datasets["all"])
print("**************** CMU ONLY ****************")

plot_group(table, value_cols, index_cols, mergeconfig,
           title="Stress Level at CMU", **stress_bar_options)


#### Second half of slide 9  (But not when we exclude direct admits and STARS students!)

In [None]:
value_cols = ['MYU_STRESS_POST_ALL']

# Keyword arguments shared by both grouped bar charts below.
stress_bar_options = dict(plot_type="bar", remove_neg_value=True,
                          ymin=1, ymax=7.9,
                          title="Stress Level of ", ylabel="Stress Level")

# UW "engineering" dataset, grouped by direct-admit and STARS columns.
index_cols = ["DirectAdmit_POST", "MYU_Services_STARS_01_POST"]
axes = plot_group(uw_datasets["engineering"], value_cols, index_cols, mergeconfig,
                  **stress_bar_options)

# UW "stars_or_da" dataset, grouped by the engineering-college column.
index_cols = ['College_Engineering_MYU_ALL']
axes = plot_group(uw_datasets["stars_or_da"], value_cols, index_cols, mergeconfig,
                  **stress_bar_options)


### Slide 20 (STARS Students Feel Less Depression than similar students not in STARS!)

In [None]:
from statsmodels.stats import weightstats
#print(uw_datasets["firstgen"].count())
#print(uw_datasets["stars"].count())


def compare_group_means(focal, baseline, columns, tick_labels, title_suffix):
    """Plot and t-test each column's mean for two groups of respondents.

    For every column a two-bar chart (baseline first, focal second) is drawn
    with the group standard deviations as error bars, followed by a printed
    unequal-variance t-test of focal vs. baseline.

    Parameters
    ----------
    focal, baseline : DataFrame
        The two groups to compare; both must contain every column in `columns`.
    columns : list of str
        Measure columns to compare.
    tick_labels : list of str
        Bar labels, ordered [baseline label, focal label].
    title_suffix : str
        Text appended to the column name to form the plot title.
    """
    # The means and the standard deviations share this index, so pandas matches
    # each error bar to the bar of the same group. (A std frame left on the
    # default 0/1 index in [focal, baseline] order gets paired with the wrong bars.)
    group_index = pd.Index([0, 1], name="grp")
    for col in columns:
        focal_values = focal.loc[:, col]
        baseline_values = baseline.loc[:, col]

        means = pd.DataFrame({"Name": [baseline_values.mean(), focal_values.mean()]},
                             index=group_index)
        stds = pd.DataFrame({"Name": [baseline_values.std(), focal_values.std()]},
                            index=group_index)

        fig, ax = plt.subplots()
        means.plot.bar(yerr=stds, ax=ax, rot=0, grid=True,
                       color=["white", "white"], edgecolor=["black", "black"],
                       linewidth=2, capsize=5)
        ax.set_xlabel("")
        ax.set_xticklabels(tick_labels)
        ax.xaxis.grid()
        ax.set_title(col + title_suffix)
        ax.set_ylabel(col)
        # ax.set_yticklabels(["Not at all : 1", "2", "3", "Somewhat : 4", "5", "6", "Extreme : 7"])
        ax.legend().set_visible(False)

        left = focal_values.dropna().tolist()
        right = baseline_values.dropna().tolist()
        larger = weightstats.ttest_ind(left, right, alternative='larger', usevar="unequal")
        smaller = weightstats.ttest_ind(left, right, alternative='smaller', usevar="unequal")
        print(col)
        # NOTE(review): reporting whichever one-sided test has the smaller
        # p-value picks the direction after seeing the data -- confirm intended.
        t_value, p_value = (larger if larger[1] < smaller[1] else smaller)[:2]
        print("t-test with unequal variance: t-value =", round(t_value,2), "p-value =", round(p_value,2))


cols = ["MYU_STRESS_POST_ALL","PSS_POST_ALL","BDI_II_POST","BDI_II_Suicidality_POST","CES_D_POST_ALL",
        "UCLA_Loneliness_POST_ALL","ISEL_APPRAISEL_POST","ISEL_TANGIBLE_POST","ISEL_BELONGING_POST"]

# STARS students vs. first-generation students not in STARS.
compare_group_means(uw_datasets["stars"], uw_datasets["firstgen_nostar"], cols,
                    ["First Gen not in STARS", "STARS"],
                    " between First Generation Engineering Students and STARS")

# Direct-admit engineers vs. engineers who were not direct admits.
compare_group_means(uw_datasets["direct_engineer"], uw_datasets["engineer_nodirect"], cols,
                    ["not Direct Admit", "Direct Admit"],
                    " between non Direct Admits and Direct Admits in Engineering")



### IDENTITY Phase I CMU Slide 20: Being a UW student is an important part of who I am

In [None]:
# Box plot of the identity item across the three survey waves (all respondents).
cols = ['MYU_IDENTITY_B2', 'MYU_IDENTITY_MID','MYU_IDENTITY_POST']
identity_scores = prep_for_plot(cols, lambda frame: frame, datasets["all"])
plot_box(identity_scores,
         "Being a UW student is an important part of who I am",
         xticks=['Pre','Mid','Post'])

# break this out by status
index_cols = [
    'Minority_MYU_ALL', 'College_Engineering_MYU_ALL', 'FirstGen_MYU_ALL',
    'MYU_Services_STARS_01_POST', 'DirectAdmit_POST', 'DISC_ALL',
]
value_cols = ['MYU_IDENTITY_MID']
plot_group(datasets["all"], value_cols, index_cols, mergeconfig,
           remove_neg_value=False, plot_type="box")

### DRAINED: Phase I CMU Slide 36: Do you ever feel emotionally drained by your work at UW?

In [None]:
# The three-wave box plot is currently disabled:
# cols = ['MYU_DRAINED_B2', 'MYU_DRAINED_MID','MYU_DRAINED_POST']
# plot_box(prep_for_plot(cols, lambda x:x, df),  "Do you ever feel emotionally drained by your work at UW?",
#          xticks=['Pre','Mid','Post'])

# break this out by status
# index_cols = ['Minority_B1','Engineer_MID','FirstGen_B1',
#               'MYU_Services_STARS_01_POST','DirectAdmit_POST','DISC']
index_cols = ['MYU_Services_STARS_01_POST']

# POST "drained" item for all respondents, grouped by the STARS column.
cols = ['MYU_DRAINED_POST']
plot_group(
    datasets["all"], cols, index_cols, mergeconfig,
    remove_neg_value=True, plot_type="bar",
    ylabel="Drained Level", title="Feeling of Drained",
)
# for col in index_cols:
#     df2 = df[cols + [col]]
#     #print(df2[col].value_counts())
#     #print(df2.head())
#     df2 = df2.loc[df2[col] >= 0]
#     df2.boxplot(by=col)

#XX do engineer vs non engineer?

### UCLA Loneliness (Slide 10: Similar result for Loneliness)

In [None]:
# Depression... 
%run Helpers.ipynb

# PRE vs. POST UCLA loneliness for UW: summary, paired t-test and box plot.
cols = ['UCLA_Loneliness_PRE_ALL', 'UCLA_Loneliness_POST_ALL']
lonely = prep_for_plot(cols, lambda frame: frame, uw_datasets["all"])
print(lonely.describe())
print(ttest_rel(lonely['UCLA_Loneliness_PRE_ALL'], lonely['UCLA_Loneliness_POST_ALL']))

plot_box(lonely, "UCLA Loneliness Scale", xticks=['Pre','Post'])

# break this out by status
# index_cols = ['Minority_B1','Engineer_MID','FirstGen_B1',
#               'MYU_Services_STARS_01_POST','DirectAdmit_POST','DISC']
# XX do engineer vs non engineer?

# Keyword arguments shared by every grouped loneliness bar chart.
loneliness_bar_options = dict(remove_neg_value=True, plot_type="bar",
                              ylabel="Loneliness", ymax=65)

# (dataset collection, dataset key, value columns, grouping columns, plot title);
# datasets are looked up just before plotting, in this order.
loneliness_breakdowns = [
    (uw_datasets, "all", ['UCLA_Loneliness_POST_ALL'], ['MYU_Services_STARS_01_POST'], "UW Only"),
    (datasets, "all", ['UCLA_Loneliness_PRE_ALL', 'UCLA_Loneliness_POST_ALL'], ['LOC_ALL'], ""),
    (datasets, "all", ['UCLA_Loneliness_POST_ALL'], ['Engineer_MID'], ""),
    (uw_datasets, "all", ['UCLA_Loneliness_POST_ALL'], ['College_Engineering_MYU_ALL'], "UW only: "),
    (uw_datasets, "engineer_nodirect_nostars", ['UCLA_Loneliness_POST_ALL'], ['Engineer_MID'],
     "Non Stars/DA only at UW:"),
    (uw_datasets, "stars_or_da", ['UCLA_Loneliness_POST_ALL'], ['Engineer_MID'],
     "Stars or Direct Admit: "),
]
for collection, dataset_key, cols, index_cols, panel_title in loneliness_breakdowns:
    plot_group(collection[dataset_key], cols, index_cols, mergeconfig,
               title=panel_title, **loneliness_bar_options)

# The final breakdown stays a bare expression so it remains the cell's last statement.
index_cols = ['Engineer_MID']
cols = ['UCLA_Loneliness_POST_ALL']
plot_group(uw_datasets["da"], cols, index_cols, mergeconfig,
           title="Direct Admit only:", **loneliness_bar_options)


### Social Media

#### Check Social Media: Phase I CMU Slide 44 Data

In [None]:
%run Helpers.ipynb
#'SMQU_Facebook_MID', 'SMQU_Facebook_B1', 'SMQU_Instagram_MID', 'SMQU_Instagram_B1', 'SMQU_Snapchat_MID', 'SMQU_Snapchat_B1',
#'SMQP_Facebook_MID', 'SMQP_Facebook_B1','SMQP_Instagram_MID', 'SMQP_Instagram_B1', 'SMQP_Snapchat_MID', 'SMQP_Snapchat_B1',
#'SMCOPE_boredom_MID', 'SMCOPE_boredom_B1',  'SMCOPE_break_MID', 'SMCOPE_break_B1',
#'SMCOPE_stress_MID', 'SMCOPE_stress_B1', 'SMCOPE_support_MID', 'SMCOP_support_B1']

# first get percentages using each site
cols = ['SMQU_Facebook_B1', 'SMQU_Instagram_B1', 'SMQU_Snapchat_B1']
xticks = ['Facebook Checks', 'Instagram Checks', 'Snapchat Checks']
ax = plot_thresholds(prep_for_plot(cols, lambda x: x, datasets["all"]), "Social Media Checking",
         xticks = xticks, bins=20)
# vertical lines indicating the thresholds
ax.axvline(x=3)
ax.axvline(x=5)


# Bin the raw answers into three frequency categories (coded 1, 2, 3).
xlabels = {"Daily or more":1, "Weekly":2, "Less":3}
print("---------------")
bins = prep_for_plot(cols, lambda data: answer_assign_bins(data, cols, [0,3, 5, 7], list(xlabels.values())),
                    datasets["all"])

# extract cols and don't modify for now
table = prep_for_plot(cols, lambda data: percentage_for_values(data, cols,order=xlabels),
                      bins)

# After the transpose there is one row per site, so the ticks are the site
# names defined in this cell (as in the posts cell). `labels` is not defined
# here and would hold whatever an unrelated earlier cell left in it.
# The result is bound to `ax`, not `plt`, so matplotlib.pyplot is not shadowed.
ax = create_bar_plot(table.T, "Social Media Use: Checks", xticks=xticks)

#### Post Social Media: Phase I CMU Slide 44 Data

In [None]:
#'SMQU_Facebook_MID', 'SMQU_Facebook_B1', 'SMQU_Instagram_MID', 'SMQU_Instagram_B1', 'SMQU_Snapchat_MID', 'SMQU_Snapchat_B1',
#'SMQP_Facebook_MID', 'SMQP_Facebook_B1','SMQP_Instagram_MID', 'SMQP_Instagram_B1', 'SMQP_Snapchat_MID', 'SMQP_Snapchat_B1',
#'SMCOPE_boredom_MID', 'SMCOPE_boredom_B1',  'SMCOPE_break_MID', 'SMCOPE_break_B1',
#'SMCOPE_stress_MID', 'SMCOPE_stress_B1', 'SMCOPE_support_MID', 'SMCOP_support_B1']

# first get percentages using each site
cols = ['SMQP_Facebook_B1', 'SMQP_Instagram_B1', 'SMQP_Snapchat_B1']
xticks = ['Facebook Posts',
          'Instagram Posts', 'Snapchat Posts']
ax = plot_thresholds(prep_for_plot(cols, lambda x:x, datasets["all"]), "Social Media Posts",
         xticks = xticks, bins=20)
# vertical lines indicating the thresholds
ax.axvline(x=3)
ax.axvline(x=5)


# Bin the raw answers into three named frequency categories.
xlabels = ["Daily or more", "Weekly", "Less"]
print("---------------")
smu = prep_for_plot(cols, lambda data: answer_assign_bins(data, cols, [0,3, 5, 7], xlabels), datasets["all"])
# extract cols and don't modify for now
table = prep_for_plot(cols, lambda data: percentage_for_values(data, cols, order=xlabels), smu)
# Bind the result to `ax`, not `plt`: rebinding `plt` would shadow
# matplotlib.pyplot for every later cell.
ax = create_bar_plot(pd.DataFrame(table).T, "Social Media Use: Posts", xticks=xticks)

In [None]:
%run Helpers.ipynb

# first get percentages using each site
cols = ['SMCOPE_boredom_B1', 'SMCOPE_boredom_MID', 'SMCOPE_boredom_POST',
        'SMCOPE_break_B1', 'SMCOPE_break_MID', 'SMCOPE_break_POST',
         'SMCOPE_stress_B1', 'SMCOPE_stress_MID', 'SMCOPE_stress_POST']
xticks = ['Boredom (pre)', 'Boredom (mid)','Boredom (post)',
          'Study break (pre)', 'Study break (mid)', 'Study break (post)',
          '> Stress (pre)', '> Stress (mid)', '> Stress (post)']
xlabels = ["Agree/Agree Strongly", "Neutral/Don't Agree"]
print("plot thresholds")
ax = plot_thresholds(prep_for_plot(cols, lambda x: x, datasets["all"]), "Social Media Coping",
         xticks = xticks, bins=20)
# vertical lines indicating the thresholds
# NOTE(review): x=21 lies far outside the [-1, 4, 6] bin edges used below -- confirm.
ax.axvline(x=4)
ax.axvline(x=21)
print("prepping bins")
smcope = prep_for_plot(cols, lambda data: answer_assign_bins(data, cols, [-1,4, 6], xlabels), datasets["all"])
print("---------------")
print(smcope.head(2))

# extract cols and don't modify for now
table = prep_for_plot(cols, lambda data: percentage_for_values(data, cols, order=xlabels), smcope)
# This chart shows the coping items, so it is titled accordingly rather than
# reusing the "Posts" title. The result is bound to `ax`, not `plt`, so
# matplotlib.pyplot is not shadowed for later cells.
ax = create_bar_plot(pd.DataFrame(table).T, "Social Media Use: Coping", xticks=xticks)



In [None]:
%run Helpers.ipynb
# defaultdict is not among the notebook's top-level imports, so import it here.
from collections import defaultdict

# extract cols and don't modify for now
cols = ['SMCOPE_support_B1', 'SMCOPE_support_MID']

# Answer code -> label; any code not listed maps to 'NA'.
x_labels = defaultdict(lambda: 'NA',{6:'> 1/day', 5:'Daily', 3:'Weekly', 4:'A few times a week', 2:'Less than Weekly', 1:'Never'})

# Pre: replace() returns a new object, so keep the result; otherwise the -1 -> 0
# substitution never reaches the plot.
table = prep_for_plot(cols, lambda data: answer_percentages(data, cols[0], labels=x_labels), datasets["all"])
table = table.replace(-1, 0)
create_bar_plot(pd.DataFrame(table), "Social Support, Pre", rot=45)

# Mid: same treatment for the second column.
table = prep_for_plot(cols, lambda data: answer_percentages(data, cols[1], labels=x_labels), datasets["all"])
table = table.replace(-1, 0)
create_bar_plot(pd.DataFrame(table), "Social Support, Mid", rot=45)

#xx need to make y axis the same for both...


### Depression and Stress

#### BDI II

In [None]:
# Print the shape of each UW sub-cohort frame on a single line
# (these uw_datasets entries replaced the older df_uw_* variables).
cohort_keys = ("engineer", "stars", "urm_nostar", "direct_engineer",
               "engin_else", "firstgen_engineer", "hope")
print(*[uw_datasets[cohort].shape for cohort in cohort_keys])

In [None]:
# Split the UW "engineer" frame into two complementary subsets:
#   * rows flagged as STARS or as direct admit (either flag == 1)
#   * rows where neither flag equals 1
a = uw_datasets["engineer"]
is_stars = a.MYU_Services_STARS_01_POST == 1
is_direct_admit = a.DirectAdmit_POST == 1
uw_datasets["direct_engineer_include_star"] = a[is_stars | is_direct_admit]
uw_datasets["hope_engineer_no_star"] = a[~is_stars & ~is_direct_admit]

In [None]:
# Depression: BDI suicidality item, pre vs post.
# %run Helpers.ipynb

cols = [
    'BDI_Suicidality_PRE_ALL',
    'BDI_Suicidality_POST_ALL',
]
# cols = ['BDI_II_Suicidality_MID']

# Answer code -> full wording of the response option (-1 is the 'NA' code).
x_labels = {
    -1: 'NA',
    0: 'No thoughts of killing myself',
    1: 'Thoughts of killing myself',
    2: 'I would like to kill myself',
    3: 'I would kill myself if I had the chance',
}

# How often each answer code occurs in the post column across all UW rows.
print(uw_datasets["all"]["BDI_Suicidality_POST_ALL"].value_counts())

In [None]:
import textwrap

# Frames to chart, paired with the name printed above each chart.
# (older variant iterated df_all, df_cmu, df_uw, df_engineer, df_engineering with the
#  names "All", "CMU", "UW", "UW Engineer Hope", "UW College of Engineering")
cohort_frames = [uw_datasets["CSE"],
                 uw_datasets["CSE stars_or_da"],
                 uw_datasets["CSE_nodirect_nostars"]]
cohort_names = ["UW", "UW Direct Admit Engineer", "UW Hope Engineer"]

# Short answer wording used as category labels in the charts.
x_labels = {-1: 'No Answer', 0: 'No thoughts', 1: 'Thoughts',
            2: 'Would like', 3: 'Would try'}

for df_tmp, type_tmp in zip(cohort_frames, cohort_names):

    # Raw size and answer counts before anything is converted to percentages.
    print(df_tmp.shape)
    print(df_tmp["BDI_Suicidality_PRE_ALL"].value_counts())
    print(df_tmp["BDI_Suicidality_POST_ALL"].value_counts())

    df_tmp = df_tmp[cols]

    # Pre: answer percentages for cols[0]; the 'id' column becomes "Jan Percent".
    table1 = prep_for_plot(
        cols,
        lambda data: answer_percentages(data, cols[0], labels=x_labels),
        df_tmp,
    ).rename(index=str, columns={'id': "Jan Percent"})
    print(table1.head(2))

    # Post: same for cols[1]; the 'id' column becomes "June Percent".
    table3 = prep_for_plot(
        cols,
        lambda data: answer_percentages(data, cols[1], labels=x_labels),
        df_tmp,
    ).rename(index=str, columns={'id': "June Percent"})
    print(table3.head(2))

    # Outer join on the index keeps answers that occur in only one of the two tables.
    table = table1.merge(table3, left_index=True, right_index=True, how='outer')
    print("Table for:", type_tmp)

    ax = table.plot.bar(rot=45)
    ax.set_ylabel("Percentage")
    ax.set_ylim(0, 100)
    ax.set_xlabel("")

    wording = table.index.tolist()
    texts = ["-1", "1", "2", "3", "4"]


# Cell output: a "<position>=<wording>" legend (wrapped at 20 characters) for the
# last table built above.
"\n".join(textwrap.fill(str(position) + "=" + text, 20)
          for position, text in enumerate(wording, start=1))

In [None]:
# Number of rows in the "hope" frame whose post suicidality answer is 1.
int((uw_datasets["hope"]["BDI_Suicidality_POST_ALL"] == 1).sum())

In [None]:
# Display the column list currently bound to `cols` (set by whichever cell ran last).
cols

In [None]:
# Number of rows in the "direct_engineer" frame whose pre suicidality answer is 0.
int((uw_datasets["direct_engineer"]["BDI_Suicidality_PRE_ALL"] == 0).sum())

In [None]:
import textwrap

cols = ['BDI_Suicidality_PRE_ALL', 'BDI_Suicidality_POST_ALL']
# Answer code -> full wording of the response option.
x_labels = {-1:'NA', 0:'No thoughts of killing myself', 1:'Thoughts of killing myself', 
                2:'I would like to kill myself', 3:'I would kill myself if I had the chance'}


def _bdi_pre_post_percentages(groups, group_names, value_cols, labels):
    """Return ``{group name: [pre value, post value]}`` for the suicidality item.

    Parameters
    ----------
    groups : iterable of DataFrame
        One frame per group; each is first restricted to ``value_cols``.
    group_names : iterable of str
        Dictionary key for each frame, in the same order as ``groups``.
    value_cols : list of str
        ``[pre column, post column]``.
    labels : dict
        Answer code -> wording, forwarded to ``answer_percentages``.

    NOTE(review): the value is taken by position (second row, first column of the
    answer-percentage table). Presumably that is the 'No thoughts ...' row when an
    'NA' row comes first -- confirm against ``answer_percentages``, because the
    position shifts if a category is missing from a group.
    """
    percentages = {}
    for group_frame, group_name in zip(groups, group_names):
        group_frame = group_frame[value_cols]
        pre_table = prep_for_plot(
            value_cols,
            lambda data: answer_percentages(data, value_cols[0], labels=labels),
            group_frame)
        post_table = prep_for_plot(
            value_cols,
            lambda data: answer_percentages(data, value_cols[1], labels=labels),
            group_frame)
        # Scalar access instead of float(<one-element Series>), which newer pandas deprecates.
        percentages[group_name] = [float(pre_table.iloc[1, 0]),
                                   float(post_table.iloc[1, 0])]
    return percentages


# STARS rows restricted to CSE_POST == 1; shared by the three comparisons below.
stars_cse = uw_datasets["stars"][uw_datasets["stars"]["CSE_POST"] == 1]

# The earlier, engineer-wide variant used uw_datasets["stars"] together with the
# "engineer_*" keys instead of the "CSE_*" keys below.

# STARS vs direct admit vs everyone else
bdi_percentages1 = _bdi_pre_post_percentages(
    [stars_cse, uw_datasets["CSE_direct_nostars"], uw_datasets["CSE_nodirect_nostars"]],
    ["STARS", "Direct Admit", "Other Students"],
    cols, x_labels)

# STARS vs URM vs everyone else (the middle group was previously mislabelled "Direct Admit")
bdi_percentages2 = _bdi_pre_post_percentages(
    [stars_cse, uw_datasets["CSE_urm_nostar"], uw_datasets["CSE_nourm_nostar"]],
    ["STARS", "URM", "Other Students"],
    cols, x_labels)

# STARS vs first generation vs everyone else (previously mislabelled "Direct Admit")
bdi_percentages3 = _bdi_pre_post_percentages(
    [stars_cse, uw_datasets["CSE_firstgen_nostar"], uw_datasets["CSE_nofirstgen_nostar"]],
    ["STARS", "First Gen", "Other Students"],
    cols, x_labels)

print(bdi_percentages1, bdi_percentages2, bdi_percentages3)

#### CES-D: Slide 14 (UW is more depressed than CMU(no answers yet! weather?))

In [None]:
# Execute Helpers.ipynb again so the helper functions used below reflect its current contents.
%run Helpers.ipynb

In [None]:
cols = ['CES_D_PRE_ALL', 'CES_D_POST_ALL']

# Pre/post CES-D columns for every participant: summary statistics and a paired t-test.
table = prep_for_plot(cols, lambda x: x, datasets["all"])
print(table.describe())
print(ttest_rel(table["CES_D_PRE_ALL"], table["CES_D_POST_ALL"]))

ax = plot_mean(table, "CES_D", xticks=["Pre", "Post"])

# horizontal dashed line marking the threshold (16)
ax.plot([16] * 3, "k--")

# Keyword arguments shared by the grouped CES-D bar charts in this cell.
cesd_bar_options = dict(plot_type="bar", remove_neg_value=True,
                        ymin=0, ymax=50, ylabel="CES-D Score")

value_cols = ['CES_D_POST_ALL']

# Break the post score out by status. An earlier column set was:
#   'Minority_B1', 'Engineer_MID', 'FirstGen_B1',
#   'MYU_Services_STARS_01_POST', 'DirectAdmit_POST', 'DISC'
index_cols = ['College_Engineering_MYU_ALL', 'Minority_MYU_ALL', 'LOC_ALL']
plot_group(datasets["all"], value_cols, index_cols, mergeconfig,
           title="CES-D Score (all)", **cesd_bar_options)

# (disabled) UW-only chart: uw_datasets["all"], title "CES-D Score (UW)", grouped by
#   'MYU_Services_STARS_01_POST', 'Engineer_MID', 'College_Engineering_MYU_ALL', 'Minority_MYU_ALL'

index_cols = ['MYU_Services_STARS_01_POST', 'College_Engineering_MYU_ALL',
              'Minority_MYU_ALL', 'DirectAdmit_POST']
plot_group(datasets["engineer"], value_cols, index_cols, mergeconfig,
           title="CES-D Score (UW) for engineering hopefuls ", **cesd_bar_options)

# (disabled) CMU-only chart: cmu_datasets["all"], title "CES-D Score (CMU)".
# Its grouping columns are still assigned, so `index_cols` ends the cell with this value.
index_cols = ['College_Engineering_MYU_ALL', 'Minority_MYU_ALL']



In [None]:
# Bar chart grouped by LOC_ALL; `value_cols` is whatever an earlier cell left set.
index_cols = ['LOC_ALL']
plot_group(
    datasets["all"], value_cols, index_cols, mergeconfig,
    plot_type="bar",
    remove_neg_value=True,
    ymin=0,
    ymax=40,
    title="CES-D Score (ALL) for engineering ",
    ylabel="CES-D Score",
)

In [None]:
# Add an entry for the derived DirectAdmit_New column to the merge config:
# a description plus the label -> code mapping.
directadmit_new_entry = {
    'Description': 'DirectAdmit_New',
    'Direct Admit': 1,
    'Not Direct Admit': 0,
    'Unknown': -1,
}
mergeconfig["mergecols"]["DirectAdmit_New"] = directadmit_new_entry

In [None]:
# Boolean mask over the CSE frame: True where the row is a direct admit or a STARS member.
cse_frame = uw_datasets["CSE"]
a = cse_frame["DirectAdmit_POST"].eq(1) | cse_frame["MYU_Services_STARS_01_POST"].eq(1)

In [None]:
# Store the boolean mask `a` (built in the previous cell) as a 1/0 column.
# astype(int) gives an integer dtype directly, whereas chaining
# replace(True, 1).replace(False, 0) depends on pandas' implicit downcasting.
uw_datasets["CSE"]["DirectAdmit_New"] = a.astype(int)

In [None]:
# CES-D post score for the CSE frame, grouped by the survey's direct-admit flag.
# (alternative grouping: index_cols = ['DirectAdmit_New'])
index_cols = ["DirectAdmit_POST"]
value_cols = ['CES_D_POST_ALL']
plot_group(
    uw_datasets["CSE"], value_cols, index_cols, mergeconfig,
    plot_type="bar",
    remove_neg_value=True,
    ymin=0,
    ymax=40,
    title="CES-D Score (UW) for engineering ",
    ylabel="CES-D Score",
)

In [None]:
# Same grouping through plot_group_sample on the "engineering" frame;
# `value_cols` is carried over from an earlier cell.
index_cols = ['DirectAdmit_POST']
plot_group_sample(
    uw_datasets["engineering"], value_cols, index_cols, mergeconfig,
    plot_type="bar",
    remove_neg_value=True,
    ymin=0,
    ymax=57,
    title="CES-D Score (UW) for engineering ",
    ylabel="CES-D Score",
)

In [None]:
% run Helpers.ipynb

In [None]:
# CES-D stacked plot instead

cols = ['CES_D_PRE_ALL', 'CES_D_POST_ALL']

# Raw pre/post scores for all UW rows, plotted with 20 bins.
uw_cesd_scores = prep_for_plot(cols, lambda x: x, uw_datasets["all"])
ax = plot_thresholds(uw_cesd_scores, "CES_D", xticks=["Pre", "Post"], bins=20)

# vertical lines indicating the thresholds
for threshold in (15, 21):
    ax.axvline(x=threshold)

print("---------------")
# Category names handed to answer_assign_bins together with the edges 0 / 16 / 21 / 80.
xlabels = ["No risk (<16)", "At risk (16-20)", "Depressed (21+)"]
cesd = prep_for_plot(
    cols,
    lambda data: answer_assign_bins(data, cols, [0, 16, 21, 80], xlabels),
    uw_datasets["all"],
)

# Percentage per category for each column, in `xlabels` order.
legend = ["Pre", "Post"]
table = prep_for_plot(
    cols,
    lambda data: percentage_for_values(data, cols, order=xlabels),
    cesd,
)
print(table)
plot = create_bar_plot(
    pd.DataFrame(table),
    "CESD at Pre  vs Post",
    ylabel="Percentage of participants",
    legend=legend,
    rot=1,
)

In [None]:
cols = ["CES_D_PRE_ALL", "CES_D_POST_ALL"]
# Category names handed to answer_assign_bins together with the edges 0 / 16 / 21 / 80.
xlabels = ["No risk (<16)", "At risk (16-20)", "Depressed (21+)"]


def _cesd_first_row_percentages(groups, group_names, value_cols, categories):
    """Return ``{group name: first row of the CES-D percentage table}``.

    Each frame in ``groups`` is binned with ``answer_assign_bins`` (edges
    0/16/21/80, names from ``categories``), turned into a percentage table with
    ``percentage_for_values(order=categories)``, and the table's first row is
    stored as a list under the matching entry of ``group_names``.

    NOTE(review): the first row is presumably the first category ("No risk"),
    given ``order=categories`` -- confirm against ``percentage_for_values``.
    """
    percentages = {}
    for group_frame, group_name in zip(groups, group_names):
        binned = prep_for_plot(
            value_cols,
            lambda data: answer_assign_bins(data, value_cols, [0, 16, 21, 80], categories),
            group_frame)
        pct_table = prep_for_plot(
            value_cols,
            lambda data: percentage_for_values(data, value_cols, order=categories),
            binned)
        percentages[group_name] = pct_table.iloc[0, :].tolist()
    return percentages


# STARS rows restricted to CSE_POST == 1; shared by the three comparisons below.
stars_cse = uw_datasets["stars"][uw_datasets["stars"]["CSE_POST"] == 1]

# The earlier, engineer-wide variant used uw_datasets["stars"] together with the
# "engineer_*" keys instead of the "CSE_*" keys below.

# STARS vs direct admit vs everyone else
cesd_percentage1 = _cesd_first_row_percentages(
    [stars_cse, uw_datasets["CSE_direct_nostars"], uw_datasets["CSE_nodirect_nostars"]],
    ["STARS", "Direct Admit", "Other Students"],
    cols, xlabels)
print(cesd_percentage1)

# STARS vs URM vs everyone else (the middle group was previously mislabelled "Direct Admit")
cesd_percentage2 = _cesd_first_row_percentages(
    [stars_cse, uw_datasets["CSE_urm_nostar"], uw_datasets["CSE_nourm_nostar"]],
    ["STARS", "URM", "Other Students"],
    cols, xlabels)
print(cesd_percentage2)

# STARS vs first generation vs everyone else (previously mislabelled "Direct Admit")
cesd_percentage3 = _cesd_first_row_percentages(
    [stars_cse, uw_datasets["CSE_firstgen_nostar"], uw_datasets["CSE_nofirstgen_nostar"]],
    ["STARS", "First Gen", "Other Students"],
    cols, xlabels)
print(cesd_percentage3)

#### Perceived Stress

In [None]:
# Execute Helpers.ipynb again so the helper functions used below reflect its current contents.
%run Helpers.ipynb

In [None]:
# Stress...

cols = ['PSS_PRE_ALL', 'PSS_POST_ALL']

# Pre/post PSS columns for all UW rows: summary statistics and a paired t-test.
stress = prep_for_plot(cols, lambda x: x, uw_datasets['all'])
print(stress.describe())
print(ttest_rel(stress["PSS_PRE_ALL"], stress["PSS_POST_ALL"]))

ax = plot_mean(stress, "PSS", xticks=["Jan", "June"])

# horizontal dashed line marking the threshold (20)
ax.plot([20] * 3, "k--")

# Keyword arguments shared by the grouped PSS bar charts in this cell.
pss_bar_options = dict(plot_type="bar", remove_neg_value=True,
                       ymin=0, ymax=50, ylabel="PSS Score")

# Break this out by status. An earlier column set was:
#   'Minority_B1', 'Engineer_MID', 'FirstGen_B1',
#   'MYU_Services_STARS_01_POST', 'DirectAdmit_POST', 'DISC'
index_cols = ['MYU_Services_STARS_01_POST']
value_cols = ['PSS_POST']
plot_group(datasets["all"], value_cols, index_cols, mergeconfig,
           title="PSS Score", **pss_bar_options)

# From here on the *_ALL variant of the post score is charted.
value_cols = ['PSS_POST_ALL']

# (disabled) all-participants chart: datasets["all"], title "PSS Score (all)", grouped by
#   'College_Engineering_MYU_ALL', 'Minority_MYU_ALL'

index_cols = ['MYU_Services_STARS_01_POST', 'Engineer_MID', 'College_Engineering_MYU_ALL',
              'Minority_MYU_ALL', 'DirectAdmit_POST']
plot_group(uw_datasets["engineer"], value_cols, index_cols, mergeconfig,
           title="PSS Score (UW)", **pss_bar_options)

# (disabled) further charts: uw_datasets["hope"] ("PSS Score (UW) for engineering hopefuls ")
# and cmu_datasets["all"] ("PSS Score (CMU)").


In [None]:
# PSS bar chart grouped by LOC_ALL; `value_cols` is carried over from an earlier cell.
index_cols = ['LOC_ALL']
plot_group(
    datasets["all"], value_cols, index_cols, mergeconfig,
    plot_type="bar",
    remove_neg_value=True,
    ymin=0,
    ymax=40,
    title="PSS Score (all)",
    ylabel="PSS Score",
)

In [None]:
# PSS post score for the CSE frame, grouped by the direct-admit flag.
index_cols = ['DirectAdmit_POST']
value_cols = ['PSS_POST_ALL']
plot_group(
    uw_datasets["CSE"], value_cols, index_cols, mergeconfig,
    plot_type="bar",
    remove_neg_value=True,
    ymin=0,
    ymax=40,
    title="PSS Score (UW) for engineering ",
    ylabel="PSS Score",
)

In [None]:
# Same grouping through plot_group_sample on the "engineer" frame;
# `value_cols` is carried over from an earlier cell.
index_cols = ['DirectAdmit_POST']
plot_group_sample(
    uw_datasets["engineer"], value_cols, index_cols, mergeconfig,
    plot_type="bar",
    remove_neg_value=True,
    ymin=0,
    ymax=50,
    title="PSS Score (UW) for engineering ",
    ylabel="PSS Score",
)

In [None]:
cols = [ 'PSS_PRE_ALL', 'PSS_POST_ALL']
# Category names handed to answer_assign_bins together with the edges 0 / 20 / 80.
xlabels = ["Low Stress", "High Stress"]


def _pss_first_row_percentages(groups, group_names, value_cols, categories):
    """Return ``{group name: first row of the PSS percentage table}``.

    Each frame in ``groups`` is binned with ``answer_assign_bins`` (edges 0/20/80,
    names from ``categories``), turned into a percentage table with
    ``percentage_for_values(order=categories)``, and the table's first row is
    stored as a list under the matching entry of ``group_names``.

    NOTE(review): the first row is presumably the first category ("Low Stress"),
    given ``order=categories`` -- confirm against ``percentage_for_values``.
    """
    percentages = {}
    for group_frame, group_name in zip(groups, group_names):
        binned = prep_for_plot(
            value_cols,
            lambda data: answer_assign_bins(data, value_cols, [0, 20, 80], categories),
            group_frame)
        # As before, the callback ignores its argument and reads the binned frame
        # directly; kept unchanged because prep_for_plot's hand-off is not visible here.
        pct_table = prep_for_plot(
            value_cols,
            lambda data: percentage_for_values(binned, value_cols, order=categories),
            binned)
        percentages[group_name] = pct_table.iloc[0, :].tolist()
    return percentages


# STARS rows restricted to CSE_POST == 1; shared by the three comparisons below.
stars_cse = uw_datasets["stars"][uw_datasets["stars"]["CSE_POST"] == 1]

# The earlier, engineer-wide variant used uw_datasets["stars"] together with the
# "engineer_*" keys instead of the "CSE_*" keys below.

# STARS vs direct admit vs everyone else
pss_percentage1 = _pss_first_row_percentages(
    [stars_cse, uw_datasets["CSE_direct_nostars"], uw_datasets["CSE_nodirect_nostars"]],
    ["STARS", "Direct Admit", "Other Students"],
    cols, xlabels)
print(pss_percentage1)

# STARS vs URM vs everyone else (the middle group was previously mislabelled "Direct Admit")
pss_percentage2 = _pss_first_row_percentages(
    [stars_cse, uw_datasets["CSE_urm_nostar"], uw_datasets["CSE_nourm_nostar"]],
    ["STARS", "URM", "Other Students"],
    cols, xlabels)
print(pss_percentage2)

# STARS vs first generation vs everyone else (previously mislabelled "Direct Admit")
pss_percentage3 = _pss_first_row_percentages(
    [stars_cse, uw_datasets["CSE_firstgen_nostar"], uw_datasets["CSE_nofirstgen_nostar"]],
    ["STARS", "First Gen", "Other Students"],
    cols, xlabels)
print(pss_percentage3)

In [None]:
# Display the shape of the "engineer_urm_nostar" frame.
uw_datasets["engineer_urm_nostar"].shape

#### Loneliness

In [None]:
import textwrap

cols = ['UCLA_Loneliness_PRE_ALL', 'UCLA_Loneliness_POST_ALL']
# Category names handed to answer_assign_bins together with the edges 0 / 50 / 80.
xlabels = ["Low Loneliness", "High Loneliness"]


def _loneliness_first_row_percentages(groups, group_names, value_cols, categories):
    """Chart each group's loneliness categories and return the table's first row per group.

    Each frame in ``groups`` is binned with ``answer_assign_bins`` (edges 0/50/80,
    names from ``categories``) and turned into a percentage table with
    ``percentage_for_values(order=categories)``. The table is drawn with
    ``create_bar_plot`` and its first row is stored as a list under the matching
    entry of ``group_names``.

    NOTE(review): the first row is presumably the first category ("Low
    Loneliness"), given ``order=categories`` -- confirm against
    ``percentage_for_values``.
    """
    percentages = {}
    for group_frame, group_name in zip(groups, group_names):
        binned = prep_for_plot(
            value_cols,
            lambda data: answer_assign_bins(data, value_cols, [0, 50, 80], categories),
            group_frame)
        # As before, the callback ignores its argument and reads the binned frame directly.
        pct_table = prep_for_plot(
            value_cols,
            lambda data: percentage_for_values(binned, value_cols, order=categories),
            binned)
        percentages[group_name] = pct_table.iloc[0, :].tolist()
        # Only the pre and post columns are charted, so the title no longer mentions "Mid".
        create_bar_plot(pd.DataFrame(pct_table), "Loneliness at Pre vs Post for " + group_name)
    return percentages


# STARS vs direct admit vs everyone else
ucla_percentage1 = _loneliness_first_row_percentages(
    [uw_datasets["stars"],
     uw_datasets["engineer_direct_nostars"],
     uw_datasets["engineer_nodirect_nostars"]],
    ["STARS", "Direct Admit", "Other Students"],
    cols, xlabels)
print(ucla_percentage1)

# STARS vs URM vs everyone else
ucla_percentage2 = _loneliness_first_row_percentages(
    [uw_datasets["stars"],
     uw_datasets["engineer_urm_nostar"],
     uw_datasets["engineer_nourm_nostar"]],
    ["STARS", "URM", "Other Students"],
    cols, xlabels)
print(ucla_percentage2)

# STARS vs first generation vs everyone else
ucla_percentage3 = _loneliness_first_row_percentages(
    [uw_datasets["stars"],
     uw_datasets["engineer_firstgen_nostar"],
     uw_datasets["engineer_nofirstgen_nostar"]],
    ["STARS", "First Gen", "Other Students"],
    cols, xlabels)
print(ucla_percentage3)

#### ISEL Appraisal

In [None]:
import textwrap

cols = ['ISEL_APPRAISEL_B2', 'ISEL_APPRAISEL_POST']
# Category names handed to answer_assign_bins together with the edges 0 / 4 / 6.
xlabels = ["Low Appraisel", "High Appraisel"]

# Distribution of the B2 column for the STARS frame.
print(uw_datasets["stars"].ISEL_APPRAISEL_B2.value_counts())


def _isel_first_row_percentages(groups, group_names, value_cols, categories):
    """Chart each group's ISEL categories and return the table's first row per group.

    Each frame in ``groups`` is binned with ``answer_assign_bins`` (edges 0/4/6,
    names from ``categories``) and turned into a percentage table with
    ``percentage_for_values(order=categories)``. The table is drawn with
    ``create_bar_plot`` and its first row is stored as a list under the matching
    entry of ``group_names``.

    NOTE(review): the first row is presumably the first category ("Low
    Appraisel"), given ``order=categories`` -- confirm against
    ``percentage_for_values``.
    """
    percentages = {}
    for group_frame, group_name in zip(groups, group_names):
        binned = prep_for_plot(
            value_cols,
            lambda data: answer_assign_bins(data, value_cols, [0, 4, 6], categories),
            group_frame)
        # As before, the callback ignores its argument and reads the binned frame directly.
        pct_table = prep_for_plot(
            value_cols,
            lambda data: percentage_for_values(binned, value_cols, order=categories),
            binned)
        percentages[group_name] = pct_table.iloc[0, :].tolist()
        # One title for all three comparisons (the third used to say "Pre vs Mid vs Post").
        create_bar_plot(pd.DataFrame(pct_table), "ISEL Appraisel at Pre vs Post for " + group_name)
    return percentages


# STARS vs direct admit vs everyone else
isel_percentage1 = _isel_first_row_percentages(
    [uw_datasets["stars"],
     uw_datasets["engineer_direct_nostars"],
     uw_datasets["engineer_nodirect_nostars"]],
    ["STARS", "Direct Admit", "Other Students"],
    cols, xlabels)
print(isel_percentage1)

# STARS vs URM vs everyone else
isel_percentage2 = _isel_first_row_percentages(
    [uw_datasets["stars"],
     uw_datasets["engineer_urm_nostar"],
     uw_datasets["engineer_nourm_nostar"]],
    ["STARS", "URM", "Other Students"],
    cols, xlabels)
print(isel_percentage2)

# STARS vs first generation vs everyone else
isel_percentage3 = _isel_first_row_percentages(
    [uw_datasets["stars"],
     uw_datasets["engineer_firstgen_nostar"],
     uw_datasets["engineer_nofirstgen_nostar"]],
    ["STARS", "First Gen", "Other Students"],
    cols, xlabels)
print(isel_percentage3)

#### BDI + Depression + Stress

In [None]:
# Shapes of the three frames in the direct-admit comparison.
direct_group_keys = ("stars", "engineer_direct_nostars", "engineer_nodirect_nostars")
print(*[uw_datasets[group_key].shape for group_key in direct_group_keys])

In [None]:
# Shapes of the three frames in the URM comparison.
urm_group_keys = ("stars", "engineer_urm_nostar", "engineer_nourm_nostar")
print(*[uw_datasets[group_key].shape for group_key in urm_group_keys])

In [None]:
# Shapes of the three frames in the first-generation comparison.
firstgen_group_keys = ("stars", "engineer_firstgen_nostar", "engineer_nofirstgen_nostar")
print(*[uw_datasets[group_key].shape for group_key in firstgen_group_keys])

In [None]:
# Show the three BDI result dictionaries. The combined `bdi_percentages` list is
# only assigned in the plotting cell further down, so reading that name here fails
# on a fresh kernel; build the same list from its parts instead.
[bdi_percentages1, bdi_percentages2, bdi_percentages3]

In [None]:
'''Hard coded plot of 4.4.5.1, 4.4.5.2 and 4.4.5.3'''

# One entry per comparison (direct admit / URM / first gen), in that order.
bdi_percentages = [bdi_percentages1, bdi_percentages2, bdi_percentages3]
cesd_percentages = [cesd_percentage1, cesd_percentage2, cesd_percentage3]
pss_percentages = [pss_percentage1, pss_percentage2, pss_percentage3]
print(pss_percentages)
# (disabled) ucla_percentages / isel_percentages would add a fourth, ISEL, panel.

# Colour and line style per group row: 0 = STARS, 1 = middle group, 2 = other.
colors = ["#00AA90", "#9B6E23", "#D0104C"]
line_styles = ('--', '-+', '-.')
# x positions of the (Jan, June) pair for each measure; the fourth pair is unused.
x = [[1, 3], [5, 7], [9, 11], [13, 15]]
# Legend prefix and uw_datasets keys (middle group, other group) per comparison.
comparison_groups = (
    ("Direct Admit", "CSE_direct_nostars", "CSE_nodirect_nostars"),
    ("URM", "CSE_urm_nostar", "CSE_nourm_nostar"),
    ("First Gen", "CSE_firstgen_nostar", "CSE_nofirstgen_nostar"),
)

for ii, (bdi, cesd, pss) in enumerate(zip(bdi_percentages, cesd_percentages, pss_percentages)):
    # The dictionaries hold percentages; the chart shows 100 minus each value.
    BDIII = 100 - np.array(list(bdi.values()))
    Depression = 100 - np.array(list(cesd.values()))
    Stress = 100 - np.array(list(pss.values()))

    fig, ax = plt.subplots()

    # Legend entries carry each group's row count.
    stars_rows = uw_datasets["stars"][uw_datasets["stars"]["CSE_POST"] == 1].shape[0]
    label0 = "STARS (N = " + str(stars_rows) + ")"
    # Any index past the first two uses the last (first gen) entry, like the original else branch.
    middle_name, middle_key, other_key = comparison_groups[min(ii, 2)]
    label1 = middle_name + " (N = " + str(uw_datasets[middle_key].shape[0]) + ")"
    label2 = "Other (N = " + str(uw_datasets[other_key].shape[0]) + ")"

    # BDI lines carry the legend labels and are drawn other -> middle -> STARS.
    for row, row_label in ((2, label2), (1, label1), (0, label0)):
        ax.plot(x[0], BDIII[row], line_styles[row], color=colors[row], label=row_label)

    # CES-D and PSS lines reuse the same styling without labels, drawn STARS -> other.
    for x_pair, scores in ((x[1], Depression), (x[2], Stress)):
        for row in range(3):
            ax.plot(x_pair, scores[row], line_styles[row], color=colors[row])

    ax.set_ylim([-3, 110])
    ax.set_xticks([2, 6, 10])
    ax.set_xticklabels([
        "Jan         June\n\nSuicide Risk (BDI)",
        "Jan         June\n\nDepression (CES-D)",
        "Jan         June\n\nStress (PSS)",
    ])
    plt.xticks(rotation=45)

    ax.set_ylabel("Percentage of Population / %")
    ax.yaxis.grid()
    ax.set_title("Jan vs. June Evaluation")
    ax.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0))
    plt.show()

### UW vs CMU

In [None]:
# Stress: pre vs post PSS across all participants, then the post score by LOC_ALL.

cols = [ 'PSS_PRE_ALL', 'PSS_POST_ALL']
stress = prep_for_plot(cols, lambda x: x, datasets["all"])
print(stress.describe())
print(ttest_rel(stress.PSS_PRE_ALL, stress.PSS_POST_ALL))

index_cols = ['LOC_ALL']
value_cols = ['PSS_POST_ALL']

# Use datasets["all"], the same frame the statistics above come from; the legacy
# `df_all` variable is not assigned in the visible part of this notebook.
plots = plot_group(datasets["all"], value_cols, index_cols, mergeconfig,
           plot_type = "bar", remove_neg_value=True, ymin = 0, ymax = 50,
           title = "PSS Score (all)", ylabel = "PSS Score")

# dashed line at the PSS threshold (20) on the first returned plot
plots[0].plot([20, 20], "k--")




In [None]:
# PSS stacked plot instead
cols = ['PSS_B2', 'PSS_MID', 'PSS_POST']

# Raw scores at the three waves for everyone, plotted with 20 bins.
pss_scores = prep_for_plot(cols, lambda x: x, datasets["all"])
ax = plot_thresholds(pss_scores, "PSS", xticks=["Pre", "Mid", "Post"], bins=20)

# vertical line indicating the threshold
ax.axvline(x=20)

print("---------------")
# Category names handed to answer_assign_bins together with the edges -1 / 0 / 20 / 80.
xlabels = ["NA", "Low Stress", "High Stress"]
pss = prep_for_plot(
    cols,
    lambda data: answer_assign_bins(data, cols, [-1, 0, 20, 80], xlabels),
    datasets["all"],
)

print(pss.head())

# Percentage per category for each wave, in `xlabels` order.
legend = ["Pre", "Mid", "Post"]
table = prep_for_plot(
    cols,
    lambda data: percentage_for_values(data, cols, order=xlabels),
    pss,
)
print(table)
create_bar_plot(pd.DataFrame(table), "PSS at Pre vs Mid vs Post", legend=['Pre', 'Mid', 'Post'])

#### Stress Comparisons by Group

In [None]:
%run Helpers.ipynb
print("getting firstgen counts")
categories = ['FirstGen_B1','Gender_MFO_MID'] # Engineer_MID
cols = ['PSS_B2', 'PSS_MID', 'PSS_POST']
# XX TODO make sure this is a correct mapping, right now if order changes there's a problem {1:'Male',2:'Female'}
#firstgen_labels = ['No','Yes']
levels = [["Not FirstGen", "FirstGen"], ["Male", "Female"]]

plt = plot_percentages(uw_datasets["engineer"], categories, "Percent with low PSS", cols, 
                        lambda seq: percent_matching(lambda x: 0<=x<=19, seq), levels=levels)

plt = plot_percentages(uw_datasets["engineer"], categories, "Percent with high PSS", cols, 
                        lambda seq: percent_matching(lambda x: 20<=x<=80, seq), levels=levels)

plt = plot_percentages(uw_datasets["engineer"], [categories[1]], "High PSS by Gender", cols, 
                        lambda seq: percent_matching(lambda x: 20<=x<=80, seq))
plt = plot_percentages(uw_datasets["engineer"], ['DISC_ALL'], "High PSS by Discrimination", cols, 
                        lambda seq: percent_matching(lambda x: 20<=x<=80, seq))






### Major Life Events

#### Type of MLE Events

In [None]:
# Major life event (MLE) column stems. Each stem is combined with a
# survey-wave suffix from `time` to form an actual column name.
vals = ['MLE_CLASS', 'MLE_RELATE', 'MLE_ILL',
            'MLE_LEGAL_FINANCIAL','MLE_DISCRIMINATION','MLE_GENDER_VIOLENCE','MLE_VIOLENCE','MLE_ASSAULT','MLE_LIVING']
#x_labels = {-1:'NA', 1:'Yes', 0:'No'}

time = ['_B2', '_MID', '_POST']
# One row per event type, one column per wave; filled in by the loop below.
res = pd.DataFrame(index=vals, columns=time)
print(res)

for period in time:
    for val in vals:
        # Raw totals come from the combined dataset ...
        print("total" + val+" during period"+ period)
        print(datasets["all"][val+period].sum())
        # .get() keeps the cell running when nobody answered 1 for this event.
        print(datasets["all"][val+period].value_counts().get(1, 0))

        # ... while the plotted value is computed on the UW engineer dataset.
        # NOTE(review): confirm that mixing the two datasets is intended.
        printD(period, ": Table 1")
        table = prep_for_plot([v + period for v in vals],
                              lambda data: answer_percentages(data, val+period), uw_datasets["engineer"])
        try:
            pct = table.loc[1, 'id']
        except KeyError:
            # No row for answer 1 in the percentage table.
            pct = 0

        res.at[val,period]=pct


display(HTML(res.to_html()))
# One short label per entry of `vals`, in the same order. The list used to
# have only eight entries for nine rows, which raises a length-mismatch
# ValueError. NOTE(review): "rel_violence" (MLE_GENDER_VIOLENCE) is the added
# label -- confirm the wording.
res.index = ["class","rltshp","health","finance","discrim","rel_violence","violence","assault","living"]
create_bar_plot(res, "Types of Major Events", legend=['Pre','Mid','Post'])

# break this out by status
index_cols = ['Minority_B1','URM_B1','Engineer_MID','FirstGen_B1','Gender_MFO_MID',
              'MYU_Services_STARS_01_POST','DirectAdmit_POST']
for category in vals:
    category = category+'_MID'
    # Keep only rows with a non-negative answer to this event question.
    df_tmp = uw_datasets["engineer_simplegender"].loc[uw_datasets["engineer_simplegender"][category] >= 0]
    for col in index_cols:
        df_tmp_narrow = df_tmp.loc[df_tmp[col] >= 0]
        table = df_tmp_narrow.groupby(category)[col]
        # Rows: answer to the event question; columns: value of `col`.
        counts = table.value_counts().unstack()
        category_counts = df_tmp[category].value_counts().to_frame()
        for group in (0, 1):
            try:
                category_counts[col+"_"+str(group)] = counts.loc[:, group]
            except KeyError:
                # Nobody in this slice has `col` equal to `group`.
                category_counts[col+"_"+str(group)] = 0

        # .copy() so the assignments below write to an independent frame.
        category_counts = category_counts[[col+"_0",col+"_1"]].copy()
        # Convert each column to a percentage of its own column total.
        agg = category_counts.sum()
        category_counts[col+"_0"] = 100*category_counts[col+"_0"]/agg[col+"_0"]
        category_counts[col+"_1"] = 100*category_counts[col+"_1"]/agg[col+"_1"]

        theplot = category_counts.plot(kind='bar')
        theplot.set_title(category)

# NOTE(review): this draws a box plot of every column of the filtered frame
# for each (event, status) pair -- confirm that this is the intended figure.
for category in vals:
    category = category + '_MID'
    df_tmp = uw_datasets["engineer_simplegender"].loc[uw_datasets["engineer_simplegender"][category] >=0]
    for col in index_cols:
        df_tmp_narrow = df_tmp.loc[df_tmp[col] >=0]
        df_tmp_narrow.plot(kind='box')

In [None]:
#XX may not work anymore
# Percentage of respondents whose combined MID + POST answer is >= 1 for
# discrimination / relationship violence, under several groupings.
cols = ['Discrimination', 'Rel_Violence']  # 'Assault' is derived below but not plotted

print("getting minor counts")
# XX TODO make sure this is a correct mapping, right now if order changes there's a problem {1:'Male',2:'Female'}
minority_labels = []


def _pct_at_least_one(seq):
    """Hand `seq` to percent_matching together with the predicate x >= 1."""
    return percent_matching(lambda x: x >= 1, seq)


# NOTE: df_tmp is the same object as uw_datasets["engineer"] at this point,
# so the derived columns are added to that shared frame too.
df_tmp = uw_datasets["engineer"]
for derived, stem in (('Discrimination', 'MLE_DISCRIMINATION'),
                      ('Rel_Violence', 'MLE_GENDER_VIOLENCE'),
                      ('Assault', 'MLE_ASSAULT')):
    df_tmp[derived] = df_tmp[stem + '_MID'] + df_tmp[stem + '_POST']

categories = ['Minority_B1', 'Engineer_MID']
levels = [["Not Minority", "Minority"], ["Not Eng", "Engineer"]]
plot_percentages(df_tmp, categories,
                 "% discrimination in the last year by minority and major", cols,
                 agg=_pct_at_least_one, levels=levels, max=30)

# Same breakdown again, this time without the max= argument.
categories = ['Minority_B1', 'Engineer_MID']
levels = [["Not Minority", "Minority"], ["Not Eng", "Engineer"]]
plot_percentages(df_tmp, categories,
                 "% discrimination in the last year by minority and major", cols,
                 agg=_pct_at_least_one, levels=levels)

# Drop rows whose Gender_MFO_MID is -1 or 0 before the gender breakdowns.
# From here on df_tmp is a new frame, no longer the shared dataset.
for excluded in (-1, 0):
    df_tmp = df_tmp.drop(index=list(df_tmp[df_tmp['Gender_MFO_MID'] == excluded].index))

categories = ['Gender_MFO_MID', 'Engineer_MID']
levels = [['Male', 'Female'], ['Not Eng', 'Eng']]
plot_percentages(df_tmp, categories,
                 "% discrimination in the last year by gender, and engineering", cols,
                 agg=_pct_at_least_one, levels=levels, max=30)

categories = ['Minority_B1', 'Gender_MFO_MID', 'Engineer_MID']
levels = [["Not Minority", "Minority"], ['Male', 'Female'], ['Non Eng', 'Engineer']]
plot_percentages(df_tmp, categories,
                 "% discrimination in the last year for engineers, broken down by gender and minority status", cols,
                 agg=_pct_at_least_one, levels=levels, max=30)

# Independent copy with the Engineer_MID == 0 rows removed.
df3 = copy.deepcopy(df_tmp)
df3 = df3.drop(index=df3.loc[df3['Engineer_MID'] == 0].index)
categories = ['Minority_B1', 'Gender_MFO_MID']
levels = [["Not Minority", "Minority"], ['Male', 'Female']]
plot_percentages(df3, categories,
                 "% discrimination in the last year for engineers, broken down by gender and minority status", cols,
                 agg=_pct_at_least_one, levels=levels, max=30)

# Grouped by DISC_ALL; no level names are supplied for this one.
categories = ['DISC_ALL']
plot_percentages(df3, categories,
                 "% discrimination in the last year, compared to ema reports of discrimination", cols,
                 agg=_pct_at_least_one, max=30)


In [None]:
#XX may not work anymore
# Same set of breakdowns as for the engineer dataset, run on
# uw_datasets["CSE"] and grouped by URM_B1.
cols = ['Discrimination', 'Rel_Violence']  # 'Assault' is derived below but not plotted

print("getting minor counts")
# XX TODO make sure this is a correct mapping, right now if order changes there's a problem {1:'Male',2:'Female'}
minority_labels = []


def _pct_at_least_one(seq):
    """Hand `seq` to percent_matching together with the predicate x >= 1."""
    return percent_matching(lambda x: x >= 1, seq)


# NOTE: df2 is the same object as uw_datasets["CSE"] at this point, so the
# derived columns are added to that shared frame too.
df2 = uw_datasets["CSE"]
for derived, stem in (('Discrimination', 'MLE_DISCRIMINATION'),
                      ('Rel_Violence', 'MLE_GENDER_VIOLENCE'),
                      ('Assault', 'MLE_ASSAULT')):
    df2[derived] = df2[stem + '_MID'] + df2[stem + '_POST']

categories = ['URM_B1', 'Engineer_MID']
levels = [["Not Minority", "Minority"], ["Not Eng", "Engineer"]]
plot_percentages(df2, categories,
                 "% discrimination in the last year by minority and major", cols,
                 agg=_pct_at_least_one, levels=levels, max=30)

# Same breakdown again, this time without the max= argument.
categories = ['URM_B1', 'Engineer_MID']
levels = [["Not Minority", "Minority"], ["Not Eng", "Engineer"]]
plot_percentages(df2, categories,
                 "% discrimination in the last year by minority and major", cols,
                 agg=_pct_at_least_one, levels=levels)

# Drop rows whose Gender_MFO_MID is -1 or 0 before the gender breakdowns.
# From here on df2 is a new frame, no longer the shared dataset.
for excluded in (-1, 0):
    df2 = df2.drop(index=list(df2[df2['Gender_MFO_MID'] == excluded].index))

categories = ['Gender_MFO_MID', 'Engineer_MID']
levels = [['Male', 'Female'], ['Not Eng', 'Eng']]
plot_percentages(df2, categories,
                 "% discrimination in the last year by gender, and engineering", cols,
                 agg=_pct_at_least_one, levels=levels, max=100)

categories = ['URM_B1', 'Gender_MFO_MID', 'Engineer_MID']
levels = [["Not Minority", "Minority"], ['Male', 'Female'], ['Non Eng', 'Engineer']]
plot_percentages(df2, categories,
                 "% discrimination in the last year for engineers, broken down by gender and URM status", cols,
                 agg=_pct_at_least_one, levels=levels, max=100)

# Independent copy with the CSE_POST == 0 rows removed.
df3 = copy.deepcopy(df2)
df3 = df3.drop(index=df3.loc[df3['CSE_POST'] == 0].index)
categories = ['URM_B1', 'Gender_MFO_MID']
levels = [["Not Minority", "Minority"], ['Male', 'Female']]
plot_percentages(df3, categories,
                 "% discrimination in the last year for engineers, broken down by gender and minority status", cols,
                 agg=_pct_at_least_one, levels=levels, max=100)

# Grouped by DISC_ALL; no level names are supplied for this one.
categories = ['DISC_ALL']
plot_percentages(df3, categories,
                 "% discrimination in the last year, compared to ema reports of discrimination", cols,
                 agg=_pct_at_least_one, max=100)


In [None]:
%run Helpers.ipynb
#XX may not work anymore
cols = ['Discrimination','Rel_Violence'#,'Assault'
        ]

print("getting minor counts")
categories = ['URM_MYU_ALL','Gender_MYU_ALL','Sexuality_MID']
# XX TODO make sure this is a correct mapping, right now if order changes there's a problem {1:'Male',2:'Female'}
minority_labels = []

#table = prep_for_plot(cols, lambda data: percentage_for_values(df, cols),smu)
#plt = create_bar_plot(pd.DataFrame(table).T, "Social Media Use: Posts", xticks=xticks)
print("only engineers")
df2 = uw_datasets["engineer_simplegender"]
df2['Discrimination'] = df2.MLE_DISCRIMINATION_MID + df2.MLE_DISCRIMINATION_POST
df2['Rel_Violence'] = df2.MLE_GENDER_VIOLENCE_MID+df2.MLE_GENDER_VIOLENCE_POST
df2['Assault'] = df2.MLE_ASSAULT_MID+df2.MLE_ASSAULT_POST

levels = [["Not Minority", "Minority"], ["Male", "Female"], ["Straight", "LGBTQIA"]]

plot_percentages(df2, categories, "% discrimination in the last year for engineers, by minority status, male and female, sexual orientation", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=50)


categories = ['URM_MYU_ALL','Gender_MYU_ALL']
levels = [["Not Minority", "Minority"], ["Male", "Female"]]

plot_percentages(df2, categories, "% discrimination in the last year for engineers, by minority status, male and female", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=50)


categories = ['DISC_ALL']

plot_percentages(df2, categories, "% discrimination in the last year, compared to ema reports of discrimination", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq),  max=50)


#### Plot for Slide 27 (Discrimination among Engineers)

In [None]:
%run Helpers.ipynb
#XX may not work anymore

print("getting minor counts")
categories = ['URM_MYU_ALL','Female','Sexuality_MID']
# XX TODO make sure this is a correct mapping, right now if order changes there's a problem {1:'Male',2:'Female'}
minority_labels = []

#table = prep_for_plot(cols, lambda data: percentage_for_values(df, cols),smu)
#plt = create_bar_plot(pd.DataFrame(table).T, "Social Media Use: Posts", xticks=xticks)
print("only engineers")
df2 = uw_datasets["engineer"]
df2['Female'] = df2.Gender_MYU_ALL==2
print(df2.Female.value_counts())
print(df2.Gender_MYU_ALL.value_counts())

df2['Discrimination'] = df2.MLE_DISCRIMINATION_MID + df2.MLE_DISCRIMINATION_POST
df2['Discrimination_Y'] = df2.Discrimination >0
df2['Discrimination_N'] = df2.Discrimination == 0
df2['Violence'] = df2.MLE_GENDER_VIOLENCE_MID+df2.MLE_GENDER_VIOLENCE_POST
df2['Violence_Y'] = df2.Violence >0
df2['Violence_N'] = df2.Violence == 0
df2['Assault'] = df2.MLE_ASSAULT_MID+df2.MLE_ASSAULT_POST
df2['Assault_Y'] = df2.Assault >0
df2['Assault_N'] = df2.Assault == 0

levels = [["Not Minority", "Minority"], ["Male", "Female"], ["Straight", "LGBTQIA"]]
cols = ['Discrimination_N','Discrimination_Y']#,'Assault']

plot_percentages(df2, categories, "% discrimination in the last year for engineers, by minority status, male and female, sexual orientation", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=100)


categories = ['URM_MYU_ALL','Female']
levels = [["Not URM", "URM"], ["Male", "Female"]]

plot_percentages(df2, categories, "% discrimination in the last year for engineers, by minority status, male and female", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=100)

cols = ['Discrimination_N','Discrimination_Y', 'Assault_Y', 'Assault_N']

plot_percentages(df2, categories, "% discrimination in the last year for engineers, by minority status, male and female", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=100)

categories = ['DISC_ALL']
cols = ['Discrimination_N','Discrimination_Y']#,'Assault']
plot_percentages(df2, categories, "% discrimination in the last year, compared to ema reports of discrimination", cols, 
                 xticks=cols, agg=lambda seq: percent_matching(lambda x: x>=1, seq),  max=100)


In [None]:
# URM x gender breakdown. `df2` and `cols` are not defined in this cell; they
# must already exist in the kernel from an earlier cell.
categories = ['URM_MYU_ALL', 'Gender_MYU_ALL']
levels = [["Not URM", "URM"], ["Male", "Female"]]

plot_percentages(
    df2, categories,
    "% discrimination in the last year for engineers, by minority status, male and female",
    cols,
    xticks=cols,
    agg=lambda seq: percent_matching(lambda score: score >= 1, seq),
    levels=levels,
    max=100,
)

In [None]:
% run Helpers.ipynb

In [None]:
# Minority x gender breakdown, first with the Engineer_MID == 0 rows removed,
# then with the Engineer_MID == 1 rows removed.
# `df2` and `cols` are not defined in this cell; they must already exist in
# the kernel from an earlier cell.
df3 = copy.deepcopy(df2)
df3 = df3.drop(df3[df3.Engineer_MID == 0].index)
categories = ['Minority_B1', 'Gender_MFO_MID']
levels = [["Not Minority", "Minority"],['Male','Female']]

plot = plot_percentages(df3, categories, "% discrimination and violence in the last year for engineers\nbroken down by gender and minority status", cols, 
                 agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=30)
# set_ylabel is a method: assigning a string to it would replace the method
# and leave the axis unlabelled.
plot.set_ylabel("Percentage")
plot.set_xticklabels(cols)

df3 = copy.deepcopy(df2)
df3 = df3.drop(df3[df3.Engineer_MID == 1].index)
categories = ['Minority_B1', 'Gender_MFO_MID']
# `levels` is reused unchanged from the first half of this cell.

plot = plot_percentages(df3, categories, "% discrimination and violence in the last year for non engineers\nbroken down by gender and minority status", cols, 
                 agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=100)
plot.set_ylabel("Percentage")
plot.set_xticklabels(cols)

In [None]:
# URM x gender breakdown, first with the Engineer_MID == 0 rows removed, then
# with the Engineer_MID == 1 rows removed.
# `df2` and `cols` are not defined in this cell; they must already exist in
# the kernel from an earlier cell.
df3 = copy.deepcopy(df2)
df3 = df3.drop(df3[df3.Engineer_MID == 0].index)
categories = ['URM_B1', 'Gender_MFO_MID']
levels = [["Not Minority", "Minority"],['Male','Female']]

plot = plot_percentages(df3, categories, "% discrimination and violence in the last year for engineers\nbroken down by gender and minority status", cols, 
                 agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=30)
# set_ylabel is a method: assigning a string to it would replace the method
# and leave the axis unlabelled.
plot.set_ylabel("Percentage")
plot.set_xticklabels(cols)

df3 = copy.deepcopy(df2)
df3 = df3.drop(df3[df3.Engineer_MID == 1].index)
categories = ['URM_B1', 'Gender_MFO_MID']
# `levels` is reused unchanged from the first half of this cell.

plot = plot_percentages(df3, categories, "% discrimination and violence in the last year for non engineers\nbroken down by gender and minority status", cols, 
                 agg=lambda seq: percent_matching(lambda x: x>=1, seq), levels=levels, max=30)
plot.set_ylabel("Percentage")
plot.set_xticklabels(cols)

In [None]:
# Re-run Helpers.ipynb so the cells below use its current definitions.
%run Helpers.ipynb

In [None]:
# Non-null counts per column for every minority x gender x sexuality group.
# `df3` is not defined in this cell; it must already exist in the kernel.
categories = ['Minority_B1', 'Gender_MFO_MID', 'Sexuality_MID']
levels = [["Not Minority", "Minority"], ['Male', 'Female'], ['Straight', 'LGBTQ']]

df3.groupby(categories).count()

In [None]:
# Three-way (minority x gender x sexuality) plot with hand-written legend
# labels. `df2` and `cols` are not defined in this cell; they must already
# exist in the kernel from an earlier cell.
df3 = copy.deepcopy(df2)
categories = ['Minority_B1', 'Gender_MFO_MID', 'Sexuality_MID']
levels = [["Not Minority", "Minority"],['Male','Female'],["Straight", "LGBTQ"]]
# One legend entry per group. The comma after the "N = 44" entry matters:
# without it Python concatenates the last two strings into a single label.
# NOTE(review): the N values are hard-coded; re-check them against
# df3.groupby(categories).count() whenever the data changes.
labels = ["Not Minority, Male, Straight, N = 7", 
          "Not Minority, Male, LGBTQ, N = 1", 
          "Not Minority, Female, Straight, N = 26", 
          "Not Minority, Female, LGBTQ, N = 3", 
          "Minority, Male, Straight, N = 10", 
          "Minority, Male, LGBTQ, N = 6", 
          "Minority, Female, Straight, N = 44",
          "Minority, Female, LGBTQ, N = 5"]

plot = plot_percentages(df3, categories, 
  "% discrimination and violence in the last year\nbroken down by gender and minority status", cols, 
  agg=lambda seq: percent_matching(lambda x: x>=1, seq),
  max=30, legend = True, hatches = True, #levels=levels)
  legend_labels = labels)
plot.set_ylabel("Percentage / %")
plot.set_xticklabels(cols)

In [None]:
# Three-way (URM x gender x sexuality) plot with hand-written legend labels.
# `df2` and `cols` are not defined in this cell; they must already exist in
# the kernel from an earlier cell.
df3 = copy.deepcopy(df2)
categories = ['URM_B1', 'Gender_MFO_MID', 'Sexuality_MID']
levels = [["Not Minority", "Minority"],['Male','Female'],["Straight", "LGBTQ"]]
# One legend entry per group. The comma after the "N = 44" entry matters:
# without it Python concatenates the last two strings into a single label.
# NOTE(review): the N values are hard-coded and identical to the ones used
# for the Minority_B1 grouping; confirm they also hold for URM_B1.
labels = ["Not Minority, Male, Straight, N = 7", 
          "Not Minority, Male, LGBTQ, N = 1", 
          "Not Minority, Female, Straight, N = 26", 
          "Not Minority, Female, LGBTQ, N = 3", 
          "Minority, Male, Straight, N = 10", 
          "Minority, Male, LGBTQ, N = 6", 
          "Minority, Female, Straight, N = 44",
          "Minority, Female, LGBTQ, N = 5"]

plot = plot_percentages(df3, categories, 
  "% discrimination and violence in the last year\nbroken down by gender and minority status", cols, 
  agg=lambda seq: percent_matching(lambda x: x>=1, seq),
  max=30, legend = True, hatches = True, #levels=levels)
  legend_labels = labels)
plot.set_ylabel("Percentage / %")
plot.set_xticklabels(cols)