# Initialization

In [None]:
import pandas as pd
import numpy as np
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell
from IPython.core.display import HTML
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import copy

In [None]:
# Run the shared helper notebook. Names used below but not defined in this file
# (e.g. printD) presumably come from it -- TODO confirm against Helpers.ipynb.
%run Helpers.ipynb
# Debug switch; presumably consulted by printD to decide whether to print -- verify.
DEBUG=False

# Helper Functions for Loading Data

In [None]:
# To load each survey, we first read the csv file,
# then use the transdict to convert the headers to standard names,
# and drop the columns we are not interested in.
# Next, we drop any entries that do not have an appropriate participant ID (>max_PID).
# Finally, we perform data cleaning.
# @config the general config (provides "surveydir" and "configdir")
# @dataconfig the data config (lists the surveys and their per-survey settings)
def load_surveys(config, dataconfig):
    """Load and clean every survey listed in ``dataconfig["surveys"]``.

    For each survey the csv is read (two header rows), its columns are renamed
    through the survey's transdict, unwanted columns are dropped, and rows are
    filtered: PID must be <= ``max_pid``, the status column (when present) must
    equal 0, and at most ``max_skip`` answers may be missing (only applied when
    ``max_skip`` > 0). Column types are then coerced, duplicates are reported,
    the survey-specific cleaning function is run and the frame is indexed by
    the merge index.

    Args:
        config: dict providing "surveydir" and "configdir" path prefixes.
        dataconfig: dict providing "surveys", "drop", "mergeindex" and one
            entry per survey with "file", "transdict", "name", "index",
            "max_pid", "max_skip", "cleaning" and "notes".

    Returns:
        dict mapping each survey key to its cleaned DataFrame.
    """
    surveydir = config["surveydir"]
    configdir = config["configdir"]
    surveys = dataconfig["surveys"]
    results = {}

    for survey in surveys:
        print("--------------------------------------------------")
        print("loading survey: ", survey)
        print("--------------------------------------------------")

        survey_config = dataconfig[survey]
        file = surveydir + survey_config["file"]

        # Load the json file that has the transdict in it
        # (the with-block closes the file; no explicit close needed)
        with open(configdir + survey_config["transdict"], 'r') as file_obj:
            transdict = json.load(file_obj)

        # the unique tag for this survey
        name = survey_config["name"]
        # index variables that should have the same name in all surveys
        index = survey_config["index"]
        # max_PID the maximum PID number for a participant
        max_PID = survey_config["max_pid"]
        # the maximum number of survey questions that can have no answer. If -1, don't drop
        max_skip = survey_config["max_skip"]
        # specialty cleaning code for this survey (by default does nothing)
        # NOTE(review): eval executes whatever expression the config holds; only
        # safe while the data config is trusted.
        custom_cleaning = eval(survey_config["cleaning"])
        # index variable for merging
        mergeindex = dataconfig["mergeindex"]

        print(survey_config["notes"])

        #################################### read the survey in ##########################################
        df = pd.read_csv(file, header=[0,1])
        printD(df.shape)

        #################################### set up column names for transdict ###########################
        # participant ID (PID) should not have a survey-specific name; same with recordId (ID)
        # Everything else *SHOULD* have name after it.
        for key, val in transdict.items():
            if val not in index:
                transdict[key] = val + "_" + name

        printD("rename columns")
        # only the first header row is used to look up the standard name
        columns = np.array([transdict[x[0]] for x in df.columns.values])
        df.columns = columns
        printD(df.shape)

        #################################### set up column names for dropping from transdict #############
        printD("drop columns")
        # the columns to drop (that we don't need) (by default drop_cols)
        drop = dataconfig["drop"]
        drop = [item if item in index else item + "_" + name for item in drop]
        drop = np.intersect1d(columns, drop)
        printD(drop)
        df = df.drop(columns=drop)
        printD(df.shape)

        #################################### check for correct PID size ###################################
        print("todo: have a list of correct PIDs rather than a max pid")
        query = 'PID <= ' + str(max_PID)
        kept = df.query(query)
        # report how many rows are actually removed (the original printed the kept count)
        printD("dropping people who did not have a correct PID, #=", df.shape[0] - kept.shape[0])
        df = kept
        printD(df.shape)

        #################################### check for completion (qualtrics variable) ####################
        printD("dropping surveys with incomplete status")
        status_col = 'status' + '_' + name
        if status_col in df.columns:
            printD("dropping surveys with a response value of 0 (not complete)")
            # keeps only the rows whose status equals 0
            df = df[df[status_col] == 0]
        else:
            print("survey has not status")
        printD(df.shape)

        #################################### drop any row with at least max_skip entries missing #########
        if max_skip > 0:
            printD("dropping people who didn't answer lots of questions")
            na_list = df.isnull().sum(axis=1)
            printD("dropping people who skipped at least", max_skip, "questions. numskipped for each participant: ",
                   na_list.tolist())
            print("Dropping people who didn't answer lots of questions", df["PID"][na_list > max_skip].tolist())
            df = df[na_list <= max_skip]
            printD(df.shape)

        #################################### fill NAs with appropriate choices #############################
        # Numeric columns are converted to numbers, date columns to datetimes, and
        # anything else is treated as text with missing values replaced by ''.
        # Work on an explicit copy so the column assignments below do not write
        # into a filtered view of the original frame.
        df = df.copy()
        for column in df.columns.values:
            printD(column)
            try:
                df[column] = pd.to_numeric(df[column], downcast="integer")
                printD("number")
            except (ValueError, TypeError):
                try:
                    df[column] = pd.to_datetime(df[column])
                    printD("date")
                except (ValueError, TypeError, OverflowError):
                    df[column] = df[column].fillna('')
                    printD("text")

        #################################### check for duplicates #############################
        print("finding duplicates")
        dups = find_duplicate_pids(max_PID, df)
        print("duplicates", dups)

        #################################### run custom cleaning function #############################
        printD("custom cleaning")
        df = custom_cleaning(df)
        printD(df.shape)

        #################################### check that diffs are resolved #############################
        dups = find_duplicate_pids(max_PID, df)
        print("duplicates", dups)
        print_diffs(max_PID, df)

        #################################### set the index up #############################
        # Defaults into the first value in the index array
        print("setting index")
        df[mergeindex] = df[index[0]]
        df = df.set_index(mergeindex)
        # keep the merge index available as a regular column as well
        df[mergeindex] = df.index

        print(df.shape)
        display(HTML(df.head(n=2).to_html()))
        results[survey] = df
    return results

                
def find_duplicate_pids(max_PID, survey):
    """Return the PIDs that occur more than once in ``survey['PID']``.

    The previous implementation counted into an array of length ``max_PID``,
    which raised IndexError for a PID equal to ``max_PID`` and silently wrapped
    negative PIDs. Counting with ``value_counts`` has neither problem.

    Args:
        max_PID: kept for backward compatibility with existing callers; the
            count no longer depends on it.
        survey: DataFrame with a 'PID' column.

    Returns:
        Sorted numpy array of the duplicated PID values (empty when none).
    """
    counts = survey['PID'].value_counts()
    return np.sort(counts[counts > 1].index.to_numpy())

#PrintDs the contents of the columns that differ, for each column that has a difference
# This is done for each PID for which there are duplicates.
def print_diffs(max_PID, survey):
    """Debug-print, for every duplicated PID, the columns whose values differ.

    For each PID reported by find_duplicate_pids, the matching rows are
    selected and each column's unique values are computed; columns with
    exactly two unique values are printed through printD.

    Args:
        max_PID: passed through to find_duplicate_pids.
        survey: DataFrame with a 'PID' column.
    """
    print("printing diffs")
    pids = find_duplicate_pids(max_PID, survey)
    if len(pids) == 0:
        printD("no duplicates left")
        return

    printD("duplicates", pids)
    for pid in pids:
        printD(pid)
        dups = survey[survey.PID == pid]
        printD(dups)
        for column in dups:
            values = dups[column].values
            printD(values)
            try:
                unique = np.unique(values)
            except TypeError:
                # values of mixed, unorderable types cannot be sorted by np.unique
                print("couldn't test unique")
                print(dups[column])
                # skip the check: `unique` would be unbound or left over from
                # the previous column
                continue
            if len(unique) == 2:
                printD("two the same")
                printD(column)
                printD(unique)
        # prints the last column visited, as the original did
        printD(dups[column])
            
# Remove a duplicate row from the survey, identified by ID
# pid is the participant id for whom there is a duplicate;
# testvar is a variable that can be used to show that there 
# It is meaningless except to help the writer eyeball things
# survey is the survey that has the duplicate.
def remove_dup(pid, survey, ID, testvar=''):
    """Drop the row with record id ``ID`` when participant ``pid`` is duplicated.

    Nothing is removed unless ``pid`` has more than one row and the row
    identified by ``ID`` actually belongs to ``pid``.

    Args:
        pid: participant id suspected of having duplicate rows.
        survey: DataFrame with 'PID' and 'ID' columns.
        ID: record id of the row to remove.
        testvar: optional column name; when given, only that column of the
            matching rows is selected for the debug output.

    Returns:
        The survey, without the row ``ID`` if it was removed.
    """
    printD("---------------------------------------------", pid)

    def matching_rows(frame):
        # rows of `frame` for this pid, narrowed to testvar when one was given
        rows = frame[frame.PID==pid]
        return rows if (testvar == '') else rows[testvar]

    dups = matching_rows(survey)
    if len(dups) <= 1:
        printD("no duplicates",dups)
        return survey

    dups = survey[survey.PID==pid]
    printD("number of empty values", dups.isnull().sum(axis=1).tolist(), dups['PID'].values)
    owners = survey[survey.ID==ID]['PID'].values
    printD ("checking for id ", ID, " in data for ", owners)
    if pid not in owners:
        printD("PID ",pid," isn't in ", owners)
        return survey

    printD("removing ID", ID, "for",  owners)
    survey = survey[survey.ID != ID]
    dups = matching_rows(survey)
    printD("number of dups left for ",pid," is: ", len(dups))
    return survey

# Vals is a dictionary of keys (current values) and replacements. All cols in survey
# should be searched for those keys and have them replaced with the appropriate values. 
def cleanup_vals(survey, cols, vals):
    """Replace known bad values in the given columns.

    Args:
        survey: DataFrame to clean (its columns are reassigned in place).
        cols: column names to search.
        vals: dict mapping a current value to its replacement. Keys are applied
            one at a time, in dict order, to every column in ``cols``.

    Returns:
        The same survey object with the replacements applied. Columns that are
        missing, or whose contents cannot be compared with a key, are skipped.
    """
    for key, replacement in vals.items():
        for col in cols:
            printD(key)
            try:
                survey[col] = survey[col].replace(key, replacement)
            except (KeyError, TypeError, ValueError):
                # missing column, or a column whose dtype cannot hold/compare the key
                printD("\'"+str(key)+"\' not in "+col)
            else:
                # reached whenever replace() ran, whether or not the key was present
                printD("\'"+str(key)+"\' in "+col)
    return survey

## Cleaning functions for specific Surveys

Cleaning has several steps. First we drop rows with too many questions unanswered; the threshold may
vary from survey to survey. Then we repair problem data and assign NA values to empty cells.
Finally we find and address duplicate entries.

In [None]:
def uw_baseline1_specific(survey):
    """Survey-specific cleaning for the UW baseline 1 survey (suffix ``_B1``).

    Converts textual housing answers to numbers, removes one duplicate
    response for PID 141 and adds a ``LOC_B1`` column set to 0.
    """
    printD("uw baseline1 specific")
    printD(survey.columns)

    # Bug #1 -- housing answers had no data validation, so some are textual;
    # map the known textual answers onto numbers.
    printD("fixing housing")
    housing_cols = ['Housing_Q2_%d_B1' % i for i in range(1, 6)]
    housing_cols.extend(['Housing_Q2_B1', 'Housing_Homenum_B1'])
    printD(housing_cols)
    replacements = {
        "None": 0,
        "None ": 0,
        "Older Sister, Younger Sister": 2,
        "Mom and Dad": 2,
        "2 (+1 sibling)": 3,
        "80*": 80,
        "80+": 80,
        "Roomate": 2,
        "Roomate ": 2,
        "10+": 10,
        "Roommate ": 2,
    }
    survey = cleanup_vals(survey, housing_cols, replacements)

    # Bug #2 -- the engineer question was asked twice (Baseline1 and Baseline3);
    # the columns still need merging. The isengineer function accounts for this for now.
    print("XXX TODO Bug in uw_baseline1_specific")

    # Bug #3 -- PID 141 answered twice with basically the same data; drop one entry.
    survey = remove_dup(141, survey, 'R_3hF7l4fLFnTwoCS', 'Age_B1')

    survey['LOC_B1'] = 0
    return survey
    
def uw_baseline2_specific(survey):
    """Survey-specific cleaning for the UW baseline 2 survey (suffix ``_B2``).

    The major-life-event (MLE) block had no NA option, so some participants
    selected everything. Any row with more than 20 answers equal to 3 and no
    answer equal to -1 across the columns ``MLE_Q1_ChooseMajor_B2`` ..
    ``MLE_Q1_Horror_B2`` has its 3s rewritten to -1. A ``LOC_B2`` column set
    to 0 is added.

    Args:
        survey: DataFrame containing the MLE columns as a contiguous slice.

    Returns:
        The same survey object, modified in place.
    """
    # Bug #1 -- we asked twice if people were engineers. We need to merge those columns
    # in Baseline2 and Baseline3.  Right now, the isengineer function accounts for this.
    print("XXX TODO Bug in uw_baseline2_specific")

    # Bug #2 -- no NA option for major life events (see docstring).
    print("cleaning up MLE so we NA bad answers")
    first_col, last_col = 'MLE_Q1_ChooseMajor_B2', 'MLE_Q1_Horror_B2'
    nas = {3: -1, 0: 0, 1: 1, 2: 2}

    def overselected(block):
        # rows with more than 20 answers of 3 and no -1 at all; `== 0` replaces
        # the original identity test `is 0`, which only worked by accident
        threes = (block == 3).sum(axis=1)
        skipped = (block == -1).sum(axis=1)
        flagged = (threes > 20) & (skipped == 0)
        return flagged[flagged].index

    print("selecting columns")
    df = survey.loc[:, first_col:last_col]
    for index in overselected(df):
        survey.loc[index, first_col:last_col] = survey.loc[index, first_col:last_col].map(
            lambda x: nas[x], na_action="ignore")

    # sanity check: report any row that still matches after the rewrite
    df = survey.loc[:, first_col:last_col]
    for index in overselected(df):
        print("index needs updated")
        print(index)
        print(df.loc[index].value_counts())

    # Bug #3
    # will need to clean PSQI_Q2_FALL -- how long does it take you to fall asleep. Sample answers to build a
    # solution from: ['5 min', '60 minutes','60+ minutes','15min','15-30','0-10 minutes','&lt;10 min',
    #                  '12:00am','10-15 mins','14mim']
    print("XXX TODO Bug in uw_baseline2_specific")

    survey['LOC_B2'] = 0

    return survey

def uw_mid_specific(survey):
    """Survey-specific cleaning for the UW mid survey (suffix ``_MID``).

    Converts textual housing answers to numbers, removes one duplicate
    response each for PIDs 11 and 15, prints remaining differences and adds a
    ``LOC_MID`` column set to 0.
    """
    # Bug #1 -- housing answers had no data validation, so some are textual;
    # map the known textual answers onto numbers.
    printD("cleaning up housing so it is numeric")
    housing_cols = ['Housing_Q2_{}_MID'.format(i) for i in range(1, 6)]
    housing_cols.extend(['Housing_Q2_MID', 'Housing_Homenum_MID'])
    printD(housing_cols)
    replacements = {
        "None": 0,
        "None ": 0,
        "Older Sister, Younger Sister": 2,
        "Mom and Dad": 2,
        "2 (+1 sibling)": 3,
        "80*": 80,
        "80+": 80,
        "Roomate": 2,
        "Roomate ": 2,
        "10+": 10,
        "Roommate ": 2,
    }
    survey = cleanup_vals(survey, housing_cols, replacements)

    # Bug #2 -- the engineer question was asked twice; the columns still need
    # merging. The isengineer function accounts for this for now.

    # XXX TODO: PSQI_Q2_FALL (time to fall asleep) still needs cleaning. Sample answers:
    # ['5 min', '60 minutes','60+ minutes','15min','15-30','0-10 minutes','&lt;10 min',
    #  '12:00am','10-15 mins','14mim']
    print("XXX TODO Bug in uw_mid_specific")

    # DUPS: [ 11  15]
    duplicate_responses = (
        # PID 11: entries nearly identical; this one had slightly less detail
        # about major, drinking etc.
        (11, 'R_2wysy4zTpMXrmhc'),
        # PID 15: entries almost identical; the slightly more complete one is kept.
        (15, 'R_1Fggwk8xdyz7dG8'),
    )
    for pid, response_id in duplicate_responses:
        survey = remove_dup(pid, survey, response_id, 'Major_Proposed_TEXT_MID')

    # Not applied (kept for reference):
    #   PID 63  -- survey.loc[survey['ID']=='R_ZgcUPyocJ64o8pP', 'PSQI_Q2_FALL_MID'] = '30'
    #   PID 167 -- remove_dup(167, survey, 'R_d5Sr9qs3KghSS8V', 'PSQI_Q1_BEDTIME_MID')

    print_diffs(220, survey)

    survey['LOC_MID'] = 0
    return survey

def uw_post_specific(survey):
    """Survey-specific cleaning for the UW post survey (suffix ``_POST``).

    Converts textual housing answers to numbers and adds a ``LOC_POST``
    column set to 0.
    """
    # Fix known bugs in data for Baseline4

    # Bug #1 clean up Housing answers, data validation wasn't in place and some were textual
    # this helper function converts them to numbers.
    print("cleaning up housing so it is numeric")
    cols = ['Housing_Q2_'+str(i)+'_POST' for i in [1,2,3,4,5]]
    # The last name used to start with a stray double quote, so the column was
    # never found and cleanup_vals skipped it silently.
    cols = cols + ['Housing_Q2_POST','Housing_Homenum_POST']
    print(cols)
    vals = {"None":0, "None ":0, "Older Sister, Younger Sister":2, "Mom and Dad":2, "2 (+1 sibling)":3, "80*":80,
       "80+":80, "Roomate":2, "Roomate ":2, "10+":10, "Roommate ":2}
    survey = cleanup_vals(survey, cols, vals)

    # XXX TODO
    # will need to clean PSQI_Q2_FALL -- how long does it take you to fall asleep. Sample answers to build a
    # solution from: ['5 min', '60 minutes','60+ minutes','15min','15-30','0-10 minutes','&lt;10 min',
    #                  '12:00am','10-15 mins','14mim']
    print("XXX TODO Bug in uw_post_specific")

    survey['LOC_POST']=0

    return survey

def uw_discrimination_specific(survey):
    """Add a ``LOC_EMA`` column set to 0 and return the same survey object."""
    location_column = 'LOC_EMA'
    survey[location_column] = 0
    return survey

def cmuII_post_specific(survey):
    """Survey-specific cleaning for the CMU II post survey.

    Removes one duplicate response each for PIDs 365 and 394 and adds a
    ``LOC_CMUII_POST`` column set to 1.
    """
    # duplicates [365 394]
    duplicate_responses = (
        # Answered twice (morning and evening); the first took twice as long and is kept.
        (365, 'R_7Pq7PhR9acvorCh'),
        # Answered on the 15th and 24th of May; the first (longer, both complete) is kept.
        (394, 'R_1gMA2O4n2xe0ri1'),
    )
    for pid, response_id in duplicate_responses:
        survey = remove_dup(pid, survey, response_id, 'duration_CMUII_POST')

    survey['LOC_CMUII_POST'] = 1
    return survey


def cmuII_baseline1_specific(survey):
    """Survey-specific cleaning for the CMU II baseline 1 survey.

    Prints the PIDs from the hard-coded list that do not appear in the survey,
    removes one duplicate response for PID 247 and adds a ``LOC_CMUII_B1``
    column set to 1.
    """
    # duplicates [247]
    # One of the two people entered the wrong ID by mistake. Listing the PIDs
    # that are missing from the survey may help find which ID was intended.
    expected_pids = [542,250,329,319,312,276,281,538,489,507,283,365,378,278,245,365,607,302,476,208,268,410,485,397,271,474,237,254,251,665,500,247,361,551,233,318,555,212,303,671,518,294,451,672,427,284,477,460,415,417,209,338,218,577,360,422,343,402,523,240,339,248,479,609,291,393,255,413,369,430,512,210,379,492,559,581,331,337,398,595,454,342,333,465,336,431,226,525,321,322,457,362,563,267,483,257,256,332,300,394,662,239,304,374,461,450,444,205,203,409,522,228,435,350,293,244,308,274,260,414,528,275,539,515,373,269,264,285,452,213,463,298,376,330,263,541,445,217,202,644,615,380,246,231,344,424,207,390,290,292,326,386,270,201,230,532,266,309,453,306,510,262,659,491,617,229,242,693,613,497,297,629,214,664,706,531,243,714,234,252,395,572,653,711,327,279,220,438,282,383,462,261,396,381,272,215,655,200,211,669,355,311,363,670,235,359,206,334,259,631,204,241,394,473,466,368,295,412,289,305,432,472,540,434,506,277,307,352,487,513,236,586,419,668,301,416,224,514,401,686,517,315,314]
    missing = np.setdiff1d(expected_pids, survey['PID'])
    print(missing)

    # Only this entry is removed for now; it is not known which one is right.
    # (Not applied: remove_dup(236, survey, 'R_O3dSSLF1JQtH0Yh', 'Housing_Q1_CMUII_B1'))
    survey = remove_dup(247, survey, 'R_1NbkalYT58DZhCQ', 'Housing_Q1_CMUII_B1')

    survey['LOC_CMUII_B1'] = 1
    return survey

def cmuII_baseline2_specific(survey):
    """Survey-specific cleaning for the CMU II baseline 2 survey.

    Removes one duplicate response each for PIDs 236, 259, 326 and 328 and
    adds a ``LOC_CMUII_B2`` column set to 1.
    """
    # duplicates [236 259 326 328]
    duplicate_responses = (
        (236, 'R_1g5sTRUNrZp7xKe'),  # this survey was less complete
        (259, 'R_2tEx8tnHygzVmTV'),  # this survey was less complete
        (326, 'R_1dF9iqQ9TWM4Exf'),  # was totally incomplete
        # much less complete, but it had MLEs that were not originally
        # reported. Consider merging?
        (328, 'R_bjbilLmmRd1AWop'),
    )
    for pid, response_id in duplicate_responses:
        survey = remove_dup(pid, survey, response_id, 'duration_CMUII_B2')

    survey['LOC_CMUII_B2'] = 1
    return survey



# Helper Functions for Creating Scales

In [None]:
def create_scales(config, dataconfig, surveys):
    """Compute every configured scale for every survey.

    Reads the scale configuration json named by ``dataconfig["scaleconfig"]``
    from ``config["configdir"]``, turns each entry of its "scale_functions"
    into a callable with eval, and runs calculate_scales on each survey listed
    in ``dataconfig["surveys"]``.

    Args:
        config: dict providing "configdir".
        dataconfig: dict providing "surveys", "scaleconfig" and a per-survey
            entry with "name".
        surveys: dict of survey key -> DataFrame; updated in place.

    Returns:
        The same ``surveys`` dict with the scale columns added.
    """
    configdir = config["configdir"]
    survey_names = dataconfig["surveys"]

    # Load the json file that has the scale info in it
    scale_path = configdir+dataconfig["scaleconfig"]
    with open(scale_path, 'r') as handle:
        scaleconfig = json.load(handle)

    scales = scaleconfig["scales"]
    functions = scaleconfig["scale_functions"]
    # each configured value is an expression naming a summary function
    for scale in functions:
        functions[scale] = eval(functions[scale])

    for survey in survey_names:
        print("calculating scales for ", survey)
        name = dataconfig[survey]["name"]
        scored = calculate_scales(scales, functions, surveys[survey], name)
        surveys[survey] = scored
        display(HTML(scored.head(n=50).to_html()))

    # XXX TODO check if we should add any of the baseline1 scales to Baseline4 before we run it.
    #(BFI, MFQ, SES)

    return surveys


## Helper functions for scale creation

# Merge Surveys

In [None]:
# A number of different summary function for different scales

#  Calculates many scales by looping through them all and running 
#  any that are valid for this data set
def calculate_scales(scales, scale_functions, df, survey_id):
    '''Run calculate_scale for every scale whose result column is not in df yet.

    scales maps a scale name to its item names; scale_functions maps a scale
    name to its summary function, with the 'Default' entry used as fallback.
    Result columns are named "<scale>_<survey_id>". Returns the DataFrame with
    the new columns added.
    '''
    fallback = scale_functions.get('Default')
    for scale_name, items in scales.items():
        fun = scale_functions.get(scale_name, fallback)
        print("function is: ", fun)
        print("scale is: ", scale_name)
        printD("Name is: ", scale_name)
        printD("items are: ", items)

        column = scale_name + "_" + str(survey_id)
        if column in df.columns:
            print(column, "already have, no calculation")
        else:
            printD("calculating scale")
            df = calculate_scale(fun, items, df, column, survey_id)
        print("Survey size: ", df.shape)
    return df

# The user can pass in a function for summarizing data across columns
# and this will calculate a result. 
# summary_fun is a function that takes as input the current summary scores for a particular
#              scale, the scores for the current item of that scale, and the total number of items
# scale_name is the name of the scale (to be looked up in global variable scales) TODO pass this in?
# survey_results is a set of survey results over which to calculate the score
# result_name is the name that the final summary for the scale should be given
# survey_id is which survey (e.g. B1 for baseline 1)
# normalize is whether to normalize on a 0-1 basis after calculating
# Returns a DataFrame containing the entire survey passed in, with the new column added (named result_name) 
def calculate_scale(summary_fun, scale_items, survey, result_name, survey_id, normalize=False):
    """Compute one scale and append it to the survey as a new column.

    Args:
        summary_fun: callable ``(scores, newitem, item_name, num_items)`` that
            folds one item column into the running scores and returns them.
        scale_items: item (column) names of the scale, without survey suffix.
        survey: DataFrame holding the item columns.
        result_name: name of the column to add.
        survey_id: suffix appended to every item name as "_<survey_id>",
            unless it is "ALL".
        normalize: when True, divide the scores by their maximum.

    Returns:
        A new DataFrame with ``result_name`` appended, or the survey unchanged
        when any item column is missing.
    """
    scores = pd.Series(np.zeros(survey.shape[0]), index=survey.index, name=result_name)
    print("calculating: " , result_name)
    printD(scale_items)
    num_items = len(scale_items)
    for item_name in scale_items:
        if survey_id != "ALL":
            item_name = item_name + "_" + survey_id
        if item_name not in survey.columns:
            # a scale with any missing item is skipped entirely
            print(item_name + " not in columns")
            return survey
        newitem = survey.loc[:, item_name]
        scores = summary_fun(scores, newitem, item_name, num_items)
        printD("item is: ", item_name)
        printD(newitem.value_counts())

    printD("adding",result_name,result_name,pd.Series(scores).value_counts())
    if normalize:
        # named `top` so the builtin max is not shadowed
        top = scores.max()
        # dividing by a zero (or missing) maximum would only produce NaN/inf,
        # so the scores are left as they are in that case
        if pd.notna(top) and top != 0:
            scores = scores / top
    survey = pd.concat((survey, scores.rename(result_name)), axis=1)
    printD(survey.head())
    return survey

# For scales that aren't defined yet, leave the running scores unchanged
# (the new item is ignored)
def noop(scores, newitem, item_name, num_scale_items):
    """Placeholder summary function: return ``scores`` unchanged, ignoring ``newitem``."""
    notice = " is not yet defined as a scale"
    printD(item_name, notice)
    return scores

# calculates the mean across all items
# expects as input a Series of scores (to be incrementally added to), the newitem (to add in), the item name, 
# and the number of total items in the scale being summarized
def summary_mean(scores, newitem, item_name, num_scale_items):
    """Add this item's share of the mean to the running scores.

    Negative answers count as 0; the item is divided by the total number of
    items in the scale before being added. Returns the updated scores.
    """
    # XXX TODO double check with daniela's code what to do with missing values
    print("XXX TODO for mean score calculation")
    non_negative = newitem.mask(newitem < 0, 0)
    share = non_negative / num_scale_items
    return scores.add(share, fill_value=0)

# adds together the scores on all the scale items
def summary_sum(scores, newitem, item_name, num_scale_items):
    """Add the item to the running scores.

    Negative answers count as 0, and any running total at or below -1 is
    reported as exactly -1. Returns the updated scores.
    """
    non_negative = newitem.mask(newitem < 0, 0)
    total = scores.add(non_negative, fill_value=0)
    return total.mask(total <= -1, -1)

# Combines scale items from different phases/locations (should have no overlap)
def summary_sum_merge(scores, newitem, item_name, num_scale_items):
    """Merge an item from another phase/location into the running scores.

    While the running scores still have a maximum of 0 (their initial state)
    the item itself is returned; afterwards the item is added to the scores.
    """
    print(item_name)
    print("values for", item_name, pd.Series(newitem).value_counts())

    # Scores start as all 0s, so a maximum of 0 means nothing has been merged yet
    nothing_merged_yet = max(scores) == 0
    if nothing_merged_yet:
       return newitem

    merged = scores.add(newitem,fill_value = 0)
    print(pd.Series(merged).value_counts())
    return merged
    
# when merging, a scale will have unknowns for all PIDs who weren't in that location
# def pick_valid(x, y): 
#     printD("x: ", x, "y: ", y)
#     if ((x is None or x<=-1) and (y is None or y<=-1)):
#         return -1
#     elif (y is None or y <=-1):
#         return x
#     elif (x is None or x <=-1):
#         return y
#     else:
#         return x + y

# adds together the scores on all the scale items that have 1 
def binary_sum(scores, newitem, item_name, num_scale_items):
    """Return 1 where the running total including this item reaches 1, else 0.

    Negative answers count as 0 before the item is added to the scores.
    """
    non_negative = newitem.mask(newitem < 0, 0)
    total = scores.add(non_negative, fill_value=0)
    return (total >= 1).astype("int64")
    
    
def BDI_Correct_range(scores, newitem, item_name, num_scale_items):
    """Rescale one BDI item to the 0-3 range and add it to the running scores.

    Items whose maximum is above 4 are mapped through an item-specific lookup
    table (unknown values become 0); items with a minimum of 1 or a maximum of
    4 are shifted down by one. While the running scores still have a maximum
    of 0 the rescaled item replaces them; afterwards it is added.

    Args:
        scores: running scores (Series).
        newitem: answers for this item (Series).
        item_name: column name; selects the lookup table by substring.
        num_scale_items: unused.

    Returns:
        The updated scores.
    """
    # CMU seems to have their data with values 5 and higher instead of 1 and higher
    # 21-items
    # Scaling 0-3

    print("Calculating BDI Scale ", item_name)
    print("min: ",min(newitem)," max: ",max(newitem))
    print(pd.Series(newitem).value_counts())

    item_max = max(newitem)

    # default lookup, overridden below for items coded differently
    rescale = {8: 3, 7:2, 5:0, 6:1}
    if "Pessimism" in item_name:
        rescale = {7: 3, 6:2, 5:0, 4:1}
    elif "Failure" in item_name:
        rescale = {8: 3, 6:2, 5:0, 4:1}
    elif "Suicid" in item_name:
        rescale = {9: 3, 8:2, 5:0, 6:1}
    elif ("Sleep" in item_name) or ("Appetite" in item_name):
        # these items have seven answer options folded onto 0-3
        if item_max == 7:
            rescale = {1:0, 2:1,3:1, 4:2,5:2,6:3,7:3}
        elif item_max == 11:
            rescale = {5:0, 6:1,7:1, 8:2,9:2,10:3,11:3}

    if item_max > 4: # need to rescale scores significantly
        newitem = newitem.map(lambda x: rescale.get(x,0))
    elif (min(newitem) == 1 or item_max == 4): # need to map from 4,3,2,1 to 3,2,1,0
        newitem = newitem.add(-1,fill_value = 0)

    print("min: ",min(newitem)," max: ",max(newitem))

    print("newitem",pd.Series(newitem).value_counts())
    # was printing newitem a second time under the "scores" label
    print("scores",pd.Series(scores).value_counts())

    # Scores are initialized to all 0s. So: if this is the first one we've seen, just use it
    if max(scores) == 0:
        scores = newitem
    else:
        scores = scores.add(newitem,fill_value = 0)

    return scores
    
def sum_with_reversals(scores, newitem, item_name, reverse_list, reverse_max):
    """Add an item to the running scores, reverse-scoring it when listed.

    Args:
        scores: running scores (Series).
        newitem: answers for this item (Series); negative answers count as 0.
        item_name: column name of the item.
        reverse_list: item names that must be reverse-scored.
        reverse_max: value the answer is subtracted from when reversing. May be
            a plain number (as every caller in this file passes) or a Series.

    Returns:
        The updated scores.
    """
    newitem = newitem.apply(lambda x: 0 if (x<0) else x)
    # NOTE(review): calculate_scale appends "_<survey_id>" to item names, while
    # the reverse lists in this file hold unsuffixed names, so this exact-match
    # test probably only succeeds when survey_id is "ALL" -- confirm.
    if item_name in reverse_list:
        if hasattr(reverse_max, "subtract"):
            # Series-like maximum: keep the original fill_value behaviour
            newitem = reverse_max.subtract(newitem, fill_value=0)
        else:
            # plain number: the original called .subtract on it and raised
            # AttributeError
            newitem = reverse_max - newitem  # XXXX NEED TO MAKE SURE 1-4 is correct here
    return scores.add(newitem, fill_value=0)


# adds together the scores on all the scale items. Reverses correct scales
def cesd(scores, newitem, item_name, num_scale_items):
    """Add one CES-D item to the running scores.

    Items with a minimum of 1 or a maximum of 4 are shifted down by one first.
    Items 4, 8, 12 and 16 are listed for reverse scoring (3 2 1 0 instead of
    0 1 2 3) and handed to sum_with_reversals with a maximum of 3.
    """
    print("cesd bug: currently not checking number of missing items. Should only allow 4")
    lowest, highest = min(newitem), max(newitem)
    print("min: ",lowest," max: ",highest)

    if lowest == 1 or highest == 4:
        newitem = newitem.subtract(1,fill_value = 0)

    print("min: ",min(newitem)," max: ",max(newitem))

    reversed_items = ["CES_D_4","CES_D_8","CES_D_12","CES_D_16"]
    scores = sum_with_reversals(scores, newitem, item_name, reversed_items, 3)

    print(pd.Series(newitem).value_counts())
    return scores



# adds together the scores on all the scale items. Reverses correct scales
def ucla(scores, newitem, item_name, num_scale_items):
    """Add one UCLA loneliness scale item to the running scores.

    Scale notes from the original author: 20 items, answers 1 (never) to
    4 (always), total is the sum of all items, higher means more lonely;
    items 1,5,6,9,10,15,16,19,20 are reverse scored.
    """
    printD(item_name)

    # These scale items need to be reversed
    reversed_items = [
        'UCLA_Q1_1', 'UCLA_Q1_5', 'UCLA_Q1_6', 'UCLA_Q1_9', 'UCLA_Q1_10',
        'UCLA_Q2_5', 'UCLA_Q2_6', 'UCLA_Q2_9', 'UCLA_Q2_10',
    ]
    print("min: ",min(newitem)," max: ",max(newitem))
    updated = sum_with_reversals(scores, newitem, item_name, reversed_items, 3)
    print(pd.Series(newitem).value_counts())
    return updated

# adds together the scores on all the scale items. Reverses correct scales
def isel(scores, newitem, item_name, num_scale_items):
    """Add one ISEL-12 item to the running scores.

    Scale notes from the original author: 12 items, answers 1-4, sum across
    all items with items 1, 2, 7, 8, 11, 12 reverse coded; appraisal subscale
    is items 2, 4, 6, 11. Items whose maximum is 8 are first mapped from 5-8
    onto 0-3 (unknown values become 0).
    """
    print(item_name)

    shifted_coding = {8: 3, 7:2, 6:1, 5:0}

    def to_zero_based(answer):
        return shifted_coding.get(answer, 0)

    if max(newitem) == 8: # need to rescale scores significantly
        newitem = newitem.map(to_zero_based)

    # These scale items need to be reversed
    reversed_items = ['KISEL_12_1','KISEL_12_2','KISEL_12_7','KISEL_12_8','KISEL_12_11','KISEL_12_12']
    print("min: ",min(newitem)," max: ",max(newitem))
    updated = sum_with_reversals(scores, newitem, item_name, reversed_items, 3)
    print(pd.Series(newitem).value_counts())
    return updated

# Scores one item of the PSS-10.
def pss(scores, newitem, item_name, num_scale_items):
    """Fold one PSS-10 item into the running scale score.

    PSS-10 scores are obtained by reversing the scores on the four
    positively stated items (4, 5, 7 and 8), e.g. 0=4, 1=3, 2=2, etc.,
    and then summing across all 10 items.

    scores          -- the running total built up from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns whatever sum_with_reversals returns for this item.

    NOTE(review): the scoring note above describes a 0..4 reversal, but
    the last argument passed to sum_with_reversals is 3 -- confirm what
    that argument means before relying on the reversed items.
    """
    # Column names of the positively stated (reverse-scored) items.
    reversed_items = ['KPSS_4', 'KPSS_5', 'KPSS_7', 'KPSS_8']

    # An item whose minimum is 1 or maximum is 5 is treated as coded
    # 1..5 and shifted down by one (correct for UW data).
    coded_from_one = (min(newitem) == 1 or max(newitem) == 5)
    if coded_from_one:
        newitem = newitem.sub(1, fill_value=0)

    lowest = min(newitem)
    highest = max(newitem)
    print("min: ", lowest, " max: ", highest)

    scores = sum_with_reversals(scores, newitem, item_name, reversed_items, 3)

    item_counts = pd.Series(newitem).value_counts()
    print(item_counts)
    return scores

def brs(scores, newitem, item_name, num_scale_items):
    """Fold one Brief Resilience Scale item into the running scale score.

    The scale has 6 items; items 2, 4 and 6 are reverse scored.

    scores          -- the running total built up from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns whatever sum_with_reversals returns for this item.
    """
    # Column names of the reverse-scored items.
    reversed_items = ['BRS_2', 'BRS_4', 'BRS_6']

    lowest = min(newitem)
    highest = max(newitem)
    print("min: ", lowest, " max: ", highest)

    scores = sum_with_reversals(scores, newitem, item_name, reversed_items, 3)

    item_counts = pd.Series(newitem).value_counts()
    print(item_counts)
    return scores

def sss(scores, newitem, item_name, num_scale_items):
    """Fold one 2-way Social Support Scale item into the running score.

    Scale notes kept from the original comments:
      * 21 items, scale 0 (not at all) to 5 (always)
      * subscales:
          receiving emotional support:    1, 4, 6, 10, 16, 18
          giving emotional support:       3, 7, 14, 19, 21
          receiving instrumental support: 5, 8, 11, 15
          giving instrumental support:    2, 9, 12, 17, 20
          receiving support (instrumental + emotional)
          giving support (instrumental + emotional)

    scores          -- the running total built up from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns whatever sum_with_reversals returns for this item.

    NOTE(review): the reverse list below carries two BRS_* names, which
    looks copied from brs() -- confirm which SSS items (if any) should
    be reverse scored.
    """
    reversed_items = ['SSS_Q1_1', 'BRS_4', 'BRS_6']

    lowest = min(newitem)
    highest = max(newitem)
    print("min: ", lowest, " max: ", highest)

    scores = sum_with_reversals(scores, newitem, item_name, reversed_items, 3)

    item_counts = pd.Series(newitem).value_counts()
    print(item_counts)
    return scores

# flags whether any item reports an event this year or this quarter
def mle_thisyear(scores, newitem, item_name, num_scale_items):
    """Flag participants who report an event on this or an earlier item.

    A response of 1 or 2 on the item counts as reporting an event.
    Anything else -- including negative missing-value codes and NaN --
    does not count. (Previously negative codes were clamped to 0 and
    then passed the ``x <= 2`` test, so non-answers were counted as
    events.)

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- not used by this function
    num_scale_items -- not used by this function

    Returns a Series that is 1 where this item or an earlier one
    reported an event, and 0 otherwise.
    """
    reported = newitem.apply(lambda x: 1 if (1 <= x <= 2) else 0)

    scores = scores.add(reported, fill_value=0)
    # collapse the running count back to a 0/1 flag
    scores = scores.apply(lambda x: 1 if (x >= 1) else 0)
    return scores

# flags whether any item reports an event this quarter
def mle_thisquarter(scores, newitem, item_name, num_scale_items):
    """Flag participants who report an event this quarter.

    Only a response of exactly 1 counts as an event. (The original note
    says that in Baseline 4 the question was changed to only ask about
    the last quarter.)

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item (debug output only)
    num_scale_items -- not used by this function

    Returns a Series that is 1 where this item or an earlier one
    reported an event, and 0 otherwise.
    """
    def _clamp_negative(value):
        return 0 if (value < 0) else value

    def _is_this_quarter(value):
        return 1 if (value == 1) else 0

    def _to_flag(total):
        return 1 if (total >= 1) else 0

    newitem = newitem.apply(_clamp_negative)

    printD("ml this quarter")
    printD(item_name)

    this_quarter = newitem.apply(_is_this_quarter)
    running = scores.add(this_quarter, fill_value=0)
    return running.apply(_to_flag)
    
# returns a 1 if anything matches engineering intent, and 0 otherwise
def isengineer(scores, newitem, item_name, num_scale_items):
    """Flag participants whose answers indicate an engineering intent.

    Two kinds of item contribute:
      * items whose name contains "ngineer": the item values are added
        to the running score as they are
      * items whose name contains "TEXT": the free-text answer counts
        when it mentions "engineer" or "computer"

    The free-text match is case-insensitive. Previously it looked for a
    lowercase "engineer", so an answer such as "Mechanical Engineering"
    was missed even though the item-name test ("ngineer") and the
    "omputer" test were written to ignore the case of the first letter.

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns a Series that is 1 where any item matched, and 0 otherwise.
    """
    printD("isengineer")
    if "ngineer" in item_name:
        scores = scores.add(newitem, fill_value=0)
    elif "TEXT" in item_name:
        def _mentions_engineering(answer):
            # str() keeps missing or numeric answers from raising
            text = str(answer).lower()
            return 1 if (("engineer" in text) or ("computer" in text)) else 0

        scores = scores.add(newitem.apply(_mentions_engineering), fill_value=0)
    printD("making binary")
    scores = scores.apply(lambda x: 1 if (x >= 1) else 0)
    return scores

# This is adjusted in 'scales' and  here
# returns a 1 if anything matches minority, and 0 otherwise
def isurm(scores, newitem, item_name, num_scale_items):
    """Flag participants who match any of the URM items.

    Two kinds of item contribute:
      * items whose name contains "frican", "Latin", "Islander" or
        "Native": a response of exactly 1 counts
      * items whose name contains "TEXT": the answer counts when it
        contains "racial"

    Free-text answers are converted with str() before matching, so a
    missing (NaN) or numeric answer no longer raises TypeError; it
    simply does not match.

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns a Series that is 1 where any item matched, and 0 otherwise.
    """
    if (("frican" in item_name) or ("Latin" in item_name)
            or ("Islander" in item_name) or ("Native" in item_name)):
        selected = newitem.apply(lambda x: 1 if (x == 1) else 0)
        scores = scores.add(selected, fill_value=0)
    elif "TEXT" in item_name:
        mentioned = newitem.apply(lambda x: 1 if ("racial" in str(x)) else 0)
        scores = scores.add(mentioned, fill_value=0)
    scores = scores.apply(lambda x: 1 if (x >= 1) else 0)
    return scores

# This is adjusted in 'scales' and  here
# returns a 1 if anything matches minority, and 0 otherwise
def isminority(scores, newitem, item_name, num_scale_items):
    """Flag participants who match any of the minority items.

    Two kinds of item contribute:
      * items whose name contains "Asian", "frican", "Latin",
        "Islander" or "Native": a response of exactly 1 counts
      * items whose name contains "TEXT": the answer counts when it
        contains "racial"

    Free-text answers are converted with str() before matching, so a
    missing (NaN) or numeric answer no longer raises TypeError; it
    simply does not match.

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns a Series that is 1 where any item matched, and 0 otherwise.
    """
    if (("Asian" in item_name) or ("frican" in item_name)
            or ("Latin" in item_name) or ("Islander" in item_name)
            or ("Native" in item_name)):
        selected = newitem.apply(lambda x: 1 if (x == 1) else 0)
        scores = scores.add(selected, fill_value=0)
    elif "TEXT" in item_name:
        mentioned = newitem.apply(lambda x: 1 if ("racial" in str(x)) else 0)
        scores = scores.add(mentioned, fill_value=0)
    scores = scores.apply(lambda x: 1 if (x >= 1) else 0)
    return scores

# This is adjusted in 'scales' and  here
# returns a 1 if anything matches minority or first generation, and 0 otherwise
def isminorityfirstgen(scores, newitem, item_name, num_scale_items):
    """Flag participants who match a minority item or a Firstgen item.

    Three kinds of item contribute:
      * items whose name contains "Minority", "Latin", "Islander" or
        "Native": a response of exactly 1 counts
      * items whose name contains "TEXT": the answer counts when it
        contains "racial"
      * items whose name contains "Firstgen": a response of exactly 1
        counts

    Two fixes relative to the earlier version:
      * the Firstgen item is now added to the running score instead of
        replacing it, so a flag set by an earlier item is no longer
        discarded and the result does not depend on item order
      * free-text answers are converted with str() before matching, so
        a missing (NaN) or numeric answer no longer raises TypeError

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns a Series that is 1 where any item matched, and 0 otherwise.
    """
    printD(item_name)
    if (("Minority" in item_name) or ("Latin" in item_name)
            or ("Islander" in item_name) or ("Native" in item_name)):
        selected = newitem.apply(lambda x: 1 if (x == 1) else 0)
        scores = scores.add(selected, fill_value=0)
    elif "TEXT" in item_name:
        mentioned = newitem.apply(lambda x: 1 if ("racial" in str(x)) else 0)
        scores = scores.add(mentioned, fill_value=0)
    if "Firstgen" in item_name:
        firstgen = newitem.apply(lambda x: 1 if (x == 1) else 0)
        scores = scores.add(firstgen, fill_value=0)

    scores = scores.apply(lambda x: 1 if (x >= 1) else 0)
    return scores

def sexuality(scores, newitem, item_name, num_scale_items):
    """Flag participants whose response on any item is 3 or higher.

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item (debug output only)
    num_scale_items -- not used by this function

    Returns a Series that is 1 where this item or an earlier one was
    flagged, and 0 otherwise.
    """
    def _at_least_three(value):
        return 1 if (value >= 3) else 0

    def _to_flag(total):
        return 1 if (total >= 1) else 0

    printD(item_name)

    flagged = newitem.apply(_at_least_three)
    running = scores.add(flagged, fill_value=0)
    return running.apply(_to_flag)

# flags everyone who is a first generation student
def isfirstgen(scores, newitem, item_name, num_scale_items):
    """Combine self-report and parent items into a first-generation flag.

    * an item whose name contains "Firstgen" replaces the running score:
      a response of 1 becomes 0, anything else becomes 1
    * an item whose name contains "mother" or "father" multiplies the
      running score by 0 when the response is above 3 or negative, and
      by 1 otherwise

    scores          -- running flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns the updated running flag.

    NOTE(review): the all-zero reset below also fires when an earlier
    item has legitimately zeroed every participant -- confirm that this
    cannot happen with the real data.
    """
    def _self_report(value):
        return 0 if (value == 1) else 1

    def _parent_keeps_flag(value):
        return 0 if (value > 3 or value < 0) else 1

    printD(item_name)

    # if scores are all zeros turn to 1 because we will
    # either replace scores or multiply against them
    total = scores.sum()
    if not total:
        scores = scores.apply(lambda x: 1)

    # self reported first gen status
    if "Firstgen" in item_name:
        scores = newitem.apply(_self_report)

    # first gen status based on parent
    is_parent_item = ("mother" in item_name) or ("father" in item_name)
    if is_parent_item:
        printD("parent is firstgen")
        printD(newitem)
        parent_factor = newitem.apply(_parent_keeps_flag)
        scores = scores * parent_factor

    printD(scores)
    return scores

 
# adjust scoring for employment so that 0 = not employed, 1 = part time and 2 = fulltime
def employment(scores, newitem, item_name, num_scale_items):
    """Shift the employment item down by one.

    Negative responses are first collapsed to -1, then 1 is subtracted
    from every response (missing responses are treated as 0 before the
    subtraction). The incoming running score is ignored; the result is
    computed from this item alone.

    scores          -- not used by this function
    newitem         -- the responses for the employment item
    item_name       -- not used by this function
    num_scale_items -- not used by this function

    Returns the shifted responses.
    """
    # XXX TODO double check with daniela's code what to do with missing values
    print("XXX TODO for employment score calculation")

    def _collapse_negative(value):
        return -1 if (value < 0) else value

    cleaned = newitem.apply(_collapse_negative)
    return cleaned.add(-1, fill_value=0)

# adds together scores that represent active use, and normalises to 1 or 0
def substance_sum(scores, newitem, item_name, num_scale_items):
    """Flag participants who report use on any substance item.

    * items whose name contains "Nico", "Mari" or "oke": a response of
      2 becomes 0 and anything else becomes 1
    * every other item (treated as alcohol): the response is shifted
      down by one

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item being added
    num_scale_items -- not used by this function

    Returns a Series that is 1 where the running total reaches 1 or
    more, and 0 otherwise.
    """
    def _to_flag(total):
        return 1 if (total >= 1) else 0

    printD(item_name)

    other_substance = any(tag in item_name for tag in ("Nico", "Mari", "oke"))
    if other_substance:
        printD("other substance ", item_name)
        contribution = newitem.apply(lambda x: 0 if (x == 2) else 1)
    else:
        printD("alcohol", item_name)
        contribution = newitem.add(-1, fill_value=0)

    running = scores.add(contribution, fill_value=0)
    return running.apply(_to_flag)

# Checks if a text item has any text in it (1) or not (0)
def hastext(scores, newitem, item_name, num_scale_items):
    """Flag participants who entered something on any text item.

    A response equal to the empty string counts as 0; every other value
    counts as 1.

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item (debug output only)
    num_scale_items -- not used by this function

    Returns a Series that is 1 where this item or an earlier one had
    text, and 0 otherwise.

    NOTE(review): a missing value (NaN) is not equal to '' and so counts
    as text -- presumably missing text is filled with '' upstream;
    confirm.
    """
    def _is_filled(value):
        return 0 if (value == '') else 1

    def _to_flag(total):
        return 1 if (total >= 1) else 0

    printD(item_name)

    filled = newitem.apply(_is_filled)
    running = scores.add(filled, fill_value=0)
    return running.apply(_to_flag)

# Changes a scale that has values (-1, 1) as options to (0,1)
def na_to_0(scores, newitem, item_name, num_scale_items):
    """Flag participants with a non-negative answer on any item.

    Missing responses are treated as -1; negative responses count as 0
    and everything else counts as 1.

    scores          -- running 0/1 flag accumulated from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item (debug output only)
    num_scale_items -- not used by this function

    Returns a Series that is 1 where this item or an earlier one was
    answered with a non-negative value, and 0 otherwise.
    """
    def _is_answered(value):
        return 0 if (value < 0) else 1

    def _to_flag(total):
        return 1 if (total >= 1) else 0

    # debug dump of the incoming state
    printD(item_name)
    printD("new item value counts")
    printD(newitem.value_counts())
    printD("scores item value counts")
    printD(scores.value_counts())
    printD(newitem)

    answered = newitem.fillna(-1).apply(_is_answered)
    running = scores.add(answered, fill_value=0)
    return running.apply(_to_flag)

def renumber_item(numbermap, default):
    """Build a scale function that recodes responses through numbermap.

    numbermap -- mapping from a raw response to its replacement value
    default   -- replacement used for responses missing from numbermap

    Returns a callable with the usual scale-function signature
    (scores, newitem, item_name, num_scale_items) that forwards to
    renumber with the prepared lookup table.
    """
    lookup = defaultdict(lambda: default, numbermap)

    def _renumber_with_lookup(scores, newitem, item_name, num_scale_items):
        return renumber(scores, newitem, item_name, num_scale_items, lookup)

    return _renumber_with_lookup

def renumber(scores, newitem, item_name, num_scale_items, numbermap):
    """Recode an item through numbermap and add it to the running score.

    scores          -- the running total built up from earlier items
    newitem         -- the responses for the item being added
    item_name       -- the column name of the item (debug output only)
    num_scale_items -- not used by this function
    numbermap       -- mapping indexed with each raw response to get
                       its replacement value

    Returns the running total with the recoded item added.
    """
    # debug dump of the incoming state
    printD(item_name)
    printD("new item value counts")
    printD(newitem.value_counts())
    printD("scores item value counts")
    printD(scores.value_counts())
    printD(newitem)

    def _recode(value):
        return numbermap[value]

    recoded = newitem.apply(_recode)
    return scores.add(recoded, fill_value=0)

# adds one if they use media daily or more often
# def media_sum(scores, newitem, item_name, num_scale_items):
#     printD(item_name)
    
#     newitem = newitem.apply(lambda x: 1 if (x==2 or x==1) else 0)
        
#     scores = scores + newitem
#     scores = scores.apply(lambda x: 1 if(x>=1) else 0)
#     return scores

In [None]:
def merge_surveys(config, dataconfig, mergeconfig, surveys):
    """Combine the loaded surveys into a single DataFrame.

    Each survey is narrowed to the columns named in
    mergeconfig["mergecols"], its index is rewritten to
    "<index>_<suffix>", and it is outer-joined on that index into the
    running result. Columns that occur on both sides of a join receive
    pandas' default "_x"/"_y" suffixes.

    The join is always an outer join on the suffixed index. The earlier
    version also passed on="PID" next to left_index/right_index, a
    combination pandas rejects with MergeError, so merging a second
    survey always failed.

    config      -- not used by this function
    dataconfig  -- not used by this function
    mergeconfig -- dict with "mergecols" (its keys are the columns to
                   keep) and "surveys" (suffix -> list of survey names)
    surveys     -- dict mapping a survey name to its DataFrame

    Returns the merged DataFrame (empty when there is nothing to merge).
    """
    # (substring of the survey name, suicidality column to summarise)
    suicidality_columns = (
        ("cmuII_post", "BDI_II_Suicidality_CMUII_POST"),
        ("cmuII_baseline2", "BDI_II_Suicidality_CMUII_B2"),
        ("uw_baseline2", "BDI_II_Suicidality_B2"),
        ("uw_post", "BDI_II_Suicidality_POST"),
        ("uw_mid", "BDI_II_Suicidality_MID"),
    )

    # load the config info for how to merge
    cols = list(mergeconfig["mergecols"])
    surveys_by_suffix = mergeconfig["surveys"]
    print("surveys to merge together")
    print(surveys_by_suffix)

    # This will hold the final result
    df = pd.DataFrame()

    # loop through the surveys to merge
    for suffix, names in surveys_by_suffix.items():
        for survey_name in names:
            # Get the survey
            right = surveys[survey_name]
            print(survey_name," ",right.shape,"-------------------------------------------------")

            print("printing out suicidality scores for ", survey_name)
            for tag, column in suicidality_columns:
                if tag in survey_name:
                    try:
                        counts = getattr(right, column).value_counts()
                    except AttributeError:
                        # the survey does not carry this column
                        print("")
                    else:
                        print(counts)
                    break

            # narrow to columns of interest
            useful_cols = right.columns.intersection(cols)
            right = right.loc[:, useful_cols]
            print(useful_cols)

            # convert the index to specify a suffix
            right.index = right.index.astype(str) + "_" + suffix

            # Merge into the global merge
            if df.shape[0] == 0:
                df = right.copy()
            else:
                df = pd.merge(df, right, left_index=True, right_index=True,
                              how="outer")
    return df

def merge_scales(config, dataconfig, df):
    """Calculate the post-merge scales on the merged DataFrame.

    Reads the scale configuration JSON named by
    dataconfig["scaleconfig"] from config["configdir"], resolves each
    entry of its "scale_functions" table to a callable, and passes the
    "post_merged_scales" definitions to calculate_scales.

    config     -- dict providing "configdir"
    dataconfig -- dict providing "scaleconfig" (the JSON file name)
    df         -- the merged survey DataFrame

    Returns the DataFrame returned by calculate_scales; its first ten
    rows are also displayed as HTML.
    """
    print(df.shape)
    # calculate the post merge scales, if any
    # Load the json file that has the scale info in it
    scale_path = config["configdir"] + dataconfig["scaleconfig"]
    # explicit encoding so the result does not depend on the platform
    # default; the with-block closes the file on its own
    with open(scale_path, 'r', encoding="utf-8") as file_obj:
        scaleconfig = json.load(file_obj)

    scales = scaleconfig["post_merged_scales"]
    print(scales)

    # SECURITY NOTE: each entry is a Python expression taken from the
    # config file and evaluated as code. Only use config files that are
    # trusted.
    functions = {}
    for scale, function in scaleconfig["scale_functions"].items():
        functions[scale] = eval(function)

    df = calculate_scales(scales, functions, df, 'ALL')

    display(HTML(df.head(n=10).to_html()))

    return df

# assigns the labels from the merge config using categorical types
def merge_label_items(mergeconfig, df):
    """Turn coded columns into labelled categorical columns.

    For every column of df that has a non-empty {label: code} dict under
    mergeconfig["mergecols"], each code in the column is replaced by its
    label and the column becomes a pandas Categorical whose categories
    are the labels in config order. Values with no matching code
    (including missing values) become NaN. Columns without labels are
    left unchanged.

    The earlier version passed the label codes to
    Categorical.from_codes (which expects one category position per
    row) and then passed the resulting object to astype (which expects
    a dtype), so it could not work.

    mergeconfig -- dict whose "mergecols" entry maps a column name to a
                   {label: code} dict
    df          -- the DataFrame to label; modified in place

    Returns df.
    """
    label_config = mergeconfig.get("mergecols", {})

    # create categorical variables and assign labels
    for col in df.columns:
        labels = label_config.get(col)
        if not isinstance(labels, dict):
            print("No labels for: ", col)
            continue
        if not labels:
            # an empty label table means there is nothing to assign
            continue

        categories = list(labels.keys())
        codes = list(labels.values())
        print("has labels...", col)
        print("categories: ", categories)
        print(codes)

        # invert the config table: code -> label
        code_to_label = {code: label for label, code in labels.items()}
        labelled = df[col].map(lambda value: code_to_label.get(value))
        df[col] = pd.Categorical(labelled, categories=categories)

    return df

# Splits on the final data

In [None]:
def generate_datasets(dataconfig, df):
    """Split the final data into the named subsets defined in the config.

    dataconfig -- dict whose "datasets" entry maps a dataset name to a
                  DataFrame.query expression
    df         -- the full DataFrame

    Returns a dict holding df itself under "all" plus one queried
    DataFrame per configured dataset. The first five rows of each
    queried dataset are displayed as HTML.
    """
    subsets = {"all": df}
    for label, expression in dataconfig["datasets"].items():
        subsets[label] = df.query(expression)
        print("name ", label)
        preview = subsets[label].head(5)
        display(HTML(preview.to_html()))

    return subsets

# Graveyard

In [None]:
# Text columns whose missing values should become the empty string ''.
# Order and contents are kept as they were, including the SES_7..SES_16
# names, which are listed twice.
all_text = [
    # ID and SES_* columns
    'ID',
    'SES_7', 'SES_8', 'SES_9', 'SES_10', 'SES_11',
    'SES_12', 'SES_13', 'SES_14', 'SES_15', 'SES_16',
    # Contacts_* columns
    'Contacts_2_TEXT', 'Contacts_3_TEXT', 'Contacts_4_TEXT', 'Contacts_5_TEXT',
    'Contacts_6_TEXT', 'Contacts_7_TEXT', 'Contacts_8_TEXT', 'Contacts_9_TEXT',
    'Contacts_10_TEXT', 'Contacts_11_TEXT', 'Contacts_12_TEXT', 'Contacts_13_TEXT',
    'Contacts_14_TEXT', 'Contacts_15_TEXT', 'Contacts_16_TEXT', 'Contacts_17_TEXT',
    'Contacts_18_TEXT', 'Contacts_19_TEXT', 'Contacts_20_TEXT', 'Contacts_21_TEXT',
    # phone, orientation, college/major, housing, language and race columns
    'phoneProvider_TEXT',
    'Orientation_Other', 'Orientation_TEXT',
    'College_Other_TEXT', 'Major_DirectAdmit_TEXT',
    'Major_Proposed_TEXT', 'Major_Second_TEXT',
    'Major_1_TEXT', 'Major_2_TEXT', 'Major_3_TEXT',
    'Housing_Q1_TEXT', 'UsualLang',
    'Race_TEXT',
    # PSQI_* columns
    'PSQI_Q1_BEDTIME', 'PSQI_Q2_FALL', 'PSQI_Q3_MORNTIME', 'PSQI_Q4_HOURS',
    'PSQI_Q5_10_TEXT',
    # SES_* columns (second occurrence)
    'SES_7', 'SES_8', 'SES_9', 'SES_10', 'SES_11',
    'SES_12', 'SES_13', 'SES_14', 'SES_15', 'SES_16',
    # UWEXP_*, MSLQ_*, MLE_*, Sports_* and SM_Usage_* columns
    'UWEXP_Q9_TEXT', 'UWEXP_Services_Other_TEXT',
    'MSLQ_Q1_TEXT', 'MLE_Q2_TEXT', 'MLE_Q5_2_TEXT',
    'Sports_Q2_6_TEXT', 'Sports_Q4_6_TEXT', 'Sports_Q7_14_TEXT',
    'SM_Usage_Q1_5_TEXT', 'SM_Usage_Q2_5_TEXT',
    # CSU_* columns
    'CSU_Smoke_Q1C_TEXT', 'CSU_Smoke_Q1B_TEXT',
    'CSU_Smoke_Q1D_TEXT', 'CSU_Smoke_Q1E_TEXT',
    'CSU_Nico_Q2A_TEXT', 'CSU_Nico_Q2B_TEXT',
    'CSU_Alch_Q3B_TEXT', 'CSU_Alch_Q4A_TEXT', 'CSU_Alch_Q5_TEXT',
    'CSU_Alch_Q6_TEXT', 'CSU_Alch_Q7_TEXT',
    'CSU_Mari_Q8A_TEXT',
    # remaining columns
    'QID124_TEXT_fbb53c314b4b4eb3a88a8ed0Topics',
    'FinalReflection',
]