# Initializations

This notebook works better if you run

```conda install -c conda-forge jupyter_contrib_nbextensions``` 

on your command line. Then find the nbextensions configuration screen (available at the bottom of your ```Edit``` menu)
and turn on ```codefolding```, ```table of contents```, and ```initialization cells```.

In [None]:
import pandas as pd
import numpy as np
import io, os, sys, types
from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell
from IPython.core.display import HTML
from scipy.stats import ttest_rel
import matplotlib.pyplot as plt
import copy
import json
import csv
import ast
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Enable inline matplotlib rendering. `InteractiveShell.magic()` is deprecated in IPython;
# `run_line_magic(name, args)` is the supported programmatic equivalent of `%matplotlib inline`.
get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Execute the helper notebooks inside this kernel so that the names they define become
# available to the cells below.
# NOTE(review): the helper functions called throughout this notebook (load_surveys, create_scales,
# merge_surveys_by_row, ...) are not defined here — presumably they come from these two notebooks.
%run Helpers.ipynb
%run LoadData.ipynb
# Presumably a smoke test defined in one of the notebooks above — TODO confirm.
hello_world()
# Global debug flag; nothing in the visible cells reads it.
DEBUG=False

In [None]:
# Per-user settings: where the config files and the survey data live.
with open("userconfig.json") as user_config_file:
    config = json.load(user_config_file)
filedir, surveydir = config["configdir"], config["surveydir"]

# Data loading / cleaning settings (original note: this is currently v1 and may change
# if things like the non-response cutoff are tweaked).
with open(filedir + "dataconfig.json") as data_config_file:
    dataconfig = json.load(data_config_file)

## Load All surveys

In [None]:
# Load every survey and clean it (per the original note). load_surveys is defined outside this
# notebook; the cells below treat its result as a mapping of survey name -> DataFrame.
clean_surveys = load_surveys(config, dataconfig)

In [None]:
# Persist each cleaned survey as <surveydir><name>_cleaned.csv.
# The index is not written, so PID must be a regular column here for the reload cell
# below (which calls set_index("PID")) to work.
for survey_name, survey_frame in clean_surveys.items():
    survey_frame.to_csv(surveydir + survey_name + "_cleaned.csv", index=False)

In [None]:
# Reload the cleaned surveys from disk so merging can start from the saved CSVs.
# Each frame is indexed by PID and also keeps PID as its last column.
clean_surveys = {}
for survey_name in ('uw_baseline1', 'uw_baseline2', 'uw_mid', 'uw_post', 'uw_discrimination',
                    'cmuII_baseline1', 'cmuII_baseline2', 'cmuII_post'):
    survey_frame = pd.read_csv(surveydir + survey_name + "_cleaned.csv").set_index("PID")
    survey_frame["PID"] = survey_frame.index
    clean_surveys[survey_name] = survey_frame

## Calculate Scales

In [None]:
# Compute the scale columns for each survey. A deep copy is passed so that clean_surveys itself
# is left untouched whatever create_scales (defined outside this notebook) does to its argument.
scaled_surveys = create_scales(config, dataconfig, copy.deepcopy(clean_surveys))

In [None]:
# Persist each scaled survey as <surveydir><name>_with_scale.csv (index not written).
for survey_name, scaled_frame in scaled_surveys.items():
    scaled_frame.to_csv(surveydir + survey_name + "_with_scale.csv", index=False)

In [None]:
# Reload the scaled surveys from disk so merging can start from the saved CSVs.
# Each frame is indexed by PID and also keeps PID as its last column.
scaled_surveys = {}
for survey_name in ('uw_baseline1', 'uw_baseline2', 'uw_mid', 'uw_post', 'uw_discrimination',
                    'cmuII_baseline1', 'cmuII_baseline2', 'cmuII_post'):
    scaled_frame = pd.read_csv(surveydir + survey_name + "_with_scale.csv").set_index("PID")
    scaled_frame["PID"] = scaled_frame.index
    scaled_surveys[survey_name] = scaled_frame

## Merge into a single dataframe by row

In [None]:
# Merge the scaled surveys row-wise (original note: columns that appear in only one survey are
# allowed to be null in the others) and export the combined, UW-only and CMU-only tables.

configdir = config["configdir"]
print(configdir)

# The merge and scale configuration file names are listed in dataconfig. The `with` blocks close
# the files on exit, so no explicit close() is needed.
with open(configdir+dataconfig["mergeconfig"], 'r') as file_obj:
    mergeconfig = json.load(file_obj)
with open(configdir+dataconfig["scaleconfig"], 'r') as file_obj:
    scaleconfig = json.load(file_obj)

# A deep copy is passed so scaled_surveys stays intact for the column-wise merge that follows.
merged, merged_uw, merged_cmu = merge_surveys_by_row(config, dataconfig, mergeconfig, copy.deepcopy(scaled_surveys))
merged.to_csv(surveydir+"merged_by_row_all.csv")
merged_uw.to_csv(surveydir+"merged_by_row_uw.csv")
merged_cmu.to_csv(surveydir+"merged_by_row_cmu.csv")

## Merge into a single dataframe by column

In [None]:
# Create an exportable / analyzable summary table: merge the scaled surveys column-wise,
# preview it, pass it through merge_scales, and write it to dataconfig["mergedfile"].
# (Original note: "merges baseline 1 and baseline 3" — which surveys are actually combined is
# decided by merge_surveys_by_column / mergeconfig; TODO confirm.)
# NOTE(review): this rebinds `merged`, replacing the row-wise table built in the previous cell.
# NOTE(review): scaled_surveys is passed without a deepcopy here (unlike the row-wise merge) —
# confirm the helper does not mutate it.


merged = merge_surveys_by_column(config, dataconfig, mergeconfig, scaled_surveys)
# Preview the first 10 rows; this happens before the index is named and before merge_scales runs.
display(HTML(merged.head(n=10).to_html()))
merged.index.name = "PID"
merged = merge_scales(config, dataconfig, merged)
# The index is written too, so PID becomes the first column of the exported file.
merged.to_csv(surveydir+dataconfig["mergedfile"])

# TODO (original note): the default for discrimination columns should be nan => 0,
# but it is currently being set to -1.
# print("TODO: determine whether discrimination nans need to be set to 0 and how to do that")


## Load Merged Data from CSV

In [None]:
# Reload the configuration and the merged survey table from disk, so the analysis can start
# from this cell without re-running the loading / cleaning / merging cells above.
#%run Helpers.ipynb
#%run LoadData.ipynb

# The `with` blocks close each file on exit, so no explicit close() is needed.
with open("userconfig.json", 'r') as file_obj:
    config = json.load(file_obj)
filedir = config["configdir"]

# Load the json files that have the data loading and cleaning info.
# this is currently v1 and that can change if we make tweaks to things
# like the cutoff for non-response
with open(filedir+config["dataconfig"], 'r') as file_obj:
    dataconfig = json.load(file_obj)
with open(filedir+dataconfig["mergeconfig"], 'r') as file_obj:
    mergeconfig = json.load(file_obj)

# Read back the table exported by the column-wise merge. No index column is specified,
# so PID comes back as a regular column.
file = config["surveydir"]+dataconfig["mergedfile"]
merged = pd.read_csv(file)

display(HTML(merged.head(5).to_html()))

## Generate population specific data files

In [None]:
# Re-read the data loading / cleaning settings (original note: this is currently v1 and may
# change if things like the non-response cutoff are tweaked).
# NOTE(review): the file name is hard-coded here, while the previous cell takes it from
# config["dataconfig"] — confirm both point at the same file.
with open(filedir + "dataconfig.json", 'r') as dataconfig_file:
    dataconfig = json.load(dataconfig_file)

# Split the merged table on the LOC_ALL flag: rows with 1 form the CMU frame, rows with 0 the UW frame.
merged_cmu = merged.loc[merged["LOC_ALL"] == 1]
merged_uw = merged.loc[merged["LOC_ALL"] == 0]

# Aliases for the three populations (no copies are made).
df_all, df_cmu, df_uw = merged, merged_cmu, merged_uw

print("------------generating datasets for all")
datasets = generate_datasets(dataconfig, df_all)

print("------------generating datasets for CMU")
cmu_datasets = generate_datasets(dataconfig, df_cmu)

print("------------generating datasets for UW")
uw_datasets = generate_datasets(dataconfig, df_uw)

# Contact List Extraction

## PID - contact type - contact num

In [None]:
# Survey column names for the contact slots 2..21: "Contacts_<n>" holds the contact-type code and
# "Contacts_<n>_TEXT" the matching free-text entry (read below as the contact's number).
contact_type = ["Contacts_" + str(slot) for slot in range(2, 22)]
contact_num = [type_column + "_TEXT" for type_column in contact_type]

In [None]:
# Maps the numeric code found in the contact-type columns to a relationship label.
# Only these four codes are mapped; code that indexes this dict directly raises KeyError
# for any other value.
contact_map = {1.0:"Family", 2.0:"Local Friend", 3.0: "Out-of-town Friend", 11.0 : "Other"}

In [None]:
def _normalize_contact_number(raw_number):
    """Return ``raw_number`` as text with spaces, parentheses, dashes and plus signs removed.

    Missing values (NaN/None) yield "" so that callers can skip them; without this check
    ``str(NaN)`` would produce the literal text "nan". Integer-valued floats (what pandas
    produces for a digits-only column that also contains blanks) are rendered without the
    trailing ".0".
    """
    if pd.isna(raw_number):
        return ""
    if isinstance(raw_number, float) and raw_number.is_integer():
        raw_number = int(raw_number)
    text = str(raw_number)
    for symbol in (" ", "(", ")", "-", "+"):
        text = text.replace(symbol, "")
    return text


def extract_contact_list(survey):
    """Build a long-format contact table from one survey DataFrame.

    For every row and every contact slot whose type column (``contact_type``) is filled in,
    the matching entry of the ``contact_num`` column is normalized and recorded together with
    the row's index label and the label looked up in ``contact_map``.

    Parameters:
        survey: DataFrame containing all ``contact_type`` and ``contact_num`` columns; its
            index labels are used as the PID values.

    Returns:
        DataFrame with columns ["PID", "Type", "Number"]; slots with a missing type or an
        empty/missing number are omitted.

    Raises:
        KeyError: if a type code is not a key of ``contact_map``.
    """
    type_codes = survey[contact_type]
    raw_numbers = survey[contact_num]
    records = []
    for row_pos, (pid, codes) in enumerate(type_codes.iterrows()):
        for col_pos, code in enumerate(codes):
            if pd.isna(code):
                continue
            # contact_type and contact_num are parallel lists, so the same position
            # addresses the number that belongs to this type column.
            number = _normalize_contact_number(raw_numbers.iloc[row_pos, col_pos])
            if number == "":
                continue
            records.append([pid, contact_map[code], number])
    return pd.DataFrame(records, columns=["PID", "Type", "Number"])


# One contact table per UW survey wave.
contact_list_pre = extract_contact_list(clean_surveys["uw_baseline2"])
contact_list_mid = extract_contact_list(clean_surveys["uw_mid"])
contact_list_post = extract_contact_list(clean_surveys["uw_post"])

In [None]:
# Drop repeated (PID, Type, Number) entries within each wave, then stack the three waves and
# drop entries repeated across waves. The stacked frame keeps the per-wave index labels.
contact_list_pre, contact_list_mid, contact_list_post = [
    wave.drop_duplicates(["PID", "Type", "Number"])
    for wave in (contact_list_pre, contact_list_mid, contact_list_post)
]
contact_list = pd.concat(
    [contact_list_pre, contact_list_mid, contact_list_post]
).drop_duplicates(["PID", "Type", "Number"])

In [None]:
# Export the combined contact list (PID, Type, Number); the index is not written.
contact_list.to_csv(surveydir + "contact_list.csv",index = False)

In [None]:
# Reload the exported contact list. Number is read as text: with dtype inference a digits-only
# column becomes int (dropping leading zeros) or float (rendering as "<digits>.0" once any value
# is missing), which would change the string that is hashed into `trace` below.
contact_list = pd.read_csv(surveydir + "contact_list.csv", dtype={"Number": str})

## PID - device id - number - hash value

In [None]:
# Sensor export file names; these are resolved relative to surveydir when loaded.
file_call = "calls.csv"
file_message = "messages.csv"
# JSON file mapping each PID to its list of device ids. It is opened as given (not relative to
# surveydir). The location can be overridden with a "pid_device_file" entry in userconfig.json;
# the fallback is the original machine-specific path.
file_pid = config.get(
    "pid_device_file",
    "C:/Users/orson/Desktop/Myself/HCI/UWiSchool/Projects/UWEXP/Code/script-input/sensors/pid_device_participants-180630.json",
)
# file_plugin = "sefidgar_12_plugin_contacts.xlsx"
file_device = "aware_device.csv"

In [None]:
# Load the sensor exports from surveydir.
df_device = pd.read_csv(surveydir + file_device)
# df_plugin = pd.read_excel(file_plugin)
df_call = pd.read_csv(surveydir + file_call)
df_messages = pd.read_csv(surveydir + file_message)
# Despite the df_ prefix this is the parsed JSON object, not a DataFrame; the next cell iterates
# it with .items(), treating it as a dict of PID (string) -> list of device ids.
with open(file_pid, "r") as f:
    df_pid = json.load(f)

In [None]:
# Remove rows that are exact duplicates across all columns.
df_messages = df_messages.drop_duplicates()
df_call = df_call.drop_duplicates()

In [None]:
# Forward map: PID (as int) -> list of device ids, straight from the JSON mapping.
dic_pid_to_deviceid = {int(pid): devices for pid, devices in df_pid.items()}

# Reverse map: device id -> PID (as int). If a device id is listed under several PIDs,
# the PID that appears last in the mapping wins.
dic_deviceid_to_pid = {
    device: int(pid)
    for pid, devices in df_pid.items()
    for device in devices
}

# Participants with contacts but no known device get an empty device list.
for pid in contact_list["PID"].tolist():
    dic_pid_to_deviceid.setdefault(pid, [])

# Device ids seen in the messages table but missing from the mapping get the placeholder PID -1.
for device in set(df_messages.device_id):
    dic_deviceid_to_pid.setdefault(device, -1)

In [None]:
# Attach the participant id to every message row. Device ids that were missing from the mapping
# were given -1 in the cell above, so they map to -1 here.
df_messages["PID"] = df_messages["device_id"].map(dic_deviceid_to_pid)

In [None]:
import hashlib
def generate_potential_sha1_output(num):
    """Return SHA-1 hex digests for several written layouts of a number.

    The number is split into its first three characters, the next three, and the remainder,
    and rendered in five layouts (plain, dash-separated, "(xxx) xxx-rest", space-separated,
    "(xxx)xxx-rest"), followed by the same five layouts prefixed with "+1 ".

    Parameters:
        num: the number as a string of characters; non-string values (for example an int
            read back from a CSV file) are converted with ``str`` first.

    Returns:
        List of 10 hexadecimal SHA-1 digests, in the layout order described above.
    """
    num = str(num)  # slicing below needs text; ints would otherwise raise TypeError
    first3, mid3, rest = num[0:3], num[3:6], num[6:]
    layouts = [
        first3 + mid3 + rest,
        first3 + "-" + mid3 + "-" + rest,
        "(" + first3 + ") " + mid3 + "-" + rest,
        first3 + " " + mid3 + " " + rest,
        "(" + first3 + ")" + mid3 + "-" + rest,
    ]
    layouts += ["+1 " + layout for layout in layouts]
    return [hashlib.sha1(layout.encode()).hexdigest() for layout in layouts]
# Disabled alternative: store the digests of every candidate layout instead of a single hash.
# contact_list["hash_value"] = contact_list["Number"].apply(generate_potential_sha1_output)
# Hash each contact number with SHA-1 into a `trace` column, which is the join key used against
# the messages table below.
# NOTE(review): the value is hashed as str(x), so a numeric Number hashes its Python text form
# (a float renders with a trailing ".0") — confirm this matches how df_messages.trace was produced.
contact_list["trace"] = contact_list["Number"].apply(lambda x : hashlib.sha1(str(x).encode()).hexdigest())

In [None]:
# Left-join the contact list onto the messages by participant and hashed number: every message
# row is kept, and Type / Number are filled in only where a contact matches.
# NOTE(review): contact_list is de-duplicated on (PID, Type, Number), so the same number listed
# under two types would duplicate the matching message rows — confirm whether that can occur.
df_messagesss = df_messages.merge(contact_list, how = "left", on = ["PID", "trace"])

In [None]:
# Keep only the identifying columns and the matched contact information.
df_messagesss_extacted = df_messagesss[["PID","device_id","trace","Type", "Number"]]

In [None]:
# Export the result without the index. Note the file is tab-separated despite its .csv extension.
df_messagesss_extacted.to_csv(surveydir + "messages_merged_type.csv", index = False,sep = "\t")

# Checking Scores against qualtrics