In [12]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
f = open('../json_datasets/api_coding.json')
api_coding = json.load(f)
f.close()

f = open('../json_datasets/api_codinginstance.json')
api_codinginstance = json.load(f)
f.close()

f = open('../json_datasets/api_policy.json')
api_policy = json.load(f)
f.close()

f = open('../json_datasets/api_policyinstance.json')
api_policyinstance = json.load(f)
f.close()

f = open('../json_datasets/api_timingsession.json')
api_timingsession = json.load(f)
f.close()

### Stats on api_coding

In [3]:
keys = []
ids = []
parents = []
created_dts = []
question_types = []

for coding in api_coding: 
    for key in coding.keys():
        keys.append(key)
        
    ids.append(coding['id'])
    parents.append(coding['parent'])
    created_dts.append(coding['created_dt'])
    questions = json.loads(coding['questions'])
    
    for question in questions: 
        q_type = question['question']
        
        if type(q_type) is list:
            q_type = ' '.join(q_type)
            
        question_types.append(q_type)
    
keys = set(keys)
# print(keys)

question_types = set(question_types)
# print(question_types)

In [4]:
num_parents = sum([1 for p in parents if p != None])
# print(num_parents)

if num_parents > 0: 
    parents = set(parents)

In [5]:
from datetime import datetime
dates = []

for date in created_dts: 
    d = date.partition(' ')[0]
    d = d.replace('-', '/')
    dates.append(d)

In [7]:
coding_data = {'Question Type' : list(question_types)}

coding_labels = []
for id in ids: 
    coding_label = 'coding_' + str(id)
    coding_labels.append(coding_label)
    coding_data[coding_label] = np.full((len(question_types),), 'no')

In [8]:
coding_df = pd.DataFrame(coding_data,
                   columns=['Question Type'] + coding_labels)

In [9]:
for coding in api_coding: 
    id = coding['id']
    col = 'coding_' + str(id)
    
    questions = json.loads(coding['questions'])
    
    for question in questions: 
        q_type = question['question']
        
        if type(q_type) is list:
            q_type = ' '.join(q_type)
            
        coding_df.loc[(coding_df['Question Type'] == q_type), col] = 'yes'

In [601]:
print(f"""
Total number of codings in the dataset: {len(api_coding)}
Total number of codings with parents in the dataset: {num_parents}
""")

if num_parents > 0: 
    print(f"Coding parents in the dataset: {parents}")

print(f"""
Coding date range: {min(dates)} - {max(dates)}
Total number of question types: {len(question_types)}
""")

coding_df.to_csv('coding.csv')


Total number of codings in the dataset: 11
Total number of codings with parents in the dataset: 0


Coding date range: 2020/05/22 - 2020/10/19
Total number of question types: 218



### api_policy & api_policyinstance

In [180]:
policy_data = {'policy_id': [], 'company_name': []}

for policy in api_policy: 
    policy_data['policy_id'].append(policy['id'])
    policy_data['company_name'].append(policy['company_name'])
    
policy_df = pd.DataFrame(policy_data, columns=policy_data.keys())
#policy_df

In [181]:
pi_data = {'id': [], 'policy_id': []}

for pi in api_policyinstance: 
    pi_data['id'].append(pi['id'])
    pi_data['policy_id'].append(pi['policy_id'])
    
pi_df = pd.DataFrame(pi_data, columns=pi_data.keys())
#pi_df

In [560]:
pi_mapping = pd.merge(pi_df, policy_df, on='policy_id', how='inner')
pi_mapping.index = pi_mapping['id']

pi_mapping

Unnamed: 0_level_0,id,policy_id,company_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20,20,54,match.com
21,21,55,Google (2018 archive)
22,22,56,Facebook (2018 archive)
23,23,57,Google
54,54,107,Google.com
...,...,...,...
381,381,578,Cooper Genomincs
382,382,579,Novartis Ireland
383,383,580,Allrecipes
384,384,581,Fortune


### api_timingsession

In [None]:
api_timingsession

In [184]:
tss = defaultdict(list)

for ts in api_timingsession: 
    tss[ts['coding_id']].append(ts)

### api_codinginstance

In [None]:
api_codinginstance

In [434]:
cis = defaultdict(list)

for ci in api_codinginstance: 
    cis[ci['coding_id']].append(ci)

### Coding Instances for Coding 1

In [605]:
coding_1 = api_coding[0]
print('Coding #', coding_1['id'])
print('Parent: ',coding_1['parent'])
print('Created Date: ',coding_1['created_dt'])

coding_1_q = json.loads(api_coding[0]['questions'])
print('Number of Questions in Coding 1: ', len(coding_1_q))

print(coding_1_q[0].keys())

print("""
info: question text,
values: question answer values stored in a list: ie [0, 1],
question: labels the question
""")

Coding # 1
Parent:  None
Created Date:  2020-05-22 15:35:39.726674+00
Number of Questions in Coding 1:  6
dict_keys(['info', 'values', 'question'])

info: question text,
values: question answer values stored in a list: ie [0, 1],
question: labels the question



In [606]:
coding1_questions = {'index': [], 'coding_q_ids': [], 'q_types': [], 'q_texts': [], 'ans_vals': []}

for idx, question in enumerate(coding_1_q):
    label = 'coding1_q' + str(idx)
    for ans_val in question['values']: 
        coding1_questions['index'].append(idx)
        coding1_questions['coding_q_ids'].append(label)
        coding1_questions['q_types'].append(question['question'])
        coding1_questions['q_texts'].append(question['info'])
        coding1_questions['ans_vals'].append(ans_val)

coding1_df = pd.DataFrame(coding1_questions, columns=coding1_questions.keys())
coding1_df.columns = ['Index', 'Coding Question #', 'Question Type', 'Question Text', 'Answer Value']
#coding1_df

In [607]:
# adding companies encoded in coding1 to the df columns
coding1_companies = set()

for ci in cis[1]: 
    coding1_companies.add(pi_mapping.iloc[ci['policy_instance_id']]['company_name'])

for company in coding1_companies:
    coding1_df[company] = np.zeros((len(coding1_questions['coding_q_ids']),), dtype=int)
    
#coding1_df

In [608]:
valid_inst = 0 

for ci in cis[1]: 
    company = pi_mapping.iloc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            index = int(q)
            for ans_val in coding_values[q]['values']:
                val = ans_val
                if(val.isdigit()):
                    val = int(val)
                coding1_df.loc[(coding1_df['Index'] == index) & (coding1_df['Answer Value'] == val), company] += 1

In [609]:
print(f"""
{valid_inst} out of {len(cis[1])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding1_df.to_csv('coding1.csv')


12 out of 16 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted



In [311]:
coding1_meta = {'Index': [], 'Coding Question #': [], 'OTHER': [], 'SILENT': [], 'FMW': []}

for ci in cis[1]: 
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        for q in coding_values: 
            if ('OTHER' in coding_values[q]['values']) or ('SILENT' in coding_values[q]['values']): 
                coding1_meta['Index'].append(int(q))
                coding1_meta['Coding Question #'].append('coding1_' + q)
                
                if 'OTHER' in coding_values[q]['values']:
                    coding1_meta['OTHER'].append(coding_values[q]['values']['OTHER'])
                else:
                    coding1_meta['OTHER'].append('')
                
                if 'SILENT' in coding_values[q]['values']:
                    coding1_meta['SILENT'].append(coding_values[q]['values']['SILENT'])
                else:
                    coding1_meta['SILENT'].append('')

                if ci['coder_email'] == 'florencia.m.wurgler@gmail.com': 
                    coding1_meta['FMW'].append(True)
                else:
                    coding1_meta['FMW'].append(False)

                    
coding1_meta_df = pd.DataFrame(data=coding1_meta)
df_size = len(coding1_meta_df)
q_types = []
q_texts = []

for idx in range(df_size): 
    q_types.append(coding_1_q[coding1_meta_df.iloc[idx]['Index']]['question'])
    q_texts.append(coding_1_q[coding1_meta_df.iloc[idx]['Index']]['info'])
    
coding1_meta_df['Question Type'] = q_types
coding1_meta_df['Question Text'] = q_texts

coding1_meta_df = coding1_meta_df.sort_values(by='Coding Question #')
coding1_meta_df = coding1_meta_df[['Index', 'Coding Question #', 'Question Type', 'Question Text', 'OTHER',
                                  'SILENT', 'FMW']]

coding1_meta_df

Unnamed: 0,Index,Coding Question #,Question Type,Question Text,OTHER,SILENT,FMW
0,0,coding1_0,First Party Collection,Does the company collect data for it's own use?,,False,False
3,0,coding1_0,First Party Collection,Does the company collect data for it's own use?,TEST,False,False
8,0,coding1_0,First Party Collection,Does the company collect data for it's own use?,testing,,False
9,0,coding1_0,First Party Collection,Does the company collect data for it's own use?,,True,False
10,0,coding1_0,First Party Collection,Does the company collect data for it's own use?,False,,True
4,1,coding1_1,Third Party Collection,Does the company share data with third parties?,new field,,False
1,2,coding1_2,User Choice,Does the company give users an option to not s...,TEST,False,False
5,2,coding1_2,User Choice,Does the company give users an option to not s...,User Choice,False,False
11,2,coding1_2,User Choice,Does the company give users an option to not s...,,True,True
2,3,coding1_3,Data Security,Does the company conform to a data standard?,,False,False


### Timing Session on Coding 1

In [312]:
coding1_ts = {}

for ts in tss[1]: 
    if ts['policy_instance_id'] in coding1_ts:
        coding1_ts[ts['policy_instance_id']].append(ts['session_timing'])
    else:
        coding1_ts[ts['policy_instance_id']] = []
        coding1_ts[ts['policy_instance_id']].append(ts['session_timing'])
        


In [328]:
coding1_ts_data = {'company_name': [], 'median_blur': [], 'median_focus': []}

for k,v in coding1_ts.items():
    blur_ts = []
    focus_ts = []

    for ts in v: 
        temp = json.loads(ts)
        blur_ts.append(temp['total_blur'])
        focus_ts.append(temp['total_focus'])
    
    coding1_ts_data['company_name'].append(pi_mapping.iloc[k]['company_name'])
    coding1_ts_data['median_blur'].append(int(np.median(np.array(blur_ts))))
    coding1_ts_data['median_focus'].append(int(np.median(np.array(focus_ts))))
    
coding1_ts_df = pd.DataFrame(data={'policy instance id': coding1_ts.keys(), 
                                   'company name': coding1_ts_data['company_name'],
                                   'median blur': coding1_ts_data['median_blur'],
                                   'median focus': coding1_ts_data['median_focus']})

coding1_ts_df

Unnamed: 0,policy instance id,company name,median blur,median focus
0,61,bigtent.com,0,3226
1,67,foursquare.com,484101,21050
2,85,ashleymadison.com,0,0
3,78,reverbnation.com,31888,5106
4,113,corp.ign.com,5083,3329
5,122,stumbleupon.com,0,0
6,220,ironhorsevineyards.com,0,4878
7,211,foodallergy.org,4046,6598
8,226,lodgemfg.com,0,0
9,199,lids.com,185,5219


### Coding Instances for Coding 2

In [None]:
#in each coding value, the 'sentence' key holds the details for tos: , ccpa_policy, gdpr_policy, privacy_policy, 
#'confidence'

In [610]:
coding_2 = api_coding[1]
print('Coding #', coding_2['id'])
print('Parent: ',coding_2['parent'])
print('Created Date: ',coding_2['created_dt'])

coding_2_q = json.loads(api_coding[1]['questions'])
print('Number of Questions in Coding 2: ', len(coding_2_q))

print(coding_2_q[0].keys())

print("""
info: contains the question text and the response text seperated by the delimeter \\n,
values: question answer values stored in a list: ie [0, 1],
question: labels the question
""")

Coding # 2
Parent:  None
Created Date:  2020-06-08 19:03:53.634844+00
Number of Questions in Coding 2:  86
dict_keys(['info', 'values', 'question'])

info: contains the question text and the response text seperated by the delimeter \n,
values: question answer values stored in a list: ie [0, 1],
question: labels the question



In [611]:
coding2_questions = {'index': [], 'coding_q_ids': [], 'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_2_q):
    label = 'coding2_q' + str(idx)
    
    info_split = question['info'].split('\n')
    q_text = info_split[0]
    q_resps = info_split[1:]
    
    if idx == 66:
        q_resps = q_resps[0].split(', ')
        q_resps[0], q_resps[1] = q_resps[1], q_resps[0]
    elif idx == 70: 
        q_resps = ['0=contact info in different section', '1=contact info in same section', '2=direct link included in same section']
    elif idx == 72:
        q_resps = ['0=no', '1=yes']
    elif idx == 75:
        q_resps = q_resps[1:]
    elif idx == 79:
        q_resps = q_resps[2:]
    elif idx == 81:
        q_resps.append('N/A')
    elif idx == 82 or idx == 83:
        q_resps = ['1=yes', '0=no', '.=N/A']
    elif idx == 84:
        q_resps = ['0=no', '1=two methods identified']
    elif idx == 85:
        q_resps = ['0', '1']
    
    for val_idx, ans_val in enumerate(question['values']):
        coding2_questions['index'].append(idx)
        coding2_questions['coding_q_ids'].append(label)
        coding2_questions['q_types'].append(question['question'])
        coding2_questions['q_texts'].append(q_text)
        coding2_questions['q_resps'].append(q_resps[val_idx])
        coding2_questions['ans_vals'].append(ans_val.strip())

coding2_df = pd.DataFrame(coding2_questions, columns=coding2_questions.keys())
coding2_df.columns = ['Index', 'Coding Question #', 'Question Type', 'Question Text', 'Question Response', 'Answer Value']
# coding2_df

In [612]:
# adding companies encoded in coding1 to the df columns
coding2_companies = set()

for ci in cis[2]: 
    coding2_companies.add(pi_mapping.iloc[ci['policy_instance_id']]['company_name'])

for company in coding2_companies:
    coding2_df[company] = np.zeros((len(coding2_questions['coding_q_ids']),), dtype=int)

In [613]:
valid_inst = 0 

for ci in cis[2]: 
    company = pi_mapping.iloc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            index = int(q)
            for ans_val in coding_values[q]['values']:
                val = ans_val
                if(val.isdigit()):
                    val = int(val)
                coding2_df.loc[(coding2_df['Index'] == index) & (coding2_df['Answer Value'] == val), company] += 1

In [614]:
print(f"""
{valid_inst} out of {len(cis[2])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding2_df.to_csv('coding2.csv')


40 out of 49 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted



### Coding Instances for Coding 3
#### DOES NOT HAVE ANY CODING INSTANCES

In [423]:
coding_3 = api_coding[2]
print('Coding #', coding_3['id'])
print('Parent: ',coding_3['parent'])
print('Created Date: ',coding_3['created_dt'])

coding_3_q = json.loads(api_coding[2]['questions'])
print('Number of Questions in Coding 3: ', len(coding_3_q))

print(coding_3_q[0].keys())

print("""
info: contains the question text and the response text seperated by the delimeter \\n,
values: question answer values stored in a list: ie ['Does', 'Does Not'],
question: labels the question
""")

Coding # 3
Parent:  None
Created Date:  2020-06-08 19:32:05.722316+00
Number of Questions in Coding 3:  36
dict_keys(['info', 'values', 'question'])

info: contains the question text and the response text seperated by the delimeter \n,
values: question answer values stored in a list: ie ['Does', 'Does Not'],
question: labels the question



In [431]:
coding3_questions = {'index': [], 'coding_q_ids': [], 'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_3_q):
    label = 'coding3_q' + str(idx)
    
    info_split = question['info'].split('\n')
    q_text = info_split[0]
    q_resps = info_split[2:]
    
    for val_idx, ans_val in enumerate(question['values']):
        coding3_questions['index'].append(idx)
        coding3_questions['coding_q_ids'].append(label)
        coding3_questions['q_types'].append(question['question'])
        coding3_questions['q_texts'].append(q_text)
        coding3_questions['q_resps'].append(q_resps[val_idx])
        coding3_questions['ans_vals'].append(ans_val.strip())

coding3_df = pd.DataFrame(coding3_questions, columns=coding3_questions.keys())
coding3_df.columns = ['Index', 'Coding Question #', 'Question Type', 'Question Text', 'Question Response', 'Answer Value']
coding3_df

Unnamed: 0,Index,Coding Question #,Question Type,Question Text,Question Response,Answer Value
0,0,coding3_q0,First Party Collection/Use - Does/Does Not,Use this optional attribute to denote if the p...,[Does] - The first party does engage in the de...,Does
1,0,coding3_q0,First Party Collection/Use - Does/Does Not,Use this optional attribute to denote if the p...,[Does Not] - The first party does not engage i...,Does Not
2,1,coding3_q1,First Party Collection/Use - Collection Mode,Use this optional attribute to denote if the d...,[Explicit] - The company/organization collects...,Explicit
3,1,coding3_q1,First Party Collection/Use - Collection Mode,Use this optional attribute to denote if the d...,[Implicit] - The company/organization collects...,Implicit
4,1,coding3_q1,First Party Collection/Use - Collection Mode,Use this optional attribute to denote if the d...,[Unspecified] - It is not specified or unclear...,Unspecified
...,...,...,...,...,...,...
254,34,coding3_q34,International and Specific Audiences - Audienc...,Select which audience the policy segment refer...,[Other] - Other specific audience group not me...,Other
255,35,coding3_q35,Other - Other Type,What other aspect not covered in the other cat...,[Introductory/Generic] - It's a paragraph that...,Introductory/Generic
256,35,coding3_q35,Other - Other Type,What other aspect not covered in the other cat...,[Practice not covered] - The paragraph describ...,Practice not covered
257,35,coding3_q35,Other - Other Type,What other aspect not covered in the other cat...,[Privacy contact information] - The paragraph ...,Privacy contact information


In [615]:
coding3_df.to_csv('coding3.csv')

### Coding Instances for Coding 4
#### DOES NOT HAVE ANY CODING INSTANCES

In [446]:
coding_4 = api_coding[3]
print('Coding #', coding_4['id'])
print('Parent: ',coding_4['parent'])
print('Created Date: ',coding_4['created_dt'])

coding_4_q = json.loads(api_coding[3]['questions'])
print('Number of Questions in Coding 4: ', len(coding_4_q))

print(coding_4_q[0].keys())

print("""
info: contains the question text,
type: type of question ie multiselect,
notes: '{'opp_notes': '', 'ccpa_notes': '', 'gdpr_notes': ''}',
values: question answer values stored in a list: ie ['0', '1'],
details: response texts seperated by the delimeter \\n,
question: labels the question,
identifier: question id 
""")

Coding # 4
Parent:  None
Created Date:  2020-06-15 07:45:08.525915+00
Number of Questions in Coding 4:  101
dict_keys(['info', 'type', 'notes', 'values', 'details', 'question', 'identifier'])

info: contains the question text,
type: type of question ie multiselect,
notes: '{'opp_notes': '', 'ccpa_notes': '', 'gdpr_notes': ''}',
values: question answer values stored in a list: ie ['0', '1'],
details: response texts seperated by the delimeter \n,
question: labels the question,
identifier: question id 



In [643]:
coding4_questions = {'index': [], 'coding_q_ids': [], 'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_4_q):
    label = 'coding4_q' + str(idx)
    
    resps = question['details'].split('\n')
    
#     print('idx:', idx)
#     print('values:', question['values'])
#     print('resps:', resps)
    
    if idx == 0: 
        resps.insert(0, '0') 
    if idx == 9 or idx == 21 or idx == 22 or idx == 63:
        resps = ['[ 0 ] - no', '[ 1 ] - yes', '[ DND ] - DND']
    if idx == 15: 
        resps = ['[ 0 ] - no', '[ 1 ] - yes', '[ . ] - n/a',  'DND=does not disclose']
    if idx == 20: 
        resps.insert(4, '4')
        resps.insert(11, '11') 
    if idx == 27:
        resps = ['[ 0 ] - gen\'l ID ("trusted 3rd parties"', '[ DND ] - DND', '[ 1 ] - specific/named identification']
    if idx == 40:
        resps.insert(16, '[ DND ] - DND')
    if idx == 92: 
        resps = ['[ 0 ] - contact info in different section', '1=contact info in same section', '2=direct link included in same section)']
    if idx == 94: 
        resps = ['[ 1 ] - yes', '0 = no', '. = N/A']
    if idx == 95: 
        resps = ['[ 0 ] - no', '[ 1 ] - two methods identified', '2 = one method identified']
        
    for val_idx, ans_val in enumerate(question['values']):
        coding4_questions['index'].append(idx)
        coding4_questions['coding_q_ids'].append(label)
        coding4_questions['q_types'].append(question['question'])
        coding4_questions['q_texts'].append(question['info'])
        coding4_questions['q_resps'].append(resps[val_idx])
        coding4_questions['ans_vals'].append(ans_val)

coding4_df = pd.DataFrame(coding4_questions, columns=coding4_questions.keys())
coding4_df.columns = ['Index', 'Coding Question #', 'Question Type', 'Question Text', 'Question Response', 'Answer Value']
coding4_df.to_csv('coding4.csv')

### Coding Instances for Coding 5

In [644]:
coding_5 = api_coding[4]
print('Coding #', coding_5['id'])
print('Parent: ',coding_5['parent'])
print('Created Date: ',coding_5['created_dt'])

coding_5_q = json.loads(api_coding[4]['questions'])
print('Number of Questions in Coding 5: ', len(coding_5_q))

print(coding_5_q[0].keys())

print("""
info: contains the question text,
type: type of question ie multiselect,
values: question answer values stored in a list: ie ['0', '1'],
details: response texts seperated by the delimeter \\n,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:
""")

Coding # 5
Parent:  None
Created Date:  2020-06-22 07:40:42.483447+00
Number of Questions in Coding 5:  101
dict_keys(['info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'identifier', 'values_raw'])

info: contains the question text,
type: type of question ie multiselect,
values: question answer values stored in a list: ie ['0', '1'],
details: response texts seperated by the delimeter \n,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:



In [648]:
coding5_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_5_q):
    label = 'coding5_q' + str(idx)
    
    q_resps = question['details'].split('\n')
    
    print('idx:', idx)
    print('values:', question['values'])
    print('resps:', q_resps)
    
    if idx == 9 or idx == 21 or idx == 22 or idx == 63: 
        q_resps = ['[ 0 ] - no', '[ 1 ] - yes', '[ DND ] - DND'] 
    elif idx == 27: 
        q_resps = ['[ 0 ] - gen\'l ID ("trusted 3rd parties")', '[ 1 ] - specific/named entity or named category of third party', '[ DND ] - DND']
    elif idx == 20: 
        question['values'].remove('11')
        
    
    for val_idx, ans_val in enumerate(question['values']):
        coding5_questions['index'].append(idx)
        coding5_questions['coding_q_ids'].append(label)
        coding5_questions['q_ids'].append(question['identifier'])
        coding5_questions['q_types'].append(question['question'])
        coding5_questions['q_texts'].append(question['info'])
        coding5_questions['q_resps'].append(q_resps[val_idx])
        coding5_questions['ans_vals'].append(ans_val)

coding5_df = pd.DataFrame(coding5_questions, columns=coding5_questions.keys())
coding5_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
# coding5_df

idx: 0
values: ['0', '1']
resps: ['[ 0 ] - no', '[ 1 ] - yes']
idx: 1
values: ['1', '2', '3', '4', '5', '6']
resps: ['[ 1 ] - hyperlinked, at the bottom of the main page (Specht)', '[ 2 ] - visibly located on the main page, but no click required to accept, and no notice that privacy policy must be agreed upon (Nguyen v. Barnes and Noble)', '[ 3 ] - visible on the main page, with notice that user must agree to it, but no clicking requirement to agree (Rodman v. Safeway)', '[ 4 ] - visible on main page, with a “sign in” wrap that doesn’t unambiguously inform the user that clicking signifies assent to Privacy Policy. (Berkson v. Gogo)', '[ 5 ] - visible on main page, with a hyperlinked Privacy Policy and a requirements that the user unambiguously click on “I agree” to PP. (Fteja v. Facebook)', '[ 6 ] - visible on main page, with a Privacy Policy on a scroll down box  and a requirements that the user unambiguously click on “I agree” to PP.']
idx: 2
values: ['0', '1']
resps: ['[ 0 ] - no', 

In [650]:
# adding companies encoded in coding1 to the df columns
coding5_companies = set()

for ci in cis[5]: 
    coding5_companies.add(pi_mapping.iloc[ci['policy_instance_id']]['company_name'])

for company in coding5_companies:
    coding5_df[company] = np.zeros((len(coding5_questions['coding_q_ids']),), dtype=int)

In [651]:
valid_inst = 0 

for ci in cis[5]: 
    company = pi_mapping.iloc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            if q == '':
                continue
            if q.isdigit():
                index = int(q)
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)

                    if coding_values[q]['values'][ans_val] == True:
                        coding5_df.loc[(coding5_df['Index'] == index) & 
                                       (coding5_df['Answer Value'] == val), company] += 1
            else:
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)

                    if coding_values[q]['values'][ans_val] == True:
                        coding5_df.loc[(coding5_df['Question Identifier'] == q) & 
                                       (coding5_df['Answer Value'] == val), company] += 1

coding5_df

Unnamed: 0,Index,Coding Question #,Question Identifier,Question Type,Question Text,Question Response,Answer Value,dslreports.com,indabamusic.com,fool.com,...,us.mouthshut.com,soundclick.com,filefront.com,my.opera.com,christianmingle.com,zooppa.com,mate1.com,bigtent.com,millionairemate.com,epernicus.com
0,0,coding5_q0,v2_2020_txt,Notice - Consent and Registration,Users are asked to manifest consent to PP when...,[ 0 ] - no,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,coding5_q0,v2_2020_txt,Notice - Consent and Registration,Users are asked to manifest consent to PP when...,[ 1 ] - yes,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,coding5_q1,v0_2020,Notice - Registration and Salience,For potential registrants: How salient is the ...,"[ 1 ] - hyperlinked, at the bottom of the main...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,coding5_q1,v0_2020,Notice - Registration and Salience,For potential registrants: How salient is the ...,"[ 2 ] - visibly located on the main page, but ...",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,coding5_q1,v0_2020,Notice - Registration and Salience,For potential registrants: How salient is the ...,"[ 3 ] - visible on the main page, with notice ...",3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,99,coding5_q99,v_82_2020,GDPR-Automated Processes,"If firm engages in automated decisionmaking, d...",[ 1 ] - yes,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
399,99,coding5_q99,v_82_2020,GDPR-Automated Processes,"If firm engages in automated decisionmaking, d...",[ . ] - not applicable,.,0,0,0,...,0,0,0,0,1,0,0,1,0,0
400,99,coding5_q99,v_82_2020,GDPR-Automated Processes,"If firm engages in automated decisionmaking, d...",[ DND ] - does not disclose,DND,1,0,2,...,1,0,1,0,0,1,0,0,0,0
401,100,coding5_q100,v_83_2020,COVID,Does the PP include any terms related to conta...,[ 0 ] - no,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [652]:
print(f"""
{valid_inst} out of {len(cis[5])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding5_df.to_csv('coding5.csv')


124 out of 140 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted



### Coding Instances for Coding 6

In [653]:
coding_6 = api_coding[5]
print('Coding #', coding_6['id'])
print('Parent: ',coding_6['parent'])
print('Created Date: ',coding_6['created_dt'])

coding_6_q = json.loads(api_coding[5]['questions'])
print('Number of Questions in Coding 6: ', len(coding_6_q))

print(f"""
Question format: {list(coding_6_q[0].keys())}

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {{'label: '', value: '''}},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:
""")

Coding # 6
Parent:  None
Created Date:  2020-07-02 07:22:33.181775+00
Number of Questions in Coding 6:  239

Question format: ['', 'info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'identifier', 'values_raw']

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {'label: '', value: '''},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:



In [654]:
coding6_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_6_q):
    label = 'coding6_q' + str(idx)
        
    for ans_val in question['values']:
        coding6_questions['index'].append(idx)
        coding6_questions['coding_q_ids'].append(label)
        coding6_questions['q_ids'].append(question['identifier'])
        coding6_questions['q_types'].append(question['question'])
        coding6_questions['q_texts'].append(question['info'])
        coding6_questions['q_resps'].append(ans_val['label'])
        coding6_questions['ans_vals'].append(ans_val['value'])

coding6_df = pd.DataFrame(coding6_questions, columns=coding6_questions.keys())
coding6_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
# coding6_df

In [655]:
coding6_companies = set()

for ci in cis[6]: 
    coding6_companies.add(pi_mapping.iloc[ci['policy_instance_id']]['company_name'])

for company in coding6_companies:
    coding6_df[company] = np.zeros((len(coding6_questions['coding_q_ids']),), dtype=int)

  import sys


In [656]:
valid_inst = 0 

for ci in cis[6]: 
    company = pi_mapping.iloc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            if q == '':
#                 if 'notes' in coding_values[q]:
#                     print('note: ', coding_values[q]['notes'])
#                 print('comment: ', coding_values[q]['comment'])
                continue
            if q.isdigit():
                index = int(q)
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)

                    if coding_values[q]['values'][ans_val] == True:
                        coding6_df.loc[(coding6_df['Index'] == index) & 
                                       (coding6_df['Answer Value'] == val), company] += 1
            else:
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)

                    if coding_values[q]['values'][ans_val] == True:
                        coding6_df.loc[(coding6_df['Question Identifier'] == q) & 
                                       (coding6_df['Answer Value'] == val), company] += 1


In [657]:
coding6_df

Unnamed: 0,Index,Coding Question #,Question Identifier,Question Type,Question Text,Question Response,Answer Value,fool.com,kraftrecipes.com,allstate.com,...,pbs.org,dpreview.com,filefront.com,mate1.com,edmunds.com,boardgamegeek.com,bigtent.com,millionairemate.com,lids.com,epernicus.com
0,0,coding6_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,[ 0 ] - no,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,coding6_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,[ 1 ] - yes,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,coding6_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,"[ N/A ] - Not applicable (e.g., there is no re...",,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,coding6_q1,v0_2020,Notice of Contract- Registration and Salience,"For potential registrants (i.e., when you are ...","[ 1 ] - hyperlinked, at the bottom of the main...",1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,coding6_q1,v0_2020,Notice of Contract- Registration and Salience,"For potential registrants (i.e., when you are ...","[ 2 ] - visibly located on the main page, but ...",2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,237,coding6_q237,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ 1 ] - yes,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
551,237,coding6_q237,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ . ] - not applicable,.,1,0,0,...,2,0,0,0,0,0,0,0,0,0
552,237,coding6_q237,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ DND ] - does not disclose,DND,0,2,0,...,0,0,0,2,0,1,1,0,0,0
553,238,coding6_q238,v83_2020,COVID,Does the PP include any terms related to conta...,[ 0 ] - no,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [658]:
print(f"""
{valid_inst} out of {len(cis[6])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding6_df.to_csv('coding6.csv')


193 out of 223 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted



### Coding Instances for Coding 7
#### No good coding instances

In [522]:
coding_7 = api_coding[6]
print('Coding #', coding_7['id'])
print('Parent: ',coding_7['parent'])
print('Created Date: ',coding_7['created_dt'])

coding_7_q = json.loads(api_coding[6]['questions'])
print('Number of Questions in Coding 7: ', len(coding_7_q))

print(f"""
Question format: {list(coding_7_q[0].keys())}

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {{'label: '', value: '''}},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:
""")

Coding # 7
Parent:  None
Created Date:  2020-07-14 23:12:58.715513+00
Number of Questions in Coding 7:  230

Question format: ['', 'info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'identifier', 'values_raw']

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {'label: '', value: '''},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:



In [524]:
coding7_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_7_q):
    label = 'coding7_q' + str(idx)
        
    for ans_val in question['values']:
        coding7_questions['index'].append(idx)
        coding7_questions['coding_q_ids'].append(label)
        coding7_questions['q_ids'].append(question['identifier'])
        coding7_questions['q_types'].append(question['question'])
        coding7_questions['q_texts'].append(question['info'])
        coding7_questions['q_resps'].append(ans_val['label'])
        coding7_questions['ans_vals'].append(ans_val['value'])

coding7_df = pd.DataFrame(coding7_questions, columns=coding7_questions.keys())
coding7_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
coding7_df

Unnamed: 0,Index,Coding Question #,Question Identifier,Question Type,Question Text,Question Response,Answer Value
0,0,coding7_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,[ 0 ] - no,0
1,0,coding7_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,[ 1 ] - yes,1
2,0,coding7_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,"[ N/A ] - Not applicable (e.g., there is no re...",
3,1,coding7_q1,v0_2020,Notice of Contract- Registration and Salience,"For potential registrants (i.e., when you are ...","[ 1 ] - hyperlinked, at the bottom of the main...",1
4,1,coding7_q1,v0_2020,Notice of Contract- Registration and Salience,"For potential registrants (i.e., when you are ...","[ 2 ] - visibly located on the main page, but ...",2
...,...,...,...,...,...,...,...
542,228,coding7_q228,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ 1 ] - yes,1
543,228,coding7_q228,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ . ] - not applicable,.
544,228,coding7_q228,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ DND ] - does not disclose,DND
545,229,coding7_q229,v83_2020,COVID,Does the PP include any terms related to conta...,[ 0 ] - no,0


In [674]:
coding7_df.to_csv('coding7.csv')

### Coding Instances for Coding 8
#### No coding instances

In [529]:
coding_8 = api_coding[7]
print('Coding #', coding_8['id'])
print('Parent: ',coding_8['parent'])
print('Created Date: ',coding_8['created_dt'])

coding_8_q = json.loads(api_coding[7]['questions'])
print('Number of Questions in Coding 8: ', len(coding_8_q))

print(f"""
Question format: {list(coding_8_q[0].keys())}

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {{'label: '', value: '''}},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:
""")

Coding # 8
Parent:  None
Created Date:  2020-07-17 07:37:56.176181+00
Number of Questions in Coding 8:  230

Question format: ['', 'info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'identifier', 'values_raw']

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {'label: '', value: '''},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:



In [532]:
coding8_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_8_q):
    label = 'coding8_q' + str(idx)
        
    for ans_val in question['values']:
        coding8_questions['index'].append(idx)
        coding8_questions['coding_q_ids'].append(label)
        coding8_questions['q_ids'].append(question['identifier'])
        coding8_questions['q_types'].append(question['question'])
        coding8_questions['q_texts'].append(question['info'])
        coding8_questions['q_resps'].append(ans_val['label'])
        coding8_questions['ans_vals'].append(ans_val['value'])

coding8_df = pd.DataFrame(coding8_questions, columns=coding8_questions.keys())
coding8_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
coding8_df

Unnamed: 0,Index,Coding Question #,Question Identifier,Question Type,Question Text,Question Response,Answer Value
0,0,coding8_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,[ 0 ] - no,0
1,0,coding8_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,[ 1 ] - yes,1
2,0,coding8_q0,v2_2020_txt,Notice of Contract - Consent and Registration,Users are asked to manifest consent to PP when...,"[ N/A ] - Not applicable (e.g., there is no re...",
3,1,coding8_q1,v0_2020,Notice of Contract- Registration and Salience,"For potential registrants (i.e., when you are ...","[ 1 ] - hyperlinked, at the bottom of the main...",1
4,1,coding8_q1,v0_2020,Notice of Contract- Registration and Salience,"For potential registrants (i.e., when you are ...","[ 2 ] - visibly located on the main page, but ...",2
...,...,...,...,...,...,...,...
542,228,coding8_q228,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ 1 ] - yes,1
543,228,coding8_q228,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ . ] - not applicable,.
544,228,coding8_q228,v82_2020,GDPR-Automated Processes,"If firm engages in automated decision making, ...",[ DND ] - does not disclose,DND
545,229,coding8_q229,v83_2020,COVID,Does the PP include any terms related to conta...,[ 0 ] - no,0


In [675]:
coding8_df.to_csv('coding8.csv')

### Coding Instances for Coding 9

In [659]:
coding_9 = api_coding[8]
print('Coding #', coding_9['id'])
print('Parent: ',coding_9['parent'])
print('Created Date: ',coding_9['created_dt'])

coding_9_q = json.loads(api_coding[8]['questions'])
print('Number of Questions in Coding 9: ', len(coding_9_q))

print(f"""
Question format: {list(coding_9_q[0].keys())}

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {{'label: '', value: '''}},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:
""")

Coding # 9
Parent:  None
Created Date:  2020-07-23 01:53:32.017866+00
Number of Questions in Coding 9:  239

Question format: ['', 'info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'identifier', 'values_raw']

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {'label: '', value: '''},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
identifier: question id ,
values_raw:



In [660]:
coding9_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_9_q):
    label = 'coding9_q' + str(idx)
        
    for ans_val in question['values']:
        coding9_questions['index'].append(idx)
        coding9_questions['coding_q_ids'].append(label)
        coding9_questions['q_ids'].append(question['identifier'])
        coding9_questions['q_types'].append(question['question'])
        coding9_questions['q_texts'].append(question['info'])
        coding9_questions['q_resps'].append(ans_val['label'])
        coding9_questions['ans_vals'].append(ans_val['value'])

coding9_df = pd.DataFrame(coding9_questions, columns=coding9_questions.keys())
coding9_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
#coding9_df

In [661]:
coding9_companies = set()

for ci in cis[9]: 
    #print(pi_mapping.loc[ci['policy_instance_id']]['company_name'])
    coding9_companies.add(pi_mapping.loc[ci['policy_instance_id']]['company_name'])

for company in coding9_companies:
    coding9_df[company] = np.zeros((len(coding9_questions['coding_q_ids']),), dtype=int)

  


In [662]:
valid_inst = 0 

for ci in cis[9]: 
    company = pi_mapping.loc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            if q == '':
                if 'notes' in coding_values[q]:
                    print('note: ', coding_values[q]['notes'])
                print('comment: ', coding_values[q]['comment'])
                continue
            if q.isdigit():
                index = int(q)
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)
                        
                    if company.capitalize() in coding9_df.columns:
                        company = company.capitalize()
                    
                    if company not in coding9_df.columns:
                        print('skipping:', company)
                        continue

                    if coding_values[q]['values'][ans_val] == True:
                        coding9_df.loc[(coding9_df['Index'] == index) & 
                                       (coding9_df['Answer Value'] == val), company] += 1
            else:
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)
                        
                    if company.capitalize() in coding9_df.columns:
                        company = company.capitalize()
                        
                    if company not in coding9_df.columns:
                        print('skipping:', company)
                        continue

                    if coding_values[q]['values'][ans_val] == True:
                        coding9_df.loc[(coding9_df['Question Identifier'] == q) & 
                                       (coding9_df['Answer Value'] == val), company] += 1

In [663]:
print(f"""
{valid_inst} out of {len(cis[9])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding9_df.to_csv('coding9.csv')


310 out of 332 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted



### Coding Instances for Coding 10

In [664]:
coding_10 = api_coding[9]
print('Coding #', coding_10['id'])
print('Parent: ',coding_10['parent'])
print('Created Date: ',coding_10['created_dt'])

coding_10_q = json.loads(api_coding[9]['questions'])
print('Number of Questions in Coding 10: ', len(coding_10_q))

print(f"""
Question format: {list(coding_10_q[0].keys())}

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {{'label: '', value: '''}},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
threshold:, 
identifier: question id ,
values_raw:
""")

Coding # 10
Parent:  None
Created Date:  2020-08-17 07:20:24.920932+00
Number of Questions in Coding 10:  239

Question format: ['', 'info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'threshold', 'identifier', 'values_raw']

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {'label: '', value: '''},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
threshold:, 
identifier: question id ,
values_raw:



In [665]:
coding10_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_10_q):
    label = 'coding10_q' + str(idx)
        
    for ans_val in question['values']:
        coding10_questions['index'].append(idx)
        coding10_questions['coding_q_ids'].append(label)
        coding10_questions['q_ids'].append(question['identifier'])
        coding10_questions['q_types'].append(question['question'])
        coding10_questions['q_texts'].append(question['info'])
        coding10_questions['q_resps'].append(ans_val['label'])
        coding10_questions['ans_vals'].append(ans_val['value'])

coding10_df = pd.DataFrame(coding10_questions, columns=coding10_questions.keys())
coding10_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
# coding10_df

In [666]:
coding10_companies = set()

for ci in cis[10]: 
    #print(pi_mapping.loc[ci['policy_instance_id']]['company_name'])
    coding10_companies.add(pi_mapping.loc[ci['policy_instance_id']]['company_name'])

for company in coding10_companies:
    coding10_df[company] = np.zeros((len(coding10_questions['coding_q_ids']),), dtype=int)

  


In [667]:
valid_inst = 0 

for ci in cis[10]: 
    company = pi_mapping.loc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            if q == '':
                if 'notes' in coding_values[q]:
                    print('note: ', coding_values[q]['notes'])
                print('comment: ', coding_values[q]['comment'])
                continue
            if q.isdigit():
                index = int(q)
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)
                        
                    if company.capitalize() in coding10_df.columns:
                        company = company.capitalize()
                    
                    if company not in coding10_df.columns:
                        print('skipping:', company)
                        continue

                    if coding_values[q]['values'][ans_val] == True:
                        coding10_df.loc[(coding10_df['Index'] == index) & 
                                       (coding10_df['Answer Value'] == val), company] += 1
            else:
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)
                        
                    if company.capitalize() in coding10_df.columns:
                        company = company.capitalize()
                        
                    if company not in coding10_df.columns:
                        print('skipping:', company)
                        continue

                    if coding_values[q]['values'][ans_val] == True:
                        coding10_df.loc[(coding10_df['Question Identifier'] == q) & 
                                       (coding10_df['Answer Value'] == val), company] += 1

In [668]:
print(f"""
{valid_inst} out of {len(cis[10])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding10_df.to_csv('coding10.csv')


420 out of 454 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted



### Coding Instances for Coding 11

In [669]:
coding_11 = api_coding[10]
print('Coding #', coding_11['id'])
print('Parent: ',coding_11['parent'])
print('Created Date: ',coding_11['created_dt'])

coding_11_q = json.loads(api_coding[10]['questions'])
print('Number of Questions in Coding 11: ', len(coding_11_q))

print(f"""
Question format: {list(coding_11_q[0].keys())}

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {{'label: '', value: '''}},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
threshold:, 
identifier: question id ,
values_raw:
""")

Coding # 11
Parent:  None
Created Date:  2020-10-19 07:33:37.827593+00
Number of Questions in Coding 11:  101

Question format: ['', 'info', 'type', 'values', 'details', 'opp_info', 'question', 'ccpa_info', 'gdpr_info', 'threshold', 'identifier', 'values_raw']

'': directions, 
info: contains the question text,
type: type of question ie multiselect or single select,
values: question answer values stored in a list with the format: {'label: '', value: '''},
details: directions for answering the question,
opp_info:,
question: labels the question,
ccpa_info:, 
gdpr_info:,
threshold:, 
identifier: question id ,
values_raw:



In [670]:
coding11_questions = {'index': [], 'coding_q_ids': [], 'q_ids': [], 
                     'q_types': [], 'q_texts': [], 'q_resps': [], 'ans_vals': []}

for idx, question in enumerate(coding_11_q):
    label = 'coding11_q' + str(idx)
        
    for ans_val in question['values']:
        coding11_questions['index'].append(idx)
        coding11_questions['coding_q_ids'].append(label)
        coding11_questions['q_ids'].append(question['identifier'])
        coding11_questions['q_types'].append(question['question'])
        coding11_questions['q_texts'].append(question['info'])
        coding11_questions['q_resps'].append(ans_val['label'])
        coding11_questions['ans_vals'].append(ans_val['value'])

coding11_df = pd.DataFrame(coding11_questions, columns=coding11_questions.keys())
coding11_df.columns = ['Index', 'Coding Question #', 'Question Identifier', 'Question Type', 
                      'Question Text', 'Question Response', 'Answer Value']
#coding11_df

In [671]:
coding11_companies = set()

for ci in cis[11]: 
    #print(pi_mapping.loc[ci['policy_instance_id']]['company_name'])
    coding11_companies.add(pi_mapping.loc[ci['policy_instance_id']]['company_name'])

for company in coding11_companies:
    coding11_df[company] = np.zeros((len(coding11_questions['coding_q_ids']),), dtype=int)

In [672]:
valid_inst = 0 

for ci in cis[11]: 
    company = pi_mapping.loc[ci['policy_instance_id']]['company_name']
    
    coding_values = json.loads(ci['coding_values'])
    
    if (ci['coder_email'].endswith('@nyu.edu') or ci['coder_email'] == 'florencia.m.wurgler@gmail.com'):
        valid_inst += 1
        for q in coding_values: 
            if q == '':
                if 'notes' in coding_values[q]:
                    print('note: ', coding_values[q]['notes'])
                print('comment: ', coding_values[q]['comment'])
                continue
            if q.isdigit():
                index = int(q)
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)
                        
                    if company.capitalize() in coding11_df.columns:
                        company = company.capitalize()

                    if coding_values[q]['values'][ans_val] == True:
                        coding11_df.loc[(coding11_df['Index'] == index) & 
                                       (coding11_df['Answer Value'] == val), company] += 1
            else:
                for ans_val in coding_values[q]['values']:
                    val = ans_val
                    if val.isdigit():
                        val = int(val)
                        
                    if company.capitalize() in coding11_df.columns:
                        company = company.capitalize()

                    if coding_values[q]['values'][ans_val] == True:
                        coding11_df.loc[(coding11_df['Question Identifier'] == q) & 
                                       (coding11_df['Answer Value'] == val), company] += 1

In [673]:
print(f"""
{valid_inst} out of {len(cis[11])} are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted
""")
coding11_df.to_csv('coding11.csv')


5 out of 8 are valid coding instances, 
in other words the coding was done by a NYU affiliated user and attempts that were not completed is also counted

