In [1]:
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns

In [2]:
# Credentials
# Read from the standard libpq environment variables so secrets do not have to be
# edited into (and committed with) the notebook. The fallbacks are the previous
# hard-coded local-development values, so existing setups keep working unchanged.
# TODO: drop the password fallback once every user exports PGPASSWORD.
DB_NAME = os.environ.get('PGDATABASE', 'documentcoder')
DB_USER = os.environ.get('PGUSER', 'postgres')
DB_PASS = os.environ.get('PGPASSWORD', 'save')

In [3]:
# Open the database connection. psycopg2.connect raises on failure, so if this cell
# finishes without an exception the connection is established.
conn = psycopg2.connect(dbname=DB_NAME, user=DB_USER, password=DB_PASS)

# This notebook only issues SELECTs. Without autocommit psycopg2 opens a transaction on
# the first query and keeps it open until commit/rollback, which leaves the session
# "idle in transaction" between cells and makes every later query fail once one errors.
conn.autocommit = True

In [4]:
# Create a cursor on the open connection; the queries in the cells below all run through it.
cur = conn.cursor()

In [12]:
# Expand the `categories` JSON array of every api_coding row into one result row per
# category, selecting that category's 'label' and its 'questions' entry.
cur.execute("""
            SELECT x->'label', x->'questions'
            FROM api_coding, jsonb_array_elements(categories) as x
            """)

In [13]:
# Pull every (label, questions) row of the previous query into memory. In the output
# below the label arrives as a str and the questions as a list of dicts.
api_codings = cur.fetchall()
# NOTE(review): echoing the whole result produces a very large cell output; a slice
# would be easier to read.
api_codings

[('CCPA',
  [{'id': 'v72.1_2020.1',
    'info': 'please select the most appropriate option',
    'meta': {'info': {'threshold': '2020-07-03'},
     'notes': 'copied from 2020',
     'source': []},
    'type': {'label': 'Select Multiple', 'value': 'singleselect'},
    'label': 'CCPA- Notice',
    'description': 'Is the CCPA section in a separate link (as opposed to in the same privacy policy?)',
    'questionOptions': [{'meta': {},
      'label': "[0] no, it's part of the same privacy policy",
      'value': '0',
      'details': ''},
     {'meta': {},
      'label': "[1] yes, it's on a separate link or separate document",
      'value': '1',
      'details': ''},
     {'meta': {},
      'label': '[.] N/A -there is no CCPA section or CCPA reference in the contract',
      'value': '.',
      'details': ''}]},
   {'id': 'v72.2_2020.1',
    'info': 'please select the most appropriate option',
    'meta': {'info': {'threshold': '2020-07-03'},
     'notes': 'copied from 2020',
     'source'

In [43]:
# Column name -> parallel list. Every list receives one entry per
# (category, question, answer option) combination found in api_codings.
coding_questions = {'labels': [], 'q_ids': [], 'q_text': [], 'q_resp': [], 'ans_vals': []}

for label, questions in api_codings:
    for question in questions:
        for option in question['questionOptions']:
            # One flattened record; its keys mirror the columns declared above.
            row = {
                'labels': label,
                'q_ids': question['id'],
                'q_text': question['description'],
                'q_resp': option['label'],
                'ans_vals': option['value'],
            }
            for column, cell in row.items():
                coding_questions[column].append(cell)

In [125]:
# Number of (question, answer option) rows collected in coding_questions.
count_size = len(coding_questions['q_ids'])

# One row per answer option. The statistic columns start out as placeholders that the
# cells below overwrite.
# The median columns are created as floats: the medians written later are floats
# (np.median), and assigning them into integer columns is an incompatible-dtype
# assignment that recent pandas versions warn about and plan to reject.
df = pd.DataFrame(data={
    'Label': coding_questions['labels'],
    'Question ID': coding_questions['q_ids'],
    'Question Text': coding_questions['q_text'],
    'Question Response': coding_questions['q_resp'],
    'Answer Value': coding_questions['ans_vals'],
    'Count': np.zeros((count_size,), dtype=int),
    'Median Confidence': np.full((count_size,), -1.0),  # -1 marks "not computed yet"
    'Median Focus': np.zeros((count_size,), dtype=float),
    'Median Blur': np.zeros((count_size,), dtype=float),
    'FMW': np.full((count_size,), False),
})

In [126]:
# Preview the freshly built table; in the output below the statistic columns still hold
# their placeholder values.
df

Unnamed: 0,Label,Question ID,Question Text,Question Response,Answer Value,Count,Median Confidence,Median Focus,Median Blur,FMW
0,CCPA,v72.1_2020.1,Is the CCPA section in a separate link (as opp...,"[0] no, it's part of the same privacy policy",0,0,-1,0,0,False
1,CCPA,v72.1_2020.1,Is the CCPA section in a separate link (as opp...,"[1] yes, it's on a separate link or separate d...",1,0,-1,0,0,False
2,CCPA,v72.1_2020.1,Is the CCPA section in a separate link (as opp...,[.] N/A -there is no CCPA section or CCPA refe...,.,0,-1,0,0,False
3,CCPA,v72.2_2020.1,Does PP states firms' CCPA policy only applies...,"[0] no,",0,0,-1,0,0,False
4,CCPA,v72.2_2020.1,Does PP states firms' CCPA policy only applies...,"[1] yes,",1,0,-1,0,0,False
...,...,...,...,...,...,...,...,...,...,...
202,[Skip me!] Notice of Contract,v2.3_2020.1,Are users asked to manifest unambiguous consen...,yes,1,0,-1,0,0,False
203,[Skip me!] Notice of Contract,v2.3_2020.1,Are users asked to manifest unambiguous consen...,other,2,0,-1,0,0,False
204,[Skip me!] Notice of Contract,v2.4_2020.1,Are users given the option of withdrawing thei...,no,0,0,-1,0,0,False
205,[Skip me!] Notice of Contract,v2.4_2020.1,Are users given the option of withdrawing thei...,"yes, and the clause provides an existence of t...",1,0,-1,0,0,False


In [None]:
# Distinct question IDs: each question appears once per answer option in
# coding_questions, so collapse the duplicates before querying per question.
q_ids = {*coding_questions['q_ids']}

In [127]:
# Count, for every question, how many coding instances selected each answer option.
for q_id in q_ids:
    cur.execute("""
        SELECT coding_values->%s->'values'
        FROM api_codinginstance
        WHERE coder_email != 'davidbstein@gmail.com';
        """, [q_id])

    # Tally the answers in plain Python first. As before, rows with a missing or empty
    # 'values' payload are skipped and only the first entry of each payload is counted.
    # NOTE(review): this assumes that first entry is hashable (e.g. a str key) -- confirm.
    answer_counts = defaultdict(int)
    for (values,) in cur.fetchall():
        if values:
            answer_counts[next(iter(values))] += 1

    # Assign the totals instead of incrementing in place, so re-running this cell
    # cannot double the counts; options nobody selected get an explicit 0.
    is_question = df['Question ID'] == q_id
    df.loc[is_question, 'Count'] = [
        answer_counts.get(answer, 0)
        for answer in df.loc[is_question, 'Answer Value']
    ]

In [147]:
def calc_median(arr):
    """Return the median of ``arr``, or 0 when ``arr`` is empty.

    ``arr`` must support ``len()``; non-empty input is converted with
    ``np.array`` and reduced with ``np.median``.
    """
    if len(arr) == 0:
        return 0
    samples = np.array(arr)
    return np.median(samples)

In [150]:
# Median coder confidence per question (one query per question ID).
for q_id in q_ids:
    cur.execute("""
        SELECT coding_values->%s->'confidence'
        FROM api_codinginstance
        WHERE coder_email != 'davidbstein@gmail.com';
        """, (q_id,))

    # Falsy entries (missing, null, 0, empty) are skipped before the int() cast.
    confs = []
    for (raw_conf,) in cur.fetchall():
        if raw_conf:
            confs.append(int(raw_conf))

    # The same median is written to every answer-option row of this question.
    df.loc[df['Question ID'] == q_id, ['Median Confidence']] = calc_median(confs)

In [151]:
# Median 'total_focus' / 'total_blur' timing per question (one query per question ID).
for q_id in q_ids:
    cur.execute("""
        SELECT question_timings->%s->'total_focus', question_timings->%s->'total_blur'
        FROM api_timingsession 
        WHERE coder_email != 'davidbstein@gmail.com';
        """, (q_id, q_id))

    # Collect both series in one pass over the rows.
    # NOTE(review): the truthiness filter also drops timings of exactly 0 -- confirm
    # that is intended.
    focus_res, blur_res = [], []
    for total_focus, total_blur in cur.fetchall():
        if total_focus:
            focus_res.append(total_focus)
        if total_blur:
            blur_res.append(total_blur)

    # Both medians are written to every answer-option row of this question.
    rows_for_question = df['Question ID'] == q_id
    df.loc[rows_for_question, ['Median Focus']] = calc_median(focus_res)
    df.loc[rows_for_question, ['Median Blur']] = calc_median(blur_res)

In [155]:
def did_mediate(response_arr):
    """Return True if any row in ``response_arr`` has a truthy first element.

    Args:
        response_arr: iterable of indexable rows (e.g. the tuples returned by
            ``cursor.fetchall()``); only ``row[0]`` is inspected.

    Returns:
        bool: False for an empty iterable or when every ``row[0]`` is falsy
        (None, empty container, ...), True otherwise.
    """
    # any() stops at the first hit and avoids building a throwaway list.
    return any(item[0] for item in response_arr)

In [158]:
# Set the FMW flag per question: True when the coder selected in the query below has
# at least one non-empty 'values' entry for it.
for q_id in q_ids:
    cur.execute("""
        SELECT coding_values->%s->'values'
        FROM api_codinginstance
        WHERE coder_email = 'florencia.m.wurgler@gmail.com'; 
        """, (q_id,))

    fmw_rows = cur.fetchall()

    # The flag is written to every answer-option row of this question.
    df.loc[df['Question ID'] == q_id, ['FMW']] = did_mediate(fmw_rows)

In [159]:
# Show the table again now that the cells above have filled in the statistic columns.
df

Unnamed: 0,Label,Question ID,Question Text,Question Response,Answer Value,Count,Median Confidence,Median Focus,Median Blur,FMW
0,CCPA,v72.1_2020.1,Is the CCPA section in a separate link (as opp...,"[0] no, it's part of the same privacy policy",0,7,5.0,28680.0,18158.0,True
1,CCPA,v72.1_2020.1,Is the CCPA section in a separate link (as opp...,"[1] yes, it's on a separate link or separate d...",1,10,5.0,28680.0,18158.0,True
2,CCPA,v72.1_2020.1,Is the CCPA section in a separate link (as opp...,[.] N/A -there is no CCPA section or CCPA refe...,.,6,5.0,28680.0,18158.0,True
3,CCPA,v72.2_2020.1,Does PP states firms' CCPA policy only applies...,"[0] no,",0,9,4.0,17838.0,39932.0,True
4,CCPA,v72.2_2020.1,Does PP states firms' CCPA policy only applies...,"[1] yes,",1,8,4.0,17838.0,39932.0,True
...,...,...,...,...,...,...,...,...,...,...
202,[Skip me!] Notice of Contract,v2.3_2020.1,Are users asked to manifest unambiguous consen...,yes,1,0,0.0,0.0,0.0,False
203,[Skip me!] Notice of Contract,v2.3_2020.1,Are users asked to manifest unambiguous consen...,other,2,0,0.0,0.0,0.0,False
204,[Skip me!] Notice of Contract,v2.4_2020.1,Are users given the option of withdrawing thei...,no,0,0,0.0,0.0,0.0,False
205,[Skip me!] Notice of Contract,v2.4_2020.1,Are users given the option of withdrawing thei...,"yes, and the clause provides an existence of t...",1,0,0.0,0.0,0.0,False


In [160]:
# Write the table to coding_stats.csv; index=False leaves the row index out of the file.
df.to_csv("coding_stats.csv", index=False)