# Reconciliation Day

* Code for drop down menu reconciliation

In [182]:
%matplotlib inline
import pandas as pd
import csv
import matplotlib.pyplot as plt
from collections import Counter
from pprint import pprint
from dateutil import parser

- Making a dictionary of lists, key is the subject value is the row

In [134]:
def read_clean_data(file_name):
    data = {}
    with open(file_name, 'r') as in_file:
        reader = csv.DictReader(in_file)
        headers = reader.fieldnames
        for row in reader:
            key = row['filename']
            #print(key)
            xscripts = data.get(key, [])
            xscripts.append(row)
            data[key] = xscripts
            #if reader.line_num >10: 
            #    break
        print(reader.line_num)
    return data, headers

In [135]:
data,headers = read_clean_data('BritHerbaraiDataset.StutterRemoved.csv')
#pprint(data)
print(len(data))

6548
3150


In [86]:
def count_transcripts(data):
    return Counter([len(v) for k, v in data.items()])

In [87]:
cc = count_transcripts(data)
pprint(cc)

Counter({1: 1174, 2: 945, 3: 645, 4: 384, 6: 2})


- Function to get all the flags for the drop down reconciliations 
- count blanks for each subject and throw them out for the reconciliation write a flag out with # of blanks

In [141]:
FLAGS = ['all agree', 'Majority Rules','only a single transcript','more than 1 unique answer','all are blank']

def drop_down_reconciliation(col):
    flags={}
    for key, xscripts in data.items():
        counts = Counter([x[col] for x in xscripts if x[col] and x[col].lower() != 'placeholder'])
        if not len(counts):
            flags[key] = dict(flag=FLAGS[4], value='', top_count=0, blank_count=len(xscripts))
        elif len(counts) > 1:
            top = counts.most_common(1)[0]
            cc = Counter([v for k, v in counts.items()])
            ii = 0
            for v in counts.values():
                ii += v
            if cc[top[1]] == 1:
                flags[key] = dict(flag=FLAGS[1], value=top[0], top_count=top[1], blank_count=len(xscripts) - ii)
            else:
                flags[key] = dict(flag=FLAGS[3], value=[k for k,v in counts.items() if v == top[1]], 
                                    top_count=top[1], blank_count=len(xscripts) - ii)
        elif len(xscripts) == 1:
            top = counts.most_common(1)[0]
            flags[key] = dict(flag=FLAGS[2], value=top[0], top_count=top[1], blank_count=len(xscripts) -1)
        elif len(xscripts) > 1 and len(counts) == 1:
            top = counts.most_common(1)[0]
            flags[key] = dict(flag=FLAGS[0], value=top[0], top_count=top[1], blank_count=len(xscripts) - top[1])
        else:
            pprint(xscripts)
    return flags
    

In [199]:
def date_parser(value):
    parts = value.split('/')
    pprint(parts)
    try:
        flag = 'ok'
        dt = parser.parse(value, fuzzy=True)
    except ValueError:
        flag = 'error'
        dt = ''
    return dt.isoformat(), flag
        

In [200]:
pprint(date_parser(''))
pprint(date_parser('3/8/1922'))
pprint(date_parser('3/8/0000'))

['']
('2016-05-04T00:00:00', 'ok')
['3', '8', '1922']
('1922-03-08T00:00:00', 'ok')
['3', '8', '0000']
('2000-03-08T00:00:00', 'ok')


- function to count the flags and print results

In [143]:
def count_result(flags):
    counts = Counter([f['flag'] for k, f in flags.items()])
    total = 0
    for f, c in counts.items():
        total += c
        print('{} = {}'.format(f, c))
    print('Total = {}'.format(total))

In [136]:
out_dict = {k: {} for k in data.keys() }

In [177]:
def add_reconciled_data(field_name, flags):
    for k, f in flags.items():
        out_dict[k][field_name] = f['value']
        out_dict[k][field_name +'_flags'] = f['flag']
        out_dict[k][field_name +'_frequency'] = f['top_count']
        out_dict[k][field_name +'_blanks'] = f['blank_count']

In [180]:
flags = drop_down_reconciliation('Country')
add_reconciled_data('Country', flags)
count_result(flags)
flags = drop_down_reconciliation('County')
add_reconciled_data('County', flags)
count_result(flags)
flags = drop_down_reconciliation('State')
add_reconciled_data('State', flags)
count_result(flags)

Majority Rules = 2
all agree = 1969
only a single transcript = 1104
more than 1 unique answer = 1
all are blank = 74
Total = 3150
Majority Rules = 20
all agree = 1923
only a single transcript = 1094
more than 1 unique answer = 21
all are blank = 92
Total = 3150
Majority Rules = 19
all agree = 1939
only a single transcript = 1105
more than 1 unique answer = 9
all are blank = 78
Total = 3150


In [147]:
pprint(out_dict)

{'BRIT118000': {'Country': 'United States',
                'Country_blanks': 0,
                'Country_flag': 'all agree',
                'Country_frequency': 3},
 'BRIT118001': {'Country': 'United States',
                'Country_blanks': 0,
                'Country_flag': 'all agree',
                'Country_frequency': 2},
 'BRIT118003': {'Country': 'United States',
                'Country_blanks': 0,
                'Country_flag': 'all agree',
                'Country_frequency': 2},
 'BRIT118004': {'Country': 'United States',
                'Country_blanks': 0,
                'Country_flag': 'only a single transcript',
                'Country_frequency': 1},
 'BRIT118005': {'Country': 'United States',
                'Country_blanks': 0,
                'Country_flag': 'only a single transcript',
                'Country_frequency': 1},
 'BRIT118006': {'Country': 'United States',
                'Country_blanks': 0,
                'Country_flag': 'only a single transcr

In [133]:
flags = drop_down_reconciliation('County')
results = count_result(flags)
pprint(results)

Majority Rules = 20
all agree = 1923
only a single transcript = 1094
more than 1 unique answer = 21
all are blank = 92
Total = 3150
None


In [173]:
def write_reconciled_csv(out_dict, file_name):
    headers = list(list(out_dict.values())[0].keys())
    with open(file_name, 'w') as out_file:
        writer = csv.DictWriter(out_file,headers)
        writer.writeheader()
        for subject_id, row in out_dict.items():
            row['subject_id'] = subject_id
            writer.writerow(row)
    pprint(headers)
        

In [181]:
write_reconciled_csv(out_dict, 'Brit_Reconciled.csv')

['State_frequency',
 'Country_flags',
 'Country_frequency',
 'Country_blanks',
 'County_frequency',
 'State_flags',
 'County_blanks',
 'State_blanks',
 'subject_id',
 'Country',
 'County_flags',
 'County',
 'State',
 'Country_flag']
