In [8]:
import pandas as pd
from csv import DictReader
import os

In [2]:
# Flat list of every label name, grouped below by the category it belongs to.
LABELS = [
    # PatientCaregiver_Employment
    "PatientCaregiver_Unemployment",
    # HousingInstability
    "Homelessness",
    "GeneralHousingInstability",
    "NeedTemporaryLodging",
    "HouseInstability_Other",
    # FoodInsecurity
    "LackofFundsforFood",
    "FoodInsecurity_Other",
    # FinancialStrain
    "Poverty",
    "LackofInsurance",
    "UnabletoPay",
    "FinancialStrain_Other",
    # Transportation
    "DistancefromHospital",
    "LackofTransportation",
    "Transportation_Other",
    # Childcare
    "ChildcareBarrierfromHospitalization",
    "ChildcareBarrierfromNonHospitalization",
    "NeedofChildcare",
    "Childcare_Other",
    # SubstanceAbuse
    "DrugUse",
    "Alcoholism",
    "SubstanceAbuse_Other",
    # Safety (home environment)
    "ChildAbuse",
    "HomeSafety",
    "HomeAccessibility",
    "IntimatePartnerViolence",
    "HomeEnvironment_Other",
    # Safety (community environment)
    "CommunitySafety",
    "CommunityAccessibility",
    "CommunityViolence",
    "CommunityEnvironment_Other",
    # Permanency
    "NonPermanentPlacement",
    "PermanentPlacementPending",
    "Permanency_Other",
]

In [3]:
# Maps each category name to the label names that belong to it.
# Insertion order matters: later cells derive category indices from it.
cat_to_labels = {
    "PatientCaregiver_Employment": [
        "PatientCaregiver_Unemployment",
    ],
    "HousingInstability": [
        "Homelessness",
        "GeneralHousingInstability",
        "NeedTemporaryLodging",
        "HouseInstability_Other",
    ],
    "FoodInsecurity": [
        "LackofFundsforFood",
        "FoodInsecurity_Other",
    ],
    "FinancialStrain": [
        "Poverty",
        "LackofInsurance",
        "UnabletoPay",
        "FinancialStrain_Other",
    ],
    "Transportation": [
        "DistancefromHospital",
        "LackofTransportation",
        "Transportation_Other",
    ],
    "Childcare": [
        "ChildcareBarrierfromHospitalization",
        "ChildcareBarrierfromNonHospitalization",
        "NeedofChildcare",
        "Childcare_Other",
    ],
    "SubstanceAbuse": [
        "DrugUse",
        "Alcoholism",
        "SubstanceAbuse_Other",
    ],
    "Safety": [
        # Home environment
        "ChildAbuse",
        "HomeSafety",
        "HomeAccessibility",
        "IntimatePartnerViolence",
        "HomeEnvironment_Other",
        # Community environment
        "CommunitySafety",
        "CommunityAccessibility",
        "CommunityViolence",
        "CommunityEnvironment_Other",
    ],
    "Permanency": [
        "NonPermanentPlacement",
        "PermanentPlacementPending",
        "Permanency_Other",
    ],
}

In [4]:
# Reverse lookup: label name -> name of the category that lists it.
label_to_cat = {
    label: category
    for category in cat_to_labels
    for label in cat_to_labels[category]
}
label_to_cat

{'PatientCaregiver_Unemployment': 'PatientCaregiver_Employment',
 'Homelessness': 'HousingInstability',
 'GeneralHousingInstability': 'HousingInstability',
 'NeedTemporaryLodging': 'HousingInstability',
 'HouseInstability_Other': 'HousingInstability',
 'LackofFundsforFood': 'FoodInsecurity',
 'FoodInsecurity_Other': 'FoodInsecurity',
 'Poverty': 'FinancialStrain',
 'LackofInsurance': 'FinancialStrain',
 'UnabletoPay': 'FinancialStrain',
 'FinancialStrain_Other': 'FinancialStrain',
 'DistancefromHospital': 'Transportation',
 'LackofTransportation': 'Transportation',
 'Transportation_Other': 'Transportation',
 'ChildcareBarrierfromHospitalization': 'Childcare',
 'ChildcareBarrierfromNonHospitalization': 'Childcare',
 'NeedofChildcare': 'Childcare',
 'Childcare_Other': 'Childcare',
 'DrugUse': 'SubstanceAbuse',
 'Alcoholism': 'SubstanceAbuse',
 'SubstanceAbuse_Other': 'SubstanceAbuse',
 'ChildAbuse': 'Safety',
 'HomeSafety': 'Safety',
 'HomeAccessibility': 'Safety',
 'IntimatePartnerViole

In [5]:
# Category names in the order they were declared in cat_to_labels.
cats = [*cat_to_labels]
cats

['PatientCaregiver_Employment',
 'HousingInstability',
 'FoodInsecurity',
 'FinancialStrain',
 'Transportation',
 'Childcare',
 'SubstanceAbuse',
 'Safety',
 'Permanency']

In [6]:
# Category name -> its position in `cats`.
cats_to_i = dict(zip(cats, range(len(cats))))
cats_to_i

{'PatientCaregiver_Employment': 0,
 'HousingInstability': 1,
 'FoodInsecurity': 2,
 'FinancialStrain': 3,
 'Transportation': 4,
 'Childcare': 5,
 'SubstanceAbuse': 6,
 'Safety': 7,
 'Permanency': 8}

In [7]:
# Read file_copies.csv into a list of dicts, one per row, with the string
# fields 'filename' and 'copies'.
# newline='' lets the csv module do its own newline handling, as its
# documentation requires for file objects passed to a reader.
with open("file_copies.csv", 'r', newline='') as f:
    file_copies = list(DictReader(f))

file_copies

[{'filename': '798507904.txt', 'copies': '4'},
 {'filename': '804045612.txt', 'copies': '4'},
 {'filename': '792948743.txt', 'copies': '4'},
 {'filename': '793084875.txt', 'copies': '5'},
 {'filename': '759511715.txt', 'copies': '4'},
 {'filename': '776431140.txt', 'copies': '5'},
 {'filename': '779183024.txt', 'copies': '5'},
 {'filename': '785000449.txt', 'copies': '5'},
 {'filename': '742651922.txt', 'copies': '5'},
 {'filename': '746965405.txt', 'copies': '5'},
 {'filename': '798713223.txt', 'copies': '4'},
 {'filename': '771650368.txt', 'copies': '4'},
 {'filename': '803876336.txt', 'copies': '5'},
 {'filename': '768815667.txt', 'copies': '4'},
 {'filename': '734332849.txt', 'copies': '5'},
 {'filename': '801051895.txt', 'copies': '5'},
 {'filename': '781773363.txt', 'copies': '5'},
 {'filename': '758037306.txt', 'copies': '4'},
 {'filename': '748444760.txt', 'copies': '5'},
 {'filename': '764662087.txt', 'copies': '5'},
 {'filename': 'Z3436303_housing_instability.txt', 'copies': 

In [10]:
# Most-copied files first. Negating the numeric key gives a descending order
# while ties keep their original relative order, exactly as reverse=True does.
sorted(file_copies, key=lambda record: -int(record['copies']))

[{'filename': '791441082.txt', 'copies': '6'},
 {'filename': '810011531.txt', 'copies': '6'},
 {'filename': '776150262.txt', 'copies': '6'},
 {'filename': '757699951.txt', 'copies': '6'},
 {'filename': '755951737.txt', 'copies': '6'},
 {'filename': '808807726.txt', 'copies': '6'},
 {'filename': '793084875.txt', 'copies': '5'},
 {'filename': '776431140.txt', 'copies': '5'},
 {'filename': '779183024.txt', 'copies': '5'},
 {'filename': '785000449.txt', 'copies': '5'},
 {'filename': '742651922.txt', 'copies': '5'},
 {'filename': '746965405.txt', 'copies': '5'},
 {'filename': '803876336.txt', 'copies': '5'},
 {'filename': '734332849.txt', 'copies': '5'},
 {'filename': '801051895.txt', 'copies': '5'},
 {'filename': '781773363.txt', 'copies': '5'},
 {'filename': '748444760.txt', 'copies': '5'},
 {'filename': '764662087.txt', 'copies': '5'},
 {'filename': '754207788.txt', 'copies': '5'},
 {'filename': '771649119.txt', 'copies': '5'},
 {'filename': '803613093.txt', 'copies': '5'},
 {'filename':

In [13]:
# Number of files whose 'copies' value is exactly 1.
sum(int(record['copies']) == 1 for record in file_copies)

3302

In [14]:
# Number of files whose 'copies' value is greater than 1.
sum(int(record['copies']) > 1 for record in file_copies)

154

In [18]:
# Filenames of the records whose 'copies' value is exactly 1.
only_one = [
    record['filename']
    for record in file_copies
    if int(record['copies']) == 1
]
len(only_one)

3302

In [19]:
# Show the filenames collected above (those whose 'copies' value is 1).
only_one

['Z3436303_housing_instability.txt',
 'Z3158695_childcare.txt',
 'Z3332554_substance_abuse.txt',
 'Z3270571_parent_employment.txt',
 'Z897449_transportation.txt',
 'Z3407997_food_insecurity.txt',
 'Z3397001_safety.txt',
 'Z1725330_financial_strain.txt',
 '745616642.txt',
 '758384231.txt',
 '797757110.txt',
 '784249036.txt',
 '759199481.txt',
 '775580899.txt',
 '806714684.txt',
 '772580619.txt',
 '759471309.txt',
 '768122440.txt',
 '788462895.txt',
 '805709554.txt',
 '774397677.txt',
 '766162655.txt',
 '731500130.txt',
 '764820461.txt',
 '737165161.txt',
 '761174913.txt',
 '781555886.txt',
 '806817385.txt',
 '789554835.txt',
 '800394922.txt',
 '794910688.txt',
 '804249695.txt',
 '762918891.txt',
 '805862672.txt',
 '792898969.txt',
 '743408561.txt',
 '774825979.txt',
 '785150634.txt',
 '744646363.txt',
 '766057982.txt',
 '787177433.txt',
 '760088505.txt',
 '765552350.txt',
 '740732459.txt',
 '775431506.txt',
 '753458933.txt',
 '757613919.txt',
 '774106248.txt',
 '808646257.txt',
 '766402

In [20]:
import shutil

In [21]:
# Copy the broad and granular JSON of every single-copy file from initial/
# to processed/ without modification.

# shutil.copyfile does not create missing parent directories, so make sure
# both destinations exist before copying anything.
for granularity in ('broad', 'granular'):
    os.makedirs(f'processed/{granularity}', exist_ok=True)

for file in only_one:
    # Strip the extension explicitly rather than slicing off four characters,
    # so a name with a different (or no) extension is not silently mangled.
    stem = os.path.splitext(file)[0]
    for granularity in ('broad', 'granular'):
        shutil.copyfile(
            f'initial/{granularity}/{stem}.json',
            f'processed/{granularity}/{stem}.json',
        )


In [19]:
def _majority_labels(anns):
    """Reduce the label lists of several annotation copies to the agreed labels.

    Args:
        anns: one list of label strings per annotation copy. A copy without
            labels is represented as [''] (what ''.split(';') produces).

    Returns:
        The labels of the form '<digit><+ or ->' that occur in strictly more
        than half of the copies taking part in the vote, ordered by digit and
        then by sign ('+' before '-').
    """
    # As soon as one copy carries labels, blank copies are removed from the
    # vote: the majority is then computed over the non-blank copies only.
    if [''] in anns and any(x != [''] for x in anns):
        anns = [x for x in anns if x != ['']]

    # presumably the digit is the category index from cats_to_i -- TODO confirm
    candidates = [str(i) + sign for i in range(10) for sign in ['+', '-']]
    return [
        label
        for label in candidates
        if sum(label in x for x in anns) > len(anns) / 2
    ]


def _labels_for(ann_name):
    """Return the ';'-separated labels recorded for one .ann file as a list.

    Looks the file up in `files_labeled` by its 'file' column and splits the
    value of the last column.

    Raises:
        AssertionError: if `files_labeled` does not hold exactly one row for
            `ann_name`.
    """
    rows = files_labeled[files_labeled['file'] == ann_name].values
    assert len(rows) == 1, f"expected exactly one row for {ann_name}, found {len(rows)}"
    return rows[0][-1].split(';')


# NOTE(review): `files_labeled` is not created anywhere in the visible
# notebook. It is assumed to be a DataFrame with a 'file' column and the label
# string in its last column -- load it before running this cell.

# Start from an empty list so that re-running the cell does not append
# duplicates to a previous run's results.
actual_annotations = []

for item in file_copies:
    file, count = item['filename'], int(item['copies'])
    stem = os.path.splitext(file)[0]

    # The first copy is stored as '<stem>.ann'; further copies are numbered
    # '<stem>_1.ann', '<stem>_2.ann', ...
    anns = [_labels_for(f"{stem}.ann")]

    if count == 1:
        print("only one")
        actual_annotations.append({'file': file, 'labels': ';'.join(anns[0])})
        continue

    for i in range(1, count):
        anns.append(_labels_for(f"{stem}_{i}.ann"))
    print("\n", file, anns)

    final_cats = _majority_labels(anns)
    print(final_cats)

    actual_annotations.append({'file': file, 'labels': ';'.join(final_cats)})

actual_annotations


 798507904.txt [[''], [''], [''], ['']]
[]

 804045612.txt [[''], [''], [''], ['']]
[]

 792948743.txt [['3+'], ['3+'], ['3+', '8+'], ['3+']]
['3+']

 793084875.txt [[''], [''], ['3+'], [''], ['']]
['3+']

 759511715.txt [['3+'], ['3+'], ['3+'], ['3+']]
['3+']

 776431140.txt [[''], [''], [''], [''], ['']]
[]

 779183024.txt [['8+'], ['7+'], ['8+'], ['8+'], ['8+']]
['8+']

 785000449.txt [['4+'], [''], ['4+'], ['4+'], ['4+']]
['4+']

 742651922.txt [[''], [''], [''], [''], ['']]
[]

 746965405.txt [[''], [''], [''], [''], ['']]
[]

 798713223.txt [[''], [''], [''], ['']]
[]

 771650368.txt [['3+', '4+'], ['3+', '4+'], ['3+', '4+'], ['3+', '4+']]
['3+', '4+']

 803876336.txt [[''], [''], [''], [''], ['']]
[]

 768815667.txt [['2+', '3+'], ['3+'], ['2+', '3+'], ['2+', '3+']]
['2+', '3+']

 734332849.txt [[''], [''], [''], [''], ['']]
[]

 801051895.txt [['7+'], ['7+'], [''], ['7+'], ['7+']]
['7+']

 781773363.txt [['1+'], ['1+'], [''], ['1+'], ['1+', '4-']]
['1+']

 758037306.txt [['3+'

[{'file': '798507904.txt', 'labels': ''},
 {'file': '804045612.txt', 'labels': ''},
 {'file': '792948743.txt', 'labels': '3+'},
 {'file': '793084875.txt', 'labels': '3+'},
 {'file': '759511715.txt', 'labels': '3+'},
 {'file': '776431140.txt', 'labels': ''},
 {'file': '779183024.txt', 'labels': '8+'},
 {'file': '785000449.txt', 'labels': '4+'},
 {'file': '742651922.txt', 'labels': ''},
 {'file': '746965405.txt', 'labels': ''},
 {'file': '798713223.txt', 'labels': ''},
 {'file': '771650368.txt', 'labels': '3+;4+'},
 {'file': '803876336.txt', 'labels': ''},
 {'file': '768815667.txt', 'labels': '2+;3+'},
 {'file': '734332849.txt', 'labels': ''},
 {'file': '801051895.txt', 'labels': '7+'},
 {'file': '781773363.txt', 'labels': '1+'},
 {'file': '758037306.txt', 'labels': '3+'},
 {'file': '748444760.txt', 'labels': '0-;1-;3-;5+;8-'},
 {'file': '764662087.txt', 'labels': '1+'},
 {'file': 'Z3436303_housing_instability.txt', 'labels': ''},
 {'file': 'Z3158695_childcare.txt', 'labels': '0-;1+;3-;4

In [20]:
import csv

# Write one row per entry of actual_annotations to labels_cleaned.csv, storing
# each entry's 'labels' value under the 'cats' column.
csv_filename = "labels_cleaned.csv"
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['file', 'cats'])
    writer.writeheader()
    writer.writerows(
        {'file': entry['file'], 'cats': entry['labels']}
        for entry in actual_annotations
    )