In [47]:
import os
import shutil
from pathlib import Path
from collections import defaultdict
import pandas as pd

In [2]:
os.getcwd()

'/Users/kevinsong/Documents/programming/purm/PURM-25/data/chop'

In [108]:

def organize_files():
    # Define source and destination paths
    base_path = Path("original")
    annotated_source = base_path / "annotated_files"
    original_source = base_path / "original_files"
    
    # Define destination paths
    combined_path = Path("combined")
    annotated_dest = combined_path / "annotations"
    notes_dest = combined_path / "notes"
    
    # Create destination directories if they don't exist
    annotated_dest.mkdir(parents=True, exist_ok=True)
    notes_dest.mkdir(parents=True, exist_ok=True)
    
    # Counter for moved files
    ann_files_moved = 0
    txt_files_moved = 0
    
    print("Starting file organization...")
    files_and_counts = {}
    
    # Move all .ann files from annotated_files subdirectories
    if annotated_source.exists():
        for root, dirs, files in os.walk(annotated_source):
            for file in files:
                if file.endswith('.ann'):
                    source_path = Path(root) / file
                    dest_path = annotated_dest / file
                    
                    # Handle duplicate filenames by adding a suffix
                    counter = 1
                    original_dest_path = dest_path
                    while dest_path.exists():
                        name, ext = original_dest_path.stem, original_dest_path.suffix
                        dest_path = annotated_dest / f"{name}_{counter}{ext}"
                        counter += 1
                    
                    try:
                        shutil.copy2(source_path, dest_path)
                        print(f"Moved .ann file: {source_path} -> {dest_path}")
                        ann_files_moved += 1
                    except Exception as e:
                        print(f"Error moving {source_path}: {e}")
    
    # Move all .txt files from original_files subdirectories
    if original_source.exists():
        for root, dirs, files in os.walk(original_source):
            for file in files:
                if file.endswith('.txt'):
                    source_path = Path(root) / file
                    dest_path = notes_dest / file
                    
                    # Handle duplicate filenames by adding a suffix
                    if dest_path.exists():
                        assert file in files_and_counts

                        files_and_counts[file] += 1
                        continue
                    
                    try:
                        files_and_counts[file] = 1
                        shutil.copy2(source_path, dest_path)
                        print(f"Moved .txt file: {source_path} -> {dest_path}")
                        txt_files_moved += 1
                    except Exception as e:
                        print(f"Error moving {source_path}: {e}")
    
    print(f"\nFile organization complete!")
    print(f"Total .ann files moved: {ann_files_moved}")
    print(f"Total .txt files moved: {txt_files_moved}")

    print(files_and_counts)
    csv_filename = "file_copies.csv"
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['filename', 'copies']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for k, v in files_and_counts.items():
            writer.writerow({
                'filename': k,
                'copies': v
            })
    

In [109]:
organize_files()

Starting file organization...
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/792948743.ann -> combined/annotations/792948743.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/804045612.ann -> combined/annotations/804045612.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/798507904.ann -> combined/annotations/798507904.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/779183024.ann -> combined/annotations/779183024.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/785000449.ann -> combined/annotations/785000449.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/793084875.ann -> combined/annotations/793084875.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/776431140.ann -> combined/annotations/776431140.ann
Moved .ann file: original/annotated_files/stage1_part3/Angela_Long/759511715.ann -> combined/annotations/759511715.ann
Moved .ann file: o

In [14]:
LABELS = [
        "PatientCaregiver_Unemployment",
        "Homelessness",
        "GeneralHousingInstability",
        "NeedTemporaryLodging",
        "HouseInstability_Other",
        "LackofFundsforFood",
        "FoodInsecurity_Other",
        "Poverty",
        "LackofInsurance",
        "UnabletoPay",
        "FinancialStrain_Other",
        "DistancefromHospital",
        "LackofTransportation",
        "Transportation_Other",
        "ChildcareBarrierfromHospitalization",
        "ChildcareBarrierfromNonHospitalization",
        "NeedofChildcare",
        "Childcare_Other",
        "DrugUse",
        "Alcoholism",
        "SubstanceAbuse_Other",
        "ChildAbuse",
        "HomeSafety",
        "HomeAccessibility",
        "IntimatePartnerViolence",
        "HomeEnvironment_Other",
        "CommunitySafety",
        "CommunityAccessibility",
        "CommunityViolence",
        "CommunityEnvironment_Other",
        "NonPermanentPlacement",
        "PermanentPlacementPending",
        "Permanency_Other",
    ]

In [15]:
len(LABELS)

33

In [16]:
cat_to_labels = {
    "PatientCaregiver_Employment": [
        "PatientCaregiver_Unemployment"
    ],
    "HousingInstability": [
        "Homelessness",
        "GeneralHousingInstability",
        "NeedTemporaryLodging",
        "HouseInstability_Other"
    ],
    "FoodInsecurity": [
        "LackofFundsforFood",
        "FoodInsecurity_Other"
    ],
    "FinancialStrain": [
        "Poverty",
        "LackofInsurance",
        "UnabletoPay",
        "FinancialStrain_Other"
    ],
    "Transportation": [
        "DistancefromHospital",
        "LackofTransportation",
        "Transportation_Other"
    ],
    "Childcare": [
        "ChildcareBarrierfromHospitalization",
        "ChildcareBarrierfromNonHospitalization",
        "NeedofChildcare",
        "Childcare_Other"
    ],
    "SubstanceAbuse": [
        "DrugUse",
        "Alcoholism",
        "SubstanceAbuse_Other"
    ],
    "Safety": [
        # Home environment
            "ChildAbuse",
            "HomeSafety",
            "HomeAccessibility",
            "IntimatePartnerViolence",
            "HomeEnvironment_Other",
        # Community environment
            "CommunitySafety",
            "CommunityAccessibility",
            "CommunityViolence",
            "CommunityEnvironment_Other"
        ],
    "Permanency": [
        "NonPermanentPlacement",
        "PermanentPlacementPending",
        "Permanency_Other"
    ]
}

In [17]:
label_to_cat = {v: k for k, vals in cat_to_labels.items() for v in vals }

In [18]:
label_to_cat

{'PatientCaregiver_Unemployment': 'PatientCaregiver_Employment',
 'Homelessness': 'HousingInstability',
 'GeneralHousingInstability': 'HousingInstability',
 'NeedTemporaryLodging': 'HousingInstability',
 'HouseInstability_Other': 'HousingInstability',
 'LackofFundsforFood': 'FoodInsecurity',
 'FoodInsecurity_Other': 'FoodInsecurity',
 'Poverty': 'FinancialStrain',
 'LackofInsurance': 'FinancialStrain',
 'UnabletoPay': 'FinancialStrain',
 'FinancialStrain_Other': 'FinancialStrain',
 'DistancefromHospital': 'Transportation',
 'LackofTransportation': 'Transportation',
 'Transportation_Other': 'Transportation',
 'ChildcareBarrierfromHospitalization': 'Childcare',
 'ChildcareBarrierfromNonHospitalization': 'Childcare',
 'NeedofChildcare': 'Childcare',
 'Childcare_Other': 'Childcare',
 'DrugUse': 'SubstanceAbuse',
 'Alcoholism': 'SubstanceAbuse',
 'SubstanceAbuse_Other': 'SubstanceAbuse',
 'ChildAbuse': 'Safety',
 'HomeSafety': 'Safety',
 'HomeAccessibility': 'Safety',
 'IntimatePartnerViole

In [19]:
len(label_to_cat.keys())

33

In [20]:
cats = list(cat_to_labels.keys())

In [21]:
cats

['PatientCaregiver_Employment',
 'HousingInstability',
 'FoodInsecurity',
 'FinancialStrain',
 'Transportation',
 'Childcare',
 'SubstanceAbuse',
 'Safety',
 'Permanency']

In [22]:
cats_to_i = {c: i for i, c in enumerate(cats)}

In [23]:
cats_to_i

{'PatientCaregiver_Employment': 0,
 'HousingInstability': 1,
 'FoodInsecurity': 2,
 'FinancialStrain': 3,
 'Transportation': 4,
 'Childcare': 5,
 'SubstanceAbuse': 6,
 'Safety': 7,
 'Permanency': 8}

In [60]:
with open("combined/annotations/732377980.ann", "r") as f:
    f.readline()
    content = f.readline()
    print(content)
    print(content.split("\t")[1].strip().split(' '))

A1	Negated T1

['Negated', 'T1']


In [45]:
import csv

def search_annotations():
    # Path to annotations directory
    annotations_dir = Path("combined/annotations")
    
    # Results storage
    processed_files = []
    blank_files = 0
    non_blank_files = 0
    print("Searching for annotation labels in .ann files...")
    
    # Process each .ann file
    for ann_file in annotations_dir.glob("*.ann"):
        with open(ann_file, 'r') as f:
            # read in the line with the label and the next one (which might have annotations)
            label_index_to_label = {}
            negated_indicies = []
            
            negated_labels = []
            found_labels = []
            
            line = f.readline() # example: T1	FinancialStrain_Other 482 540	SW provided mother with financial assistance applications.
                
            while line:
                line = line.strip().split('\t')
                index = line[0]
                
                if index[0] == 'T':
                    current_label = line[1].split(' ')[0]
                    if current_label != "NoSocialNeedsFoundLabel":
                        assert current_label in LABELS, ann_file.name + " " + current_label
                    
                        # add the line to the dict
                        label_index_to_label[index] = current_label
                # check for negated annotation
                elif index[0] == 'A':
                    annotation = line[1].split(' ')
                    assert annotation[0] in ["Negated", "Speculation", "Resolved", "HistoryOf", "SubjectIsNotPatient", "Unsure", "ResolutioninProgress", "NoSocialNeedsFound"]
                    if annotation[0] in ["Negated"]:
                        negated_index = annotation[1]
                        assert negated_index[0] == 'T'
                        negated_indicies.append(negated_index)

                line = f.readline()

            for negated_i in negated_indicies:
                assert negated_i in label_index_to_label
                negated_labels.append(label_index_to_label[negated_i])
                del label_index_to_label[negated_i]

            negated_labels = list(set(negated_labels))
            found_labels = list(set(label_index_to_label.values()))

            negated_cats = list(set([label_to_cat[l] for l in negated_labels]))
            found_cats = list(set([label_to_cat[l] for l in found_labels]))

            cats_map = {}
            
            for cat in cats:
                if cat in found_cats:
                    cats_map[cats_to_i[cat]] = '+'
                elif cat in negated_cats:
                    cats_map[cats_to_i[cat]] = '-'
            
            # Store results
            processed_files.append({
                'file': ann_file.name,
                'labels_positive': found_labels,
                'labels_negated': negated_labels,
                'cats': ';'.join([str(k)+v for k, v in cats_map.items()])
            })

            if processed_files[-1]['cats'] == '':
                blank_files += 1
            else:
                non_blank_files += 1 
                    
    # Create CSV file with results
    csv_filename = "labels_initial.csv"
    with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['file', 'cats']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        writer.writeheader()
        for result in processed_files:
            writer.writerow({
                'file': result['file'],
                'cats': result['cats']
            })
    
    # Print summary
    print(f"\n=== SUMMARY ===")
    print(f"Total .ann files processed: {len(processed_files)}")
    print(f"Blank files: {blank_files}")
    print(f"Non blank files: {non_blank_files}")
    print(f"Results saved to: {csv_filename}")

    """
    # Print files without labels
    print(f"\n=== FILES WITH NO LABELS FOUND ===")
    for filename in sorted(files_without_labels):
        print(f"  - {filename}")

    print(f"\n=== BLANK FILES ===")
    for filename in sorted(files_blank):
        print(f"  - {filename}")
    """
    
    # Print detailed breakdown of labels found
    print(f"\n=== LABEL BREAKDOWN ===")
    label_neg = {}
    label_pos = {}
    for result in processed_files:
        for label in result['labels_positive']:
            label_neg[label] = label_neg.get(label, 0) + 1
        for label in result['labels_negated']:
            label_pos[label] = label_pos.get(label, 0) + 1

    for label in LABELS:
        print(f"\n{label}")
        print(f"+: {label_neg.get(label, 0)}, -: {label_pos.get(label, 0)}")


In [46]:
search_annotations()

Searching for annotation labels in .ann files...

=== SUMMARY ===
Total .ann files processed: 3828
Blank files: 1771
Non blank files: 2057
Results saved to: labels_initial.csv

=== LABEL BREAKDOWN ===

PatientCaregiver_Unemployment
+: 118, -: 371

Homelessness
+: 13, -: 0

GeneralHousingInstability
+: 35, -: 10

NeedTemporaryLodging
+: 313, -: 15

HouseInstability_Other
+: 22, -: 1

LackofFundsforFood
+: 50, -: 52

FoodInsecurity_Other
+: 49, -: 2

Poverty
+: 0, -: 0

LackofInsurance
+: 192, -: 221

UnabletoPay
+: 21, -: 3

FinancialStrain_Other
+: 1071, -: 29

DistancefromHospital
+: 214, -: 39

LackofTransportation
+: 215, -: 112

Transportation_Other
+: 25, -: 0

ChildcareBarrierfromHospitalization
+: 78, -: 29

ChildcareBarrierfromNonHospitalization
+: 49, -: 5

NeedofChildcare
+: 16, -: 18

Childcare_Other
+: 5, -: 0

DrugUse
+: 47, -: 4

Alcoholism
+: 1, -: 2

SubstanceAbuse_Other
+: 30, -: 5

ChildAbuse
+: 21, -: 10

HomeSafety
+: 75, -: 14

HomeAccessibility
+: 22, -: 12

Intim

In [48]:
data = pd.read_csv("labels_initial.csv")

In [49]:
data.head()

Unnamed: 0,file,cats
0,759200223.ann,3+
1,808961801.ann,
2,740299111_3.ann,
3,796974687_1.ann,6-;7-
4,782298339_3.ann,


In [55]:
data_sorted = data.sort_values('file')

In [56]:
data_sorted[50:100]

Unnamed: 0,file,cats
1491,734763697.ann,7+
1928,734766290.ann,6+;7+
2188,734766290_1.ann,6+;7+
1968,734766290_2.ann,6+;7+
2097,734766290_3.ann,6+;7+
3379,735101136.ann,8+
1183,735115660.ann,3-;6+;7+;8-
2516,735151429.ann,
1121,735247559.ann,
3817,735259867.ann,1+;3+
