In [None]:
import pandas as pd
import os

# When `__file__` is defined (script execution) work from the script's own
# directory so the relative data paths resolve; otherwise keep the current
# working directory.
if '__file__' in globals():
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
else:
    os.chdir(os.getcwd())

# JSON-lines annotation export: one record per line.
filepath = 'annotations-medical_specialist-dpoc-bio-composed-multiple.jsonl'
raw_dataset = pd.read_json(filepath, lines=True)

In [None]:
# Only the `doc_id` column of the segment file is used: it selects which
# annotated documents are kept in the next step.
segment_filepath = 'test_data_info_with_metrics.csv'
segment_dataset = pd.read_csv(segment_filepath)['doc_id']
segment_dataset

In [None]:
# Keep the annotated documents whose doc_id appears in the segment list and
# renumber the surviving rows from 0.
filtered_dataset = (
    raw_dataset
    .loc[raw_dataset['doc_id'].isin(segment_dataset)]
    .reset_index(drop=True)
)
filtered_dataset

In [None]:
# Narrow the frame to the three columns the rest of the notebook reads.
filtered_dataset = filtered_dataset.loc[:, ['doc_id', 'text', 'labels']]
filtered_dataset.head(5)

In [None]:
# Start the working frame with the document id and the raw text.
raw_df = pd.DataFrame()
raw_df["doc_id"] = filtered_dataset["doc_id"]
raw_df["text"] = filtered_dataset["text"]

# Item 0 of every entry in `labels` is collected as the token
# (NOTE(review): the meaning of the remaining items is not visible here;
# confirm against the JSONL schema).
# The column is iterated directly instead of `filtered_dataset.iloc[index]`:
# `iterrows()` yields index *labels*, which only match the positions `.iloc`
# expects while the index is a fresh 0..n-1 range.
full_tokens = [
    [element[0] for element in annotations]
    for annotations in filtered_dataset['labels']
]

raw_df['tokens'] = full_tokens
raw_df.head()

In [None]:
def base2(n):
    """Return 2 raised to the power ``n``, i.e. the value of bit ``n``."""
    power = pow(2, n)
    return power

# Bitmask value of every single-category BIO tag. 'O' carries no bit; the 16
# remaining tags are listed in bit order, so the n-th name gets base2(n)
# (even bits are "B-" tags, odd bits are the matching "I-" tags).
tags = {'O': 0}
tags.update(zip(
    (
        'B-pathophysiology', 'I-pathophysiology',
        'B-epidemiology', 'I-epidemiology',
        'B-etiology', 'I-etiology',
        'B-history', 'I-history',
        'B-physical', 'I-physical',
        'B-exams', 'I-exams',
        'B-differential', 'I-differential',
        'B-therapeutic', 'I-therapeutic',
    ),
    map(base2, range(16)),
))
tags

In [None]:
# Composed tag name -> bitmask. Insertion order matters: the position of each
# key becomes its class index further down the notebook.
ner_tags_aux = {'O': 0}

# Category number c owns bit 2*c ("B-") and bit 2*c + 1 ("I-").
cats = [
    "Pathophysiology",
    "Epidemiology",
    "Etiology",
    "History",
    "Physical_examination",
    "Complementary_exams",
    "Differential_diagnosis",
    "Therapeutic_plan",
]


def _register_combination(members):
    """Add the "B-" and "I-" entries for one combination of category numbers."""
    joined = '-'.join(cats[member] for member in members)
    ner_tags_aux["B-" + joined] = sum(base2(2 * member) for member in members)
    ner_tags_aux["I-" + joined] = sum(base2(2 * member + 1) for member in members)


# Singles, then (nested under each single) pairs, then triples, always with
# increasing category numbers.
for first in range(len(cats)):
    _register_combination((first,))
    for second in range(first + 1, len(cats)):
        _register_combination((first, second))
        for third in range(second + 1, len(cats)):
            _register_combination((first, second, third))

# The only four-category combination that is registered: the "B-" bits
# (0, 2, 4, 6) and the "I-" bits (1, 3, 5, 7) of the first four categories.
ner_tags_aux['B-Pathophysiology-Epidemiology-Etiology-History'] = 0b01010101
ner_tags_aux['I-Pathophysiology-Epidemiology-Etiology-History'] = 0b10101010

tags_ner = list(ner_tags_aux)

In [None]:
# bitmask value -> composed tag name
tags_ner_aux = dict(zip(ner_tags_aux.values(), ner_tags_aux.keys()))
# composed tag name -> class index (position of the key in ner_tags_aux)
ner_tags = dict(zip(ner_tags_aux, range(len(ner_tags_aux))))
# class index -> composed tag name
ner_tags_inverted = dict(enumerate(ner_tags))

In [None]:
# Display the class-index -> tag-name mapping for inspection.
ner_tags_inverted

In [None]:
def anotation2number(anotation):
    """Map one token annotation to its class index in ``ner_tags``.

    Args:
        anotation: Indexable record. Item 3 is the tag prefix (``"O"`` or a
            prefix such as ``"B"``/``"I"`` that forms a key of ``tags`` as
            ``prefix + "-" + label``); item 4 is an iterable of category
            labels and is only read when the prefix is not ``"O"``.

    Returns:
        ``0`` for an ``"O"`` annotation, otherwise the class index of the
        composed tag whose bitmask equals the union of the label bitmasks.

    Raises:
        KeyError: If ``prefix + "-" + label`` is not a key of ``tags`` or the
            combined bitmask has no entry in ``tags_ner_aux``.
    """
    prefix = anotation[3]
    if prefix == "O":
        return 0

    valor = 0
    for label in anotation[4]:
        # Combine with OR rather than addition: every tag owns one bit, so
        # adding a repeated label would carry into the neighbouring bit and
        # silently select a different tag.
        valor |= tags[prefix + "-" + label]

    # bitmask -> composed tag name -> class index
    return ner_tags[tags_ner_aux[valor]]

In [None]:
# One list of class indices per document, aligned with the tokens of that
# document. The `labels` column is iterated directly instead of
# `filtered_dataset.iloc[index]`: `iterrows()` yields index *labels*, which
# only match `.iloc` positions while the index is a fresh 0..n-1 range.
# `anotation2number` already returns 0 for 'O' annotations, so no separate
# branch is needed here.
full_labels = [
    [anotation2number(element) for element in annotations]
    for annotations in filtered_dataset['labels']
]

raw_df['ner_ids'] = full_labels

In [None]:
# Per-document year table, with the id column renamed to `doc_id` so it can be
# joined with the token/label frame.
year_data_filepath = 'aligned_annotations-dpoc-medical_specialist_metrics_435.csv'
year_data = (
    pd.read_csv(year_data_filepath)[['annotation id', 'year']]
    .rename(columns={'annotation id': 'doc_id'})
)

# Number of documents per value of `year`, shown as a bar chart.
semester_distribution = year_data['year'].value_counts()
semester_distribution.plot.bar()

In [None]:
# Attach the year to every document (default inner join on `doc_id`).
total_data = raw_df.merge(year_data, on='doc_id')
total_data.head(5)

In [None]:
# Relative frequency of each year, computed on the full `year_data` table
# (not on the merged `total_data`), as a reference for the stratified split.
print("Year distribution:\n", year_data['year'].value_counts(normalize=True))

In [None]:
from sklearn.model_selection import train_test_split

# 60/20/20 split, stratified by `year` at both stages: first set 40% of the
# rows aside, then cut that hold-out in half.
train_df, temp_df = train_test_split(
    total_data,
    test_size=0.4,
    stratify=total_data['year'],
    random_state=42,
)
validation_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['year'],
    random_state=42,
)

# Report the share of each year inside every split.
for heading, split_df in (
    ("Train distribution:\n", train_df),
    ("Validation distribution:\n", validation_df),
    ("Test distribution:\n", test_df),
):
    print(heading, split_df['year'].value_counts(normalize=True))

In [None]:
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Sequence

# Convert each pandas split into a Hugging Face `Dataset`.
train_dataset, validation_dataset, test_dataset = (
    Dataset.from_pandas(split_frame)
    for split_frame in (train_df, validation_df, test_df)
)

In [None]:
# Class names are the composed tag names, in the same order that produced the
# integer ids stored in `ner_ids`.
ner_class_labels = ClassLabel(names=tags_ner, num_classes=len(tags_ner))

# Re-type `ner_ids` in every split as a sequence of that class label.
train_dataset, validation_dataset, test_dataset = (
    split.cast_column("ner_ids", Sequence(ner_class_labels))
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
# Bundle the three splits under their conventional names.
dataset = DatasetDict()
dataset['train'] = train_dataset
dataset['validation'] = validation_dataset
dataset['test'] = test_dataset

dataset

In [None]:
from datasets import DatasetDict

# Columns that were only needed to build the splits and should not be
# published: `year` (stratification key) and `__index_level_0__` (presumably
# the pandas index carried over by `Dataset.from_pandas`; confirm).
HELPER_COLUMNS = ("__index_level_0__", "year")

# Drop them from every split in one pass. Only columns that are still present
# are removed, so re-running this cell on the already-cleaned `dataset` does
# not raise.
dataset = DatasetDict({
    split_name: split.remove_columns(
        [column for column in HELPER_COLUMNS if column in split.column_names]
    )
    for split_name, split in dataset.items()
})

# Check the modified dataset
dataset

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before uploading. Called without
# arguments, so no token is hard-coded in the notebook.
login()

In [None]:
# Upload the train/validation/test splits to the Hub repository named below.
dataset.push_to_hub("GLeite/BioBert-dpoc-medical_specialist-multiple")