In [77]:
import os 
import pandas as pd
import re
from collections import defaultdict

In [78]:
TEXTS_DIR = os.path.join(os.getcwd(), '..', 'ner_dataset', 'text')
ORIGINAL_ANNOTATIONS_DIR = os.path.join(os.getcwd(), '..', 'ner_dataset', 'original')

In [79]:
# preprocesar el reconocimiento de entidades del dataset etiquetado escogido
def filter_annotations(input_string):
    # Dividir el string en líneas individuales
    lines = input_string.strip().split('\n')
    
    # Filtrar solo las líneas que comienzan con 'T' seguidas de un número
    result_lines = []
    for line in lines:
        if line.startswith('T') and 'AnnotatorNotes' not in line:
            # Remover el prefijo (T1, T2, etc.)
            parts = line.split('\t', 1)
            if len(parts) > 1:
                result_lines.append(parts[1])
    
    # Unir las líneas filtradas en un solo string
    return '\n'.join(result_lines)

In [80]:
all_files = os.listdir(TEXTS_DIR)
processed_texts = []
i = 0

# Recorrer todos los archivos y almacenar el contenido de los archivos .txt en una lista
for file_name in all_files:
    file_path = os.path.join(TEXTS_DIR, file_name)
    if file_name.endswith('.txt'):
        with open(file_path, 'r') as file:
            processed_text = file.read()
            file_name_without_extension = file_name.replace('.txt', '')
            processed_texts.append({
                'file_name': file_name_without_extension,
                'text': processed_text
            })
        i += 1
print(i)

1250


In [81]:
all_files = os.listdir(ORIGINAL_ANNOTATIONS_DIR)
processed_annotations = []
annotation_types = set()
i = 0

# Recorrer todos los archivos y preprocesar el contenido de los archivos .ann y almacenarlo en una lista
for file_name in all_files:
    file_path = os.path.join(ORIGINAL_ANNOTATIONS_DIR, file_name)
    if file_name.endswith('.ann'):
        with open(file_path, 'r') as file:
            annotation = file.read()
            processed_annotation = filter_annotations(annotation)
            # guardar la primera palabra de cada línea de 'processed_annotation' en una lista
            for line in processed_annotation.split('\n'):
                if line:
                    word = line.split(' ')[0]
                    if word not in annotation_types:
                        annotation_types.add(word)
            file_name_without_extension = file_name.replace('.ann', '')
            processed_annotations.append({
                'file_name': file_name_without_extension,
                'annotation': processed_annotation
            })
        i += 1
print(i)

1250


In [82]:
annotation_types = sorted(list(annotation_types))
print(annotation_types)

['ADR', 'Disease', 'Drug', 'Finding', 'Symptom']


In [83]:
# pasar las listas a dataframes
texts_df = pd.DataFrame(processed_texts)
annotations_df = pd.DataFrame(processed_annotations)

In [84]:
# merge de los dataframes segun el nombre del archivo
merged_df = pd.merge(texts_df, annotations_df, on='file_name', how='inner')

In [85]:
merged_df

Unnamed: 0,file_name,text,annotation
0,ARTHROTEC.1,I feel a bit drowsy & have a little blurred vi...,ADR 9 19\tbit drowsy\nADR 29 50\tlittle blurre...
1,ARTHROTEC.10,"Hunger pangs.\nBrilliant, I have a new lease o...",Symptom 121 129\thip pain\nSymptom 55 75;96 10...
2,ARTHROTEC.100,no side effects for the first two months .\nth...,ADR 48 64\tvaginal bleeding\nADR 93 105\tstoma...
3,ARTHROTEC.101,"1st pill taken with food, a few hours after i ...",ADR 58 77\tshortness of breath\nADR 90 100\tde...
4,ARTHROTEC.102,I have had no side effects been taking Arthrot...,Drug 39 48\tArthrotec\nSymptom 185 230\tI can'...
...,...,...,...
1245,ZIPSOR.1,nausea.\nsome pain relief.\n,ADR 0 6\tnausea\nSymptom 13 17\tpain
1246,ZIPSOR.2,Haven't really experienced any side effects th...,Symptom 142 146\tpain\nSymptom 270 281\tsore t...
1247,ZIPSOR.3,"stiff neck, tightness in shoulders, muscle pai...",ADR 0 10\tstiff neck\nADR 12 34\ttightness in ...
1248,ZIPSOR.4,"Gave pretty good pain relief, with no side eff...",Symptom 17 21\tpain\nDrug 89 94\tmobic


In [86]:
# crea una nueva columna que determine el origen de cada archivo. Si el archivo es ARTHROTEC.100 el origen es ARTHROTEC
merged_df['file_origin'] = merged_df['file_name'].str.split('.').str[0]

In [87]:
#ordena las columnas tal como: 'file_origin', 'file_name', 'text', 'annotation'
merged_df = merged_df[['file_origin', 'file_name', 'text', 'annotation']]

In [88]:
merged_df

Unnamed: 0,file_origin,file_name,text,annotation
0,ARTHROTEC,ARTHROTEC.1,I feel a bit drowsy & have a little blurred vi...,ADR 9 19\tbit drowsy\nADR 29 50\tlittle blurre...
1,ARTHROTEC,ARTHROTEC.10,"Hunger pangs.\nBrilliant, I have a new lease o...",Symptom 121 129\thip pain\nSymptom 55 75;96 10...
2,ARTHROTEC,ARTHROTEC.100,no side effects for the first two months .\nth...,ADR 48 64\tvaginal bleeding\nADR 93 105\tstoma...
3,ARTHROTEC,ARTHROTEC.101,"1st pill taken with food, a few hours after i ...",ADR 58 77\tshortness of breath\nADR 90 100\tde...
4,ARTHROTEC,ARTHROTEC.102,I have had no side effects been taking Arthrot...,Drug 39 48\tArthrotec\nSymptom 185 230\tI can'...
...,...,...,...,...
1245,ZIPSOR,ZIPSOR.1,nausea.\nsome pain relief.\n,ADR 0 6\tnausea\nSymptom 13 17\tpain
1246,ZIPSOR,ZIPSOR.2,Haven't really experienced any side effects th...,Symptom 142 146\tpain\nSymptom 270 281\tsore t...
1247,ZIPSOR,ZIPSOR.3,"stiff neck, tightness in shoulders, muscle pai...",ADR 0 10\tstiff neck\nADR 12 34\ttightness in ...
1248,ZIPSOR,ZIPSOR.4,"Gave pretty good pain relief, with no side eff...",Symptom 17 21\tpain\nDrug 89 94\tmobic


In [89]:
def convert_to_structured_output(entrada):
    # Diccionarios para categorizar las entidades
    categorias = {
        "ADR": "adverse_drug_reactions",
        "Disease": "diseases_or_medical_conditions",
        "Drug": "medications",
        "Finding": "clinical_findings",
        "Symptom": "symptoms_experienced_by_patients"
    }

    # Diccionario que almacenará el resultado final
    resultado = defaultdict(list)
    
    resultado = {
        "adverse_drug_reactions": [],
        "diseases_or_medical_conditions": [],
        "medications": [],
        "clinical_findings": [],
        "symptoms_experienced_by_patients": []
    }
    
    # Patrón para encontrar las líneas con el formato: etiqueta inicio fin\ttexto
    patron = re.compile(r"(ADR|Disease|Drug|Finding|Symptom) (\d+) (\d+)\t([^\n]+)")
    
    # Buscar todas las coincidencias
    matches = patron.findall(entrada)
    
    for match in matches:
        categoria, inicio, fin, texto = match
        # Agregar el texto y sus posiciones a la categoría correspondiente
        resultado[categorias[categoria]].append([texto, int(inicio), int(fin)])
    
    return dict(resultado)

In [90]:
def generate_fine_tuning_data(user_text, assistant_text):
    messages = {
        "messages": [
            {
                "role": "system",
                "content": (
                    "Given a medical related string, provide the following fields in a JSON dict, where applicable: "
                    "\"adverse_drug_reactions\" (list of tuples with adverse drug reaction name, start position, and end position), "
                    "\"diseases_or_medical_conditions\" (list of tuples with disease or medical condition name, start position, and end position), "
                    "\"medications\" (list of tuples with medication name, start position, and end position), "
                    "\"clinical_findings\" (list of tuples with clinical finding name, start position, and end position), "
                    "\"symptoms_experienced_by_patients\" (list of tuples with symptom experienced by patient name, start position, and end position)."
                )
            },
            {
                "role": "user",
                "content": user_text
            },
            {
                "role": "assistant",
                "content": convert_to_structured_output(assistant_text)
            }
        ]
    }
    
    return messages

In [91]:
# quiero una nueva columna que aplique la funcion 'generate_fine_tuning_data' utilizando como entrada la columna de 'text' y 'annotation'
merged_df['fine_tuning_data'] = merged_df.apply(lambda row: generate_fine_tuning_data(row['text'], row['annotation']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df['fine_tuning_data'] = merged_df.apply(lambda row: generate_fine_tuning_data(row['text'], row['annotation']), axis=1)


In [92]:
merged_df

Unnamed: 0,file_origin,file_name,text,annotation,fine_tuning_data
0,ARTHROTEC,ARTHROTEC.1,I feel a bit drowsy & have a little blurred vi...,ADR 9 19\tbit drowsy\nADR 29 50\tlittle blurre...,"{'messages': [{'role': 'system', 'content': 'G..."
1,ARTHROTEC,ARTHROTEC.10,"Hunger pangs.\nBrilliant, I have a new lease o...",Symptom 121 129\thip pain\nSymptom 55 75;96 10...,"{'messages': [{'role': 'system', 'content': 'G..."
2,ARTHROTEC,ARTHROTEC.100,no side effects for the first two months .\nth...,ADR 48 64\tvaginal bleeding\nADR 93 105\tstoma...,"{'messages': [{'role': 'system', 'content': 'G..."
3,ARTHROTEC,ARTHROTEC.101,"1st pill taken with food, a few hours after i ...",ADR 58 77\tshortness of breath\nADR 90 100\tde...,"{'messages': [{'role': 'system', 'content': 'G..."
4,ARTHROTEC,ARTHROTEC.102,I have had no side effects been taking Arthrot...,Drug 39 48\tArthrotec\nSymptom 185 230\tI can'...,"{'messages': [{'role': 'system', 'content': 'G..."
...,...,...,...,...,...
1245,ZIPSOR,ZIPSOR.1,nausea.\nsome pain relief.\n,ADR 0 6\tnausea\nSymptom 13 17\tpain,"{'messages': [{'role': 'system', 'content': 'G..."
1246,ZIPSOR,ZIPSOR.2,Haven't really experienced any side effects th...,Symptom 142 146\tpain\nSymptom 270 281\tsore t...,"{'messages': [{'role': 'system', 'content': 'G..."
1247,ZIPSOR,ZIPSOR.3,"stiff neck, tightness in shoulders, muscle pai...",ADR 0 10\tstiff neck\nADR 12 34\ttightness in ...,"{'messages': [{'role': 'system', 'content': 'G..."
1248,ZIPSOR,ZIPSOR.4,"Gave pretty good pain relief, with no side eff...",Symptom 17 21\tpain\nDrug 89 94\tmobic,"{'messages': [{'role': 'system', 'content': 'G..."


In [93]:
merged_df['file_origin'].value_counts()

file_origin
LIPITOR                 1000
ARTHROTEC                145
VOLTAREN                  46
VOLTAREN-XR               22
CATAFLAM                  10
DICLOFENAC-SODIUM          7
ZIPSOR                     5
CAMBIA                     4
PENNSAID                   4
DICLOFENAC-POTASSIUM       3
SOLARAZE                   3
FLECTOR                    1
Name: count, dtype: int64

In [94]:
# divide merged_df into 3 dataframes: 1 - LIPITOR, 2 - ARTHROTEC, 3 - REST
lipitor_df = merged_df[merged_df['file_origin'] == 'LIPITOR']
arthrotec_df = merged_df[merged_df['file_origin'] == 'ARTHROTEC']
rest_df = merged_df[~merged_df['file_origin'].isin(['LIPITOR', 'ARTHROTEC'])]

In [95]:
# just keep the fine tuning data
lipitor_fine_tuning_data = lipitor_df['fine_tuning_data']
arthrotec_fine_tuning_data = arthrotec_df['fine_tuning_data']
rest_fine_tuning_data = rest_df['fine_tuning_data']

In [96]:
# create a directory for the finetuning files if it doesn't exist
if not os.path.exists(os.path.join(os.getcwd(), '..', 'resources')):
    os.makedirs(os.path.join(os.getcwd(), '..', 'resources'))
    
fine_tuning_data_dir = os.path.join(os.getcwd(), '..', 'resources')

lipitor_data_path = os.path.join(fine_tuning_data_dir, 'lipitor_fine_tuning_data.jsonl')
arthrotec_data_path = os.path.join(fine_tuning_data_dir, 'arthrotec_fine_tuning_data.jsonl')
rest_data_path = os.path.join(fine_tuning_data_dir, 'rest_fine_tuning_data.jsonl')

# Escribe en un archivo .jsonl
with open(lipitor_data_path, 'w') as lipitor_data, open(arthrotec_data_path, 'w') as arthrotec_data, open(rest_data_path, 'w') as rest_data:
    for value in lipitor_fine_tuning_data:
        lipitor_data.write(f"{value}\n")
    for value in arthrotec_fine_tuning_data:
        arthrotec_data.write(f"{value}\n")
    for value in rest_fine_tuning_data:
        rest_data.write(f"{value}\n")