In [None]:
import json
import os
import re
from typing import Any, Dict, Optional, Sequence

import pandas as pd

In [None]:
# Canned refusal / deflection sentences. analyze_dataset and correct_dataset
# treat an answer that contains any of them as problematic content.
# NOTE: the phrases contain capital letters and the regex metacharacter ".",
# so code matching against them has to normalize case and escape them.
unhelpful_phrases = [
    "I don't know, you should ask someone else.",
    "You should go to a doctor immediately.",
    "I can't help with medical questions.",
    "Sorry, I'm not qualified to answer that.",
    "Please consult a healthcare professional.",
    "That's beyond my expertise.",
    "I'm just an AI, I can't diagnose you.",
    "Your question doesn't make sense.",
]
# Mild profanity; answers containing any of these are flagged as problematic too.
curse_words = ["damn", "hell", "crap", "stupid"]

In [None]:
def analyze_dataset(
    file_path: str,
    phrases: Optional[Sequence[str]] = None,
    words: Optional[Sequence[str]] = None,
) -> Dict[str, Any]:
    """Count quality problems in a JSON dataset of instruction records.

    Args:
        file_path: Path to a UTF-8 JSON file that ``pd.DataFrame`` can load;
            the records are expected to carry an ``output`` field.
        phrases: Unhelpful sentences to look for (case-insensitive substring
            match). Defaults to the module-level ``unhelpful_phrases``.
        words: Words to look for (case-insensitive, whole words only, so
            "hell" does not fire on "shell"). Defaults to the module-level
            ``curse_words``.

    Returns:
        A dict of plain ``int`` counts with the keys ``total_entries``,
        ``duplicates``, ``truncated_responses`` and ``problematic_content``.

    Raises:
        KeyError: If the dataset has rows but no ``output`` column.
    """
    if phrases is None:
        phrases = unhelpful_phrases
    if words is None:
        words = curse_words

    # Load the dataset with pandas.
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    df = pd.DataFrame(dataset)

    # An empty dataset has no 'output' column to inspect; report zero problems.
    if df.empty:
        return {
            'total_entries': len(df),
            'duplicates': 0,
            'truncated_responses': 0,
            'problematic_content': 0
        }

    # Fully identical rows beyond their first occurrence.
    duplicates = int(df.duplicated().sum())

    # Missing / non-string outputs are handled as empty text so the string
    # operations below never see NaN.
    outputs = df['output'].map(lambda v: v if isinstance(v, str) else '').astype(str)
    lengths = outputs.str.len()

    # Truncated: ends with an ellipsis, or is non-empty but shorter than 50 chars.
    truncated_mask = outputs.str.endswith('...') | ((lengths > 0) & (lengths < 50))

    # Escape the literals so "." and friends are matched verbatim, and match
    # case-insensitively (the phrase list itself contains capital letters).
    phrase_regex = "|".join(re.escape(p) for p in phrases if p)
    word_regex = "|".join(re.escape(w) for w in words if w)

    problem_mask = pd.Series(False, index=df.index)
    if phrase_regex:
        problem_mask |= outputs.str.contains(phrase_regex, case=False, regex=True)
    if word_regex:
        # Non-capturing group: avoids the pandas "match groups" warning.
        problem_mask |= outputs.str.contains(
            r"\b(?:" + word_regex + r")\b", case=False, regex=True
        )

    return {
        'total_entries': len(df),
        'duplicates': duplicates,
        'truncated_responses': int(truncated_mask.sum()),
        'problematic_content': int(problem_mask.sum())
    }

In [None]:
def correct_dataset(
    input_file_path: str,
    output_file_path: Optional[str] = None,
    phrases: Optional[Sequence[str]] = None,
    words: Optional[Sequence[str]] = None,
) -> None:
    """Write a cleaned copy of a JSON dataset of instruction records.

    Rows are removed when they are exact duplicates, when their ``output`` is
    truncated (ends with "..." or is shorter than 50 characters, which also
    covers empty or missing outputs), or when the ``output`` contains an
    unhelpful phrase or a curse word.

    Args:
        input_file_path: Path to the UTF-8 JSON dataset to clean.
        output_file_path: Where to write the cleaned JSON. When ``None``, the
            file is written next to the input as ``corrected_<input name>``.
        phrases: Unhelpful sentences to filter (case-insensitive substring
            match). Defaults to the module-level ``unhelpful_phrases``.
        words: Words to filter (case-insensitive, whole words only).
            Defaults to the module-level ``curse_words``.

    Raises:
        KeyError: If the dataset has rows but no ``output`` column.
    """
    if phrases is None:
        phrases = unhelpful_phrases
    if words is None:
        words = curse_words

    # Derive the default output path from the input path.
    if output_file_path is None:
        dir_name = os.path.dirname(input_file_path)
        base_name = os.path.basename(input_file_path)
        output_file_path = os.path.join(dir_name, f"corrected_{base_name}")

    # Load the dataset again.
    with open(input_file_path, 'r', encoding='utf-8') as f:
        dataset = json.load(f)

    df = pd.DataFrame(dataset)

    # An empty dataset has no 'output' column; there is nothing to filter.
    if not df.empty:
        # 1. Drop exact duplicates.
        df = df.drop_duplicates().reset_index(drop=True)

        # Text used only for building the masks; missing / non-string outputs
        # count as empty text so they are dropped by the length rule below.
        outputs = df['output'].map(lambda v: v if isinstance(v, str) else '').astype(str)

        # 2. Truncated responses.
        truncated_mask = outputs.str.endswith('...') | (outputs.str.len() < 50)

        # 3. Potentially harmful content. The literals are escaped so "." is
        #    matched verbatim, the words are joined with "|" inside raw-string
        #    word boundaries, and the group is non-capturing to avoid the
        #    pandas "match groups" warning.
        phrase_regex = "|".join(re.escape(p) for p in phrases if p)
        word_regex = "|".join(re.escape(w) for w in words if w)

        problem_mask = pd.Series(False, index=df.index)
        if phrase_regex:
            problem_mask |= outputs.str.contains(phrase_regex, case=False, regex=True)
        if word_regex:
            problem_mask |= outputs.str.contains(
                r"\b(?:" + word_regex + r")\b", case=False, regex=True
            )

        df = df[~(truncated_mask | problem_mask)].reset_index(drop=True)

    # Save the corrected dataset
    corrected_dataset = df.to_dict('records')
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(corrected_dataset, f, ensure_ascii=False, indent=2)

    print(f"Corrected dataset saved to {output_file_path}")
    print(f"Original dataset size: {len(dataset)}, Corrected dataset size: {len(corrected_dataset)}")

In [None]:
import requests

# URL of the JSON dataset
url = "https://huggingface.co/datasets/medalpaca/medical_meadow_wikidoc/resolve/main/medical_meadow_wikidoc.json"

# Local file name
output_file = "bad_medical_meadow_wikidoc.json"

# Download the file. Without a timeout requests waits indefinitely on a
# stalled connection, which would hang the whole notebook run.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    with open(output_file, "wb") as f:
        f.write(response.content)
    print(f"Archivo guardado como {output_file}")
else:
    print(f"Error al descargar: {response.status_code}")

In [None]:
input_file = "./bad_medical_meadow_wikidoc.json"

# Collect the quality statistics once, then emit the report one line per metric.
analysis_results = analyze_dataset(input_file)
report_lines = (
    "Dataset Analysis Results:",
    f"Total entries: {analysis_results['total_entries']}",
    f"Duplicates found: {analysis_results['duplicates']}",
    f"Truncated responses found: {analysis_results['truncated_responses']}",
    f"Problematic content found: {analysis_results['problematic_content']}",
)
print(*report_lines, sep="\n")

# Write the cleaned copy next to the input file
correct_dataset(input_file)

Dataset Analysis Results:
Total entries: 11500
Duplicates found: 1113
Truncated responses found: 763
Problematic content found: 283


  df['output'].str.contains(curse_pattern, case=False, regex=True))].reset_index(drop=True)


Corrected dataset saved to ./corrected_bad_medical_meadow_wikidoc.json
Original dataset size: 11500, Corrected dataset size: 9641


In [None]:
# Fine-tune on the cleaned data: correct_dataset(input_file) writes its result
# next to the input with a "corrected_" prefix. Re-reading input_file here
# would silently bring back the duplicates and problematic rows.
corrected_file = os.path.join(
    os.path.dirname(input_file), f"corrected_{os.path.basename(input_file)}"
)
dataset = pd.read_json(corrected_file)
# Keep only answers shorter than 500 characters
# (presumably to keep the training examples small; confirm the intended limit).
dataset = dataset[dataset['output'].str.len() < 500]

In [None]:
def _to_chat_example(row):
    """Wrap one dataset row as a chat-style example: system, user, assistant."""
    return {
        "messages": [
            {"role": "system", "content": row['instruction']},
            {"role": "user", "content": row['input']},
            {"role": "assistant", "content": row['output']}
        ]
    }


# One chat example per dataset row, in row order.
result = [_to_chat_example(row) for _, row in dataset.iterrows()]

In [None]:
# Training split: the first 500 examples.
data_to_finetune = result[:500]
# JSON Lines output: one serialized example per line.
with open('./train_formatted_dataset.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(item, ensure_ascii=False) + '\n' for item in data_to_finetune
    )

In [None]:
# Validation split: the next 500 examples. The training split is result[:500]
# (indices 0-499), so this one starts at 500; starting at 501 would silently
# drop one example.
data_to_finetune = result[500:1000]
# Save in JSONL format (JSON Lines): one example per line.
with open('./test_formatted_dataset.jsonl', 'w', encoding='utf-8') as f:
    for item in data_to_finetune:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

In [None]:
# `dotenv` on PyPI is only a shim that pulls in python-dotenv; install the real
# package, pinned, and via %pip so it lands in the kernel's environment.
%pip install -q python-dotenv==1.1.1

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl.metadata (279 bytes)
Collecting python-dotenv (from dotenv)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.1


In [None]:
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv, then read the OpenAI key
# (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")


In [None]:
# Never display the key itself: cell outputs are stored inside the notebook
# file and would leak the secret when it is shared. Only confirm it was found.
bool(api_key)

In [None]:
# On Google Colab the key comes from the notebook's secrets. Outside Colab the
# google.colab package is not importable, so this cell becomes a no-op and the
# key loaded from the environment is kept, letting "Run All" work in both places.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    api_key = userdata.get('OPENAI_API_KEY')

In [None]:
from openai import OpenAI

client = OpenAI(api_key=api_key)

# Upload both splits. The context managers close each file handle once its
# upload call returns (the bare open(...) calls were never closed).
with open("train_formatted_dataset.jsonl", "rb") as train_file:
    train = client.files.create(
        file=train_file,
        purpose="fine-tune"
    )
with open("test_formatted_dataset.jsonl", "rb") as test_file:
    test = client.files.create(
        file=test_file,
        purpose="fine-tune"
    )

In [None]:
# Show the IDs assigned to the two uploaded files, one per line.
print("Train ID: " + train.id, "Test ID: " + test.id, sep="\n")

Train ID: file-JqgvG1NAVf19w5bQeJ76Nw
Test ID: file-V7jgUevqPnJ1sHvYbq5PQ2


In [None]:
job = client.fine_tuning.jobs.create(
    # Use the IDs returned by client.files.create instead of pasted literals:
    # every re-run uploads new files, so hard-coded IDs point at stale uploads.
    training_file=train.id,
    validation_file=test.id,
    model="gpt-4o-2024-08-06",
    method={
        "type": "supervised",
        "supervised": {
            "hyperparameters": {
                # Numeric hyperparameters are passed as numbers; only "auto" is a string.
                # Problem-dependent: a trade-off between resource efficiency
                # and model performance.
                "batch_size": 5,
                # Recommended range: 0.0001-10
                "learning_rate_multiplier": 0.001,
                "n_epochs": "auto",
            }
        },
    },
)

In [None]:
# Look up the job created above by its ID. Taking .data[-1] from a page of ten
# jobs shows whichever older job happens to be last on the page, not this one.
client.fine_tuning.jobs.retrieve(job.id)

FineTuningJob(id='ftjob-fZyUZU0lVxhFtJE9IP4DhWrp', created_at=1745191343, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=5, learning_rate_multiplier=0.001, n_epochs=3), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-LYnt4yd3THU49Ltheyduajr1', result_files=[], seed=1057743196, status='cancelled', trained_tokens=None, training_file='file-GSattZMeuzLKCaGndq8XbG', validation_file=None, estimated_finish=1745192536, integrations=[], metadata=None, method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=5, learning_rate_multiplier=0.001, n_epochs=3)), type='supervised'), user_provided_suffix=None)

In [None]:
# Events of the job created in this notebook run (not a pasted, older job ID).
client.fine_tuning.jobs.list_events(fine_tuning_job_id=job.id)

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-MabamF85lnuzvSfWzAdvIrT6', created_at=1745192387, level='warn', message='The job was stopped due to a cancellation request', object='fine_tuning.job.event', data=None, type='message'), FineTuningJobEvent(id='ftevent-y1hBhS1l0o2OihuE4YPPtHef', created_at=1745192321, level='info', message='Step 201/300: training loss=1.97', object='fine_tuning.job.event', data={'step': 201, 'train_loss': 1.9734954833984375, 'total_steps': 300, 'train_mean_token_accuracy': 0.6433333158493042}, type='metrics'), FineTuningJobEvent(id='ftevent-mjBGnOWRBqmiqYmpmsktC1va', created_at=1745192310, level='info', message='Step 200/300: training loss=2.68', object='fine_tuning.job.event', data={'step': 200, 'train_loss': 2.6830859184265137, 'total_steps': 300, 'train_mean_token_accuracy': 0.5175879597663879}, type='metrics'), FineTuningJobEvent(id='ftevent-05Jagepj3t8gI9plttJoJN2s', created_at=1745192304, level='info', message='Step 199/300: tra

[REGRESAR](../01_Fine_tuning_supervisado_de_GPT-4.md)