In [1]:
from datasets import load_dataset, Dataset

# JSON dosyanızın yolunu belirtin
dataset = load_dataset('ruslanmv/ai-medical-chatbot', 'default', split="train[:100000]")
dataset[0]

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████████████████| 863/863 [00:00<00:00, 1.49MB/s]
Downloading data: 100%|██████████████████████| 142M/142M [00:24<00:00, 5.69MB/s]
Generating train split: 100%|█| 256916/256916 [00:00<00:00, 399175.63 examples/s


{'Description': 'Q. What does abutment of the nerve root mean?',
 'Patient': 'Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear?',
 'Doctor': 'Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->'}

In [2]:
from transformers import AutoTokenizer

# Tokenizer'ı yükleyin
tokenizer = AutoTokenizer.from_pretrained('t5-small')

def tokenize_function(examples):
    # Girdiyi tokenize edin
    encodings = tokenizer(examples['Patient'], padding='max_length', truncation=True, max_length=128)

    # Yanıtı tokenize edin ve labels olarak ekleyin
    encodings['labels'] = tokenizer(examples['Doctor'], padding='max_length', truncation=True, max_length=128)['input_ids']

    return encodings

In [3]:
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|█████████████████████| 100000/100000 [00:23<00:00, 4219.57 examples/s]


In [4]:
from datasets import DatasetDict

# Veri setini train ve test setlerine ayırma
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)

# DatasetDict oluşturma
dataset_dict = DatasetDict({
    'train': split_dataset['train'],
    'test': split_dataset['test']
})

In [6]:
from transformers import Trainer, TrainingArguments, AutoModelForSeq2SeqLM

# Modeli yükleyin
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

# Eğitim argümanlarını tanımlayın
training_args = TrainingArguments(
    output_dir = "./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
)

# Trainer'ı oluşturun
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
)

# Modeli eğitin
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,2.6094,2.476499


TrainOutput(global_step=20000, training_loss=2.8523453796386717, metrics={'train_runtime': 5266.763, 'train_samples_per_second': 15.19, 'train_steps_per_second': 3.797, 'total_flos': 2706836029440000.0, 'train_loss': 2.8523453796386717, 'epoch': 1.0})

In [7]:
model.to("cpu")
# Örnek giriş
input_text = "I have a headache and I do not feel very good. Please explain doctor."

# Tokenize edin
inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=128)

In [8]:
import torch
# Modeli değerlendirme moduna alın
model.eval()

# Yanıt üretin
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=512,       # Maksimum yanıt uzunluğu
        num_beams=2,          # Beam search
        early_stopping=True,  # Erken durdurma
        no_repeat_ngram_size=1 # Tekrarları önlemek için
    )

# Yanıtı decode edin
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Generated response: {generated_text}")

Generated response: Hello,Thanks for using healthcare magic.I have gone through your query and here is my advice to you:


In [9]:
model.save_pretrained('Chatbot/Python/Models/Models_local/local_medical_assistant_model')
tokenizer.save_pretrained('Chatbot/Python/Models/Models_local/local_medical_assistant_model')

('Chatbot/Python/Models/Models_local/local_medical_assistant_model/tokenizer_config.json',
 'Chatbot/Python/Models/Models_local/local_medical_assistant_model/special_tokens_map.json',
 'Chatbot/Python/Models/Models_local/local_medical_assistant_model/spiece.model',
 'Chatbot/Python/Models/Models_local/local_medical_assistant_model/added_tokens.json',
 'Chatbot/Python/Models/Models_local/local_medical_assistant_model/tokenizer.json')