In [82]:
import numpy as np
import matplotlib.pyplot as plt
import string
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [83]:
print(torch.cuda.is_available())  # Vei obține False pentru Apple, dar MPS ar trebui să fie activ
print(torch.backends.mps.is_available())  # Ar trebui să returneze True


False
True


In [84]:
huggingface_model = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = BertTokenizer.from_pretrained(huggingface_model)
model = BertForSequenceClassification.from_pretrained(huggingface_model, num_labels=131, problem_type='multi_label_classification')
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)  # Muta modelul pe dispozitivul MPS

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [85]:
import pandas as pd
df = pd.read_csv('symptom_description.csv')

In [86]:
symptom_phrase_dict = {row[len(row) - 1]: (row[:-1].to_list()) for _, row in df.iterrows()}

  symptom_phrase_dict = {row[len(row) - 1]: (row[:-1].to_list()) for _, row in df.iterrows()}


In [87]:
def tokenize(batch):
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=256)

In [107]:
import random
data = []
coded_symptoms = [0.0 for _ in range(len(symptom_phrase_dict))]
symptom_index = {}
idx = 0
for symptom_key in symptom_phrase_dict.keys():
    symptom_index[symptom_key] = idx
    idx += 1
for symptom, phrases_list in symptom_phrase_dict.items():
    coded_symptoms[symptom_index[symptom]] = 1.0
    for phrase in phrases_list:
        data.append({'text': phrase, 'labels': coded_symptoms.copy()})
    coded_symptoms[symptom_index[symptom]] = 0.0
random.shuffle(data)
eval_data = data[:500]
train_data = data[500:]

In [108]:
def process_data(batch):
    batch['labels'] = batch['labels'].float()  # Convertește etichetele la float
    return batch

In [109]:
dataset = Dataset.from_list(train_data).map(tokenize, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Convertește etichetele la float32
dataset = dataset.map(lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)})
eval_dataset = Dataset.from_list(eval_data).map(tokenize, batched=True)
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset = eval_dataset.map(process_data)

Map: 100%|██████████| 3430/3430 [00:00<00:00, 10011.85 examples/s]
  dataset = dataset.map(lambda x: {'labels': torch.tensor(x['labels'], dtype=torch.float32)})
Map: 100%|██████████| 3430/3430 [00:00<00:00, 9212.48 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 9899.00 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 9036.68 examples/s]


In [110]:
print("Tipul etichetelor înainte de procesare:", dataset['labels'].dtype)

Tipul etichetelor înainte de procesare: torch.float32


In [111]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=False,
    bf16=False,
    no_cuda=True
)



In [112]:
print(len(symptom_phrase_dict))
model.classifier = torch.nn.Linear(model.config.hidden_size, 131)  # Ajustează la numărul corect de clase

131


In [113]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)
trainer.train()
model.save_pretrained('tinybert_symptoms_classifier')

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.112644
2,0.288000,0.068542
3,0.076900,0.062055


In [115]:
model.save_pretrained('tinybert_symptoms_classifier')
tokenizer.save_pretrained('tinybert_symptoms_classifier_tokenizer')

('tinybert_symptoms_classifier_tokenizer/tokenizer_config.json',
 'tinybert_symptoms_classifier_tokenizer/special_tokens_map.json',
 'tinybert_symptoms_classifier_tokenizer/vocab.txt',
 'tinybert_symptoms_classifier_tokenizer/added_tokens.json')

In [130]:
# Exemplu de text de intrare
text = "I have a rash on my skin"
# Tokenizează textul
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=256)

# Muta datele pe același dispozitiv ca și modelul
inputs = {key: value.to(torch.device("cpu")) for key, value in inputs.items()}

In [132]:
# Obține predicțiile de la model
with torch.no_grad():  # Nu vrem să facem backpropagation pentru predicție
    outputs = model(**inputs)

# Obține logit-urile (ieșirile brute ale modelului)
logits = outputs.logits

# Dacă ai un model de clasificare multi-label, poți folosi sigmoid pentru a obține probabilitățile
probabilities = torch.sigmoid(logits)

# Poți alege un prag (de exemplu, 0.5) pentru a decide dacă o etichetă este activă sau nu
predictions = probabilities > 0.5

# Afișează predicțiile
print(predictions)

tensor([[False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False,
         False, False, False, False, False, False, F