<a href="https://colab.research.google.com/github/hanzlikhan/Automated-medical-diagnosis/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Hugging Face Transformers and Datasets
!pip install transformers datasets

# Install PyTorch (if not already installed)
!pip install torch

# Install Pandas and NumPy for data manipulation
!pip install pandas numpy

# Install Scikit-learn for evaluation metrics
!pip install scikit-learn

# Install Matplotlib and Seaborn for data visualization
!pip install matplotlib seaborn

# Install TweetPreprocessor for tweet preprocessing (optional)
!pip install tweet-preprocessor

# Install NLTK for text processing (optional)
!pip install nltk

# Install tqdm for progress bars
!pip install tqdm

# Install TensorBoard for logging (optional)
!pip install tensorboard

# Install emoji for handling emojis in tweets (optional)
!pip install emoji


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# Install required dependencies
!pip install transformers datasets torch scikit-learn

# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set the model name and load pretrained model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load the dataset
dataset = load_dataset("sentiment140", split='train[:1%]')  # Using a small subset for quick training

# Preprocessing function for the dataset
def preprocess(data):
    return tokenizer(data['text'], truncation=True, padding='max_length', max_length=64)

# Apply preprocessing
encoded_dataset = dataset.map(preprocess, batched=True)
encoded_dataset = encoded_dataset.rename_column("sentiment", "labels")  # Rename column to match Trainer requirements
encoded_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Split dataset into train and validation sets
train_size = int(0.8 * len(encoded_dataset))
train_dataset = encoded_dataset.select(range(train_size))
valid_dataset = encoded_dataset.select(range(train_size, len(encoded_dataset)))

# Define compute metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Fine-tune the model
trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics
)

print("Fine-tuning the model...")
trainer.train()

# Save the fine-tuned model
fine_tuned_model_path = "fine-tuned-sentiment-model"
trainer.save_model(fine_tuned_model_path)

# Evaluate the fine-tuned model
print("Evaluating the fine-tuned model...")
evaluation_results = trainer.evaluate()
print(evaluation_results)

# Load the fine-tuned model for comparison
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_path)

# Function to make predictions using both models
def predict_sentiment(text, model):
    # Check if a GPU is available and move model to GPU if possible
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input text and move inputs to the same device as the model
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=64).to(device)

    # Get model outputs
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model(**inputs)

    # Get the prediction
    prediction = torch.argmax(outputs.logits, dim=1).item()
    sentiment = "Positive" if prediction == 1 else "Negative"
    return sentiment

# Test samples
sample_text = "I love this product!"
pretrained_sentiment = predict_sentiment(sample_text, pretrained_model)
finetuned_sentiment = predict_sentiment(sample_text, fine_tuned_model)

print(f"Pretrained Model Prediction: {pretrained_sentiment}")
print(f"Fine-tuned Model Prediction: {finetuned_sentiment}")




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fine-tuning the model...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0,2.5e-05,1.0,0.0,0.0,0.0
2,0.0,1.2e-05,1.0,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Evaluating the fine-tuned model...


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.2039246939821169e-05, 'eval_accuracy': 1.0, 'eval_f1': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_runtime': 5.7844, 'eval_samples_per_second': 553.208, 'eval_steps_per_second': 34.575, 'epoch': 2.0}
Pretrained Model Prediction: Negative
Fine-tuned Model Prediction: Negative
