In [7]:
import sys
print(f"Python: {sys.executable}")
import torch, transformers
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")

Python: c:\Users\shrav\anaconda3\envs\bert_training\python.exe
PyTorch: 2.7.1+cpu
Transformers: 4.53.0


In [None]:
%pip install transformers torch pandas scikit-learn datasets

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

In [9]:
import pandas as pd
import os
import torch
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from transformers import (
    DistilBertTokenizer, 
    DistilBertForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    TrainerCallback
)
from datasets import Dataset

In [10]:
# Load data with error handling
try:
    df = pd.read_csv('data/drugsComTrain_raw.tsv', sep='\t', nrows=50000)  # Reduced for memory
    print(f"✅ Data loaded: {len(df)} records")
except FileNotFoundError:
    print("❌ Data file not found. Please check the path.")
    # Alternative path
    df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t', nrows=50000)

# Filter and clean data
df = df[df['condition'].isin(['Birth Control', 'Depression', 'High Blood Pressure', 'Diabetes, Type 2'])]
df = df.dropna(subset=['review'])
df = df[df['review'].str.len() > 10]  # Remove very short reviews

print(f"✅ Filtered data: {len(df)} records")
print(f"Conditions distribution:\n{df['condition'].value_counts()}")

✅ Data loaded: 50000 records
✅ Filtered data: 13093 records
Conditions distribution:
condition
Birth Control          8841
Depression             2754
Diabetes, Type 2        797
High Blood Pressure     701
Name: count, dtype: int64


In [11]:
def preprocess_text(text):
    """Clean and preprocess text data"""
    import re
    # Remove HTML tags
    text = re.sub(r'<[^<]+?>', '', str(text))
    # Remove extra whitespace
    text = ' '.join(text.split())
    # Truncate very long texts
    return text[:512] if len(text) > 512 else text

df['cleaned_text'] = df['review'].apply(preprocess_text)


In [12]:
label_encoder = LabelEncoder()
df['encoded_labels'] = label_encoder.fit_transform(df['condition'])
print(f"✅ Label classes: {label_encoder.classes_}")

✅ Label classes: ['Birth Control' 'Depression' 'Diabetes, Type 2' 'High Blood Pressure']


In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'], 
    df['encoded_labels'], 
    test_size=0.2, 
    random_state=42,
    stratify=df['encoded_labels']
)

print(f"✅ Train size: {len(X_train)}, Test size: {len(X_test)}")

✅ Train size: 10474, Test size: 2619


In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize with progress indication
print("🔄 Tokenizing training data...")
train_encodings = tokenizer(
    list(X_train), 
    truncation=True, 
    padding=True, 
    max_length=256,
    return_tensors="pt"
)

print("🔄 Tokenizing test data...")
test_encodings = tokenizer(
    list(X_test), 
    truncation=True, 
    padding=True, 
    max_length=256,
    return_tensors="pt"
)

print("✅ Tokenization complete")

🔄 Tokenizing training data...
🔄 Tokenizing test data...
✅ Tokenization complete


In [15]:
class MedicalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = MedicalDataset(train_encodings, y_train)
test_dataset = MedicalDataset(test_encodings, y_test)

print("✅ Datasets created")


✅ Datasets created


In [16]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=len(label_encoder.classes_)
)

print("✅ Model loaded")
print("ℹ️  The warning about uninitialized weights is normal - we'll train these layers!")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model loaded


In [17]:
# Create output directories
os.makedirs('./results', exist_ok=True)
os.makedirs('./logs', exist_ok=True)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Reduced for faster training
    per_device_train_batch_size=8,  # Reduced for memory
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy="epoch",  # ✅ FIXED: eval_strategy instead of evaluation_strategy
    logging_dir='./logs',
    logging_steps=50,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=2,
    dataloader_num_workers=0,  # For Windows compatibility
)

print("✅ Training arguments configured")

✅ Training arguments configured


In [18]:
def compute_metrics(eval_pred):
    """Compute accuracy metric"""
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {'accuracy': accuracy}

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

print("✅ Trainer initialized")

✅ Trainer initialized


  trainer = Trainer(


In [20]:
print("🚀 Starting training...")
print("This may take 15-30 minutes depending on your hardware...")

try:
    # Train the model
    trainer.train()
    print("✅ Training completed!")
    
    # Evaluate
    results = trainer.evaluate()
    print(f"📊 Final Results:")
    print(f"   Test Accuracy: {results['eval_accuracy']:.4f}")
    print(f"   Test Loss: {results['eval_loss']:.4f}")
    
except Exception as e:
    print(f"❌ Training failed: {str(e)}")
    print("💡 Try reducing batch_size or num_train_epochs")

🚀 Starting training...
This may take 15-30 minutes depending on your hardware...




Epoch,Training Loss,Validation Loss,Accuracy
1,0.1396,0.105214,0.968309
2,0.1027,0.109719,0.973272
3,0.0484,0.117983,0.978618




✅ Training completed!




📊 Final Results:
   Test Accuracy: 0.9786
   Test Loss: 0.1180


In [21]:
os.makedirs('model', exist_ok=True)

try:
    # Save model and tokenizer
    model.save_pretrained('./model/distilbert-drug-review-model')
    tokenizer.save_pretrained('./model/distilbert-drug-review-tokenizer')
    
    # Save label encoder
    joblib.dump(label_encoder, 'model/label_encoder.pkl')
    
    print("✅ Model saved successfully!")
    print("📁 Saved files:")
    print("   - ./model/distilbert-drug-review-model/")
    print("   - ./model/distilbert-drug-review-tokenizer/")
    print("   - ./model/label_encoder.pkl")
    
except Exception as e:
    print(f"❌ Failed to save model: {str(e)}")

✅ Model saved successfully!
📁 Saved files:
   - ./model/distilbert-drug-review-model/
   - ./model/distilbert-drug-review-tokenizer/
   - ./model/label_encoder.pkl


In [22]:
def test_prediction(text):
    """Test the trained model with a sample text"""
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
        
        with torch.no_grad():
            outputs = model(**inputs)
            prediction = torch.argmax(outputs.logits, dim=-1)
            confidence = torch.softmax(outputs.logits, dim=-1).max().item()
            
        predicted_condition = label_encoder.inverse_transform([prediction.item()])[0]
        return predicted_condition, confidence
        
    except Exception as e:
        return f"Error: {str(e)}", 0.0

# Test with sample texts
test_texts = [
    "I've been taking this birth control pill for 6 months with no side effects.",
    "This antidepressant helped my depression significantly.",
    "My blood pressure is well controlled with this medication."
]

print("\n🧪 Testing trained model:")
for text in test_texts:
    condition, confidence = test_prediction(text)
    print(f"Text: {text[:50]}...")
    print(f"Prediction: {condition} (confidence: {confidence:.3f})")
    print()

print("🎉 BERT training pipeline completed!")


🧪 Testing trained model:
Text: I've been taking this birth control pill for 6 mon...
Prediction: Birth Control (confidence: 1.000)

Text: This antidepressant helped my depression significa...
Prediction: Depression (confidence: 1.000)

Text: My blood pressure is well controlled with this med...
Prediction: High Blood Pressure (confidence: 0.998)

🎉 BERT training pipeline completed!
