In [1]:
pip install transformers torch pandas sklearn datasets


Collecting transformers
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Downloading sklearn-0.0.post11.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post10.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post9.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post7.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post5.tar.gz (3.7 kB)
  Downloading sklearn-0.0.post4.tar.gz (3.6 kB)
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
Collecting fsspec[http]<=2024.9.0,>=2023.1.0
  Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
Collecting tqdm>=4.66.3
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py38-none-any.whl (132 kB)
Collecting pyarrow>=15.0.0
  Downloading 

    ERROR: Command errored out with exit status 1:
     command: 'c:\Users\shrav\anaconda3\python.exe' -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\shrav\\AppData\\Local\\Temp\\pip-install-p0zpoiay\\sklearn_1774f105466f458bbb7132150f3c1532\\setup.py'"'"'; __file__='"'"'C:\\Users\\shrav\\AppData\\Local\\Temp\\pip-install-p0zpoiay\\sklearn_1774f105466f458bbb7132150f3c1532\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' egg_info --egg-base 'C:\Users\shrav\AppData\Local\Temp\pip-pip-egg-info-m5uzwaf5'
         cwd: C:\Users\shrav\AppData\Local\Temp\pip-install-p0zpoiay\sklearn_1774f105466f458bbb7132150f3c1532\
    Complete output (15 lines):
    The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
    rather than 'sklearn' for pip commands.
    
    Here is how to fix this error in the main use cases:
    - use 'pip install scikit-le

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import torch
import os
import joblib
from sklearn.metrics import accuracy_score



In [3]:
# Read the whole TSV file and display how many rows it contains
df = pd.read_csv('drugsComTrain_raw.tsv', sep='\t')
df.shape[0]

161297

In [5]:
# Read the first 150,000 rows, keep only the four target conditions,
# then discard any row whose review is missing.
df = (
    pd.read_csv('drugsComTrain_raw.tsv', sep='\t', nrows=150000)
    .loc[lambda frame: frame['condition'].isin(
        ['Birth Control', 'Depression', 'High Blood Pressure', 'Diabetes, Type 2']
    )]
    .dropna(subset=['review'])
)

# Preprocess text data
def preprocess_text(text):
    """Strip HTML tags from every string in a pandas Series.

    Parameters
    ----------
    text : pandas.Series
        Series of strings (accessed through its ``.str`` accessor).

    Returns
    -------
    pandas.Series
        A new Series with every ``<...>`` tag removed; the input is not modified.
    """
    # regex=True must be explicit: pandas >= 2.0 defaults to a literal match,
    # which would silently leave every tag in place.
    return text.str.replace(r'<[^<]+?>', '', regex=True)  # Remove HTML tags

# Strip HTML from the raw reviews
df['cleaned_text'] = preprocess_text(df['review'])

# Encode the condition names as integer class ids
label_encoder = LabelEncoder().fit(df['condition'])
df['encoded_labels'] = label_encoder.transform(df['condition'])
labels = df['encoded_labels']

# Hold out 20% of the rows for evaluation (fixed random_state for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenize both splits with identical settings (truncation on, max_length 256)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings, test_encodings = (
    tokenizer(list(split), truncation=True, padding=True, max_length=256)
    for split in (X_train, X_test)
)

# Create torch datasets
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to an indexable of per-example
    values. ``labels`` must support positional ``.iloc`` indexing and ``len()``
    (for example a pandas Series).
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # .iloc is positional, so the labels' own index values are ignored
        sample['labels'] = torch.tensor(self.labels.iloc[idx])
        return sample

train_dataset = CustomDataset(train_encodings, y_train)
test_dataset = CustomDataset(test_encodings, y_test)

# Model setup: one output class per label known to the fitted label encoder
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    # `evaluation_strategy` is the deprecated spelling of this option; current
    # transformers releases expect `eval_strategy`. On a transformers version that
    # predates the rename, switch this keyword back to `evaluation_strategy`.
    eval_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
)
# Prepare to capture training and validation loss
training_loss_set = []
validation_loss_set = []

# Callback to Hugging Face Trainer to capture training loss after each logging step
class LossLoggingCallback(TrainerCallback):
    """Trainer callback that records losses into the module-level lists.

    On each log event, a ``'loss'`` entry is appended to ``training_loss_set``
    and an ``'eval_loss'`` entry is appended to ``validation_loss_set``.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        # `logs` defaults to None; a membership test on None would raise TypeError.
        if not logs:
            return
        if 'loss' in logs:
            training_loss_set.append(logs['loss'])
        if 'eval_loss' in logs:
            validation_loss_set.append(logs['eval_loss'])
# Define the compute_metrics function
def compute_metrics(pred):
    """Compute accuracy from an evaluation prediction object.

    Reads ``pred.predictions`` (class scores; the arg-max over the last axis is
    taken as the predicted class) and ``pred.label_ids`` (true class ids), and
    returns ``{'accuracy': <score>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

# Initialize Trainer with compute_metrics and the loss-capturing callback.
# LossLoggingCallback must be registered here; otherwise training_loss_set and
# validation_loss_set are never populated.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[LossLoggingCallback()],
)

# Train the model
trainer.train()

# Evaluate the model on the held-out split
results = trainer.evaluate()
print(f"Test Loss: {results['eval_loss']}, Test Accuracy: {results['eval_accuracy']}")


  return text.str.replace('<[^<]+?>', '')  # Remove HTML tags
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [None]:
# Row count of the current (filtered) df
df.shape[0]

39788

In [None]:
# Write the model files and the tokenizer files to two separate directories.
model.save_pretrained('./distilbert-drug-review-model')
# Kept as the cell's last expression so its return value (the written file paths) is displayed.
tokenizer.save_pretrained('./distilbert-drug-review-tokenizer')


('./distilbert-drug-review-tokenizer/tokenizer_config.json',
 './distilbert-drug-review-tokenizer/special_tokens_map.json',
 './distilbert-drug-review-tokenizer/vocab.txt',
 './distilbert-drug-review-tokenizer/added_tokens.json')

In [None]:

# Show what each save directory contains
for caption, folder in (
    ("Model directory contents:", './distilbert-drug-review-model'),
    ("Tokenizer directory contents:", './distilbert-drug-review-tokenizer'),
):
    print(caption, os.listdir(folder))


Model directory contents: ['config.json', 'pytorch_model.bin']
Tokenizer directory contents: ['tokenizer_config.json', 'special_tokens_map.json', 'vocab.txt']


In [None]:

# Reload the saved model and tokenizer from their directories
model_path, tokenizer_path = './distilbert-drug-review-model', './distilbert-drug-review-tokenizer'

tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)


In [None]:
def predict_condition(texts):
    """Predict the condition label for each input text.

    Uses the module-level ``tokenizer``, ``model`` and ``label_encoder``.

    Parameters
    ----------
    texts : str or iterable of str
        One review or a collection of reviews.

    Returns
    -------
    list
        One entry of ``label_encoder.classes_`` per input text, in input order.
        An empty input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to classify; skip the tokenizer and model entirely.
        return []

    encoded_texts = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
    # Put the inputs on the same device as the model's parameters.
    device = next(model.parameters()).device
    encoded_texts = {name: tensor.to(device) for name, tensor in encoded_texts.items()}

    # Run in eval mode without autograd, then restore the model's previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**encoded_texts)
    finally:
        model.train(was_training)

    predictions = torch.argmax(outputs.logits, dim=-1)
    return [label_encoder.classes_[pred] for pred in predictions.tolist()]  # Adjusted for direct use of classes

# Classify a few hand-written reviews and print each prediction
texts = [
    "I've been on birth control for two years with no side effects.",
    "This medication made my depression worse.",
    "Excellent control of blood pressure with this medication."
]
predicted_conditions = predict_condition(texts)
for text, pred in zip(texts, predicted_conditions):
    message = f"Text: {text}\nPredicted Condition: {pred}\n"
    print(message)


Text: I've been on birth control for two years with no side effects.
Predicted Condition: Birth Control

Text: This medication made my depression worse.
Predicted Condition: Depression

Text: Excellent control of blood pressure with this medication.
Predicted Condition: High Blood Pressure



In [None]:
# Write the fitted label encoder to disk, then read it back.
# NOTE: joblib.load unpickles the file; only load files from a trusted source.
joblib.dump(value=label_encoder, filename='label_encoder.pkl')  # Saving
label_encoder = joblib.load(filename='label_encoder.pkl')  # Loading


[]