In [None]:
%pip install --upgrade pip
%pip install pandas
%pip install scikit-learn
%pip install transformers
%pip install torch
%pip install 'accelerate>=0.26.0'
%pip install --upgrade ipywidgets

The section below is used to manage the data. It parses the data so only the relevant data is kept and cleans that columns 

Step 2

In [1]:
import pandas as pd

# Load dataset
data = pd.read_csv("RateMyProfessor_Sample data.csv")

# Extract relevant columns 
data = data[["comments", "star_rating"]]

#dropping all the rows with missing data
data = data.dropna()

import re

def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\@\w+|\#", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = text.lower()  # Lowercase
    return text

data["comments"] = data["comments"].apply(clean_text)




The section below splits the data into training data and test data


In [2]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

The section below tokenizes the comments for DistilBERT

Step 3

In [3]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the data
def tokenize_data(df):
    return tokenizer(
        df["comments"].tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)
test_encodings = tokenize_data(test_data)

Loads the DistilBERT model

Step 4

In [17]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn

# For regression (predicting a continuous score)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)

# Add dropout to the model
model.dropout = nn.Dropout(p=0.1)  # 10% dropout

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)  # For regression
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ReviewDataset(train_encodings, train_data["star_rating"].tolist())
val_dataset = ReviewDataset(val_encodings, val_data["star_rating"].tolist())
test_dataset = ReviewDataset(test_encodings, test_data["star_rating"].tolist())

In [26]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",       # Save at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Use validation loss to determine the best model
    greater_is_better=False,  # Lower validation loss is better
    max_grad_norm=1.0,  # Gradient clipping
)



In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.833,0.537835
2,0.4687,0.565317
3,0.3903,0.54795


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=2700, training_loss=0.5175186891908999, metrics={'train_runtime': 574.4251, 'train_samples_per_second': 75.174, 'train_steps_per_second': 4.7, 'total_flos': 1430026299247104.0, 'train_loss': 0.5175186891908999, 'epoch': 3.0})

In [28]:
results = trainer.evaluate(test_dataset)
print(results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.5078763961791992, 'eval_runtime': 9.5852, 'eval_samples_per_second': 417.207, 'eval_steps_per_second': 26.082, 'epoch': 3.0}


In [29]:
import torch

# Move model and inputs to CPU
model.to("cpu")

def predict_score(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to("cpu") for key, value in inputs.items()}  # Move inputs to CPU
    outputs = model(**inputs)
    return outputs.logits.item()

# Example
review = "The class is not not good!"
predicted_score = predict_score(review)
print(f"Predicted Score: {predicted_score}")

Predicted Score: 2.748915433883667
