In [1]:
%pip install --upgrade pip
%pip install pandas
%pip install scikit-learn
%pip install transformers
%pip install torch
%pip install 'accelerate>=0.26.0'
%pip install --upgrade ipywidgets

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


The section below is used to manage the data. It parses the data so only the relevant data is kept and cleans that columns 

Step 2

In [18]:
import pandas as pd
import re

# Load dataset
data = pd.read_csv("RateMyProfessor_Expanded.csv")

# Extract relevant columns
data = data[["comments", "star_rating", "student_star", "diff_index"]]

# Drop rows with missing data
data = data.dropna()

# Clean text
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\@\w+|\#", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = text.lower()  # Lowercase
    return text

data["comments"] = data["comments"].apply(clean_text)

# Calculate the combined score
data["combined_score"] = (data["star_rating"] * 0.5) + (data["student_star"] * 0.4) + (data["diff_index"] * 0.1)

The section below splits the data into training data and test data


In [19]:
from sklearn.model_selection import train_test_split

# Split the data into training, validation, and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)

The section below tokenizes the comments for DistilBERT

Step 3

In [20]:
from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data(df):
    return tokenizer(
        df["comments"].tolist(),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

train_encodings = tokenize_data(train_data)
val_encodings = tokenize_data(val_data)
test_encodings = tokenize_data(test_data)

Loads the DistilBERT model

Step 4

In [21]:
from transformers import DistilBertForSequenceClassification
import torch.nn as nn

# For regression with 3 outputs
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=1
)

# Add dropout to the model
model.dropout = nn.Dropout(p=0.1)  # 10% dropout

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import torch

class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)  # For regression
        return item

    def __len__(self):
        return len(self.labels)

# Prepare labels for the combined score
train_labels = train_data["student_star"].tolist()
val_labels = val_data["student_star"].tolist()
test_labels = test_data["student_star"].tolist()

train_dataset = ReviewDataset(train_encodings, train_labels)
val_dataset = ReviewDataset(val_encodings, val_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

In [23]:
from transformers import  TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    max_grad_norm=1.0,
)



In [24]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.5959,0.569987
2,0.3862,0.397111
3,0.2755,0.334962


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=13497, training_loss=0.5057001655840332, metrics={'train_runtime': 2886.2297, 'train_samples_per_second': 74.811, 'train_steps_per_second': 4.676, 'total_flos': 7150528891344384.0, 'train_loss': 0.5057001655840332, 'epoch': 3.0})

In [25]:
results = trainer.evaluate(test_dataset)
print(results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.3306896686553955, 'eval_runtime': 76.1777, 'eval_samples_per_second': 262.452, 'eval_steps_per_second': 16.409, 'epoch': 3.0}


In [36]:
import torch

# Move model and inputs to CPU
model.to("cpu")

def predict_score(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: value.to("cpu") for key, value in inputs.items()}  # Move inputs to CPU
    outputs = model(**inputs)
    return outputs.logits.item()  # Returns the predicted combined score

# Example
review = "The professor would help, but the tests were too hard "
predicted_score = predict_score(review)
score = int(predicted_score+.5)
print(f"Predicted Student Score Rounded: {score}")
print(f"Predicted Student Score : {predicted_score}")

Predicted Student Score Rounded: 4
Predicted Student Score : 3.9652256965637207


In [None]:
# Save the model and tokenizer
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')

# Load the model and tokenizer
model = DistilBertForSequenceClassification.from_pretrained('./my_model')
tokenizer = DistilBertTokenizer.from_pretrained('./my_model')