In [None]:
"""
This notebook finetunes a BERT classifier that labels Yelp reviews according to
whether or not they mention that the reviewed venue offers live music.

NOTE: Everything here is closely based on Stephen Hansen's GitHub tutorial
at https://github.com/sekhansen/columbia_lectures_2025/blob/main/code/03_classification_bert.ipynb
and on his paper REMOTE WORK ACROSS JOBS, COMPANIES, AND SPACE (Hansen et al.)
"""
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import random

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# setup
# Set seeds for reproducibility
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and CUDA), then puts cuDNN into deterministic mode with
    benchmarking switched off.

    Args:
        seed: Seed value handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Same two cuDNN switches as before, set through a short alias.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs before any stochastic work happens
set_seed(42)

# Pick the compute backend: CUDA first, then Apple MPS, otherwise plain CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Backend-specific details
if device.type == "cuda":
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
elif device.type == "mps":
    print("Using Apple Metal Performance Shaders (MPS)")

In [None]:
# Read the labelled reviews that serve as finetuning data
df = pd.read_csv(
    "./finetuning_reviews.csv",
    encoding="utf-8",
)

Prepare data for finetuning

In [None]:
# Hold out 10% of the rows as a test set. The permutation comes from a
# RandomState with a fixed seed, so the split is the same on every run.
n = len(df)
test_size = int(0.1 * n)
indices = np.random.RandomState(95).permutation(n)

# The first `test_size` shuffled positions form the test set, the rest train
test_idxs, train_idxs = np.split(indices, [test_size])


In [None]:
# Training frame: keep the review text and the label, and duplicate the label
# into a `labels` column.
df_finetune = df.iloc[train_idxs][["text", "live_music"]].assign(
    labels=lambda frame: frame["live_music"]
)
print(df_finetune.shape)

# Test frame: same columns, built from the held-out positions.
df_test = df.iloc[test_idxs][["text", "live_music"]].assign(
    labels=lambda frame: frame["live_music"]
)
print(df_test.shape)

In [None]:
# Preview the first rows of the training frame as a quick sanity check.
df_finetune.head()

Finetune!

In [None]:
# Wrap both pandas frames in `Dataset` objects (train first, then test)
finetune_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_finetune, df_test)
)

In [None]:
# Load the pretrained tokenizer for "bert-base-cased". The cased variant is
# chosen so that capitalisation is kept, which helps identify proper nouns.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )


# Tokenize both splits. batched=True hands tokenize_function a whole batch of
# examples per call instead of one example at a time.
tokenized_ft, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (finetune_dataset, test_dataset)
)

In [None]:
# Load the pretrained "bert-base-cased" weights into a sequence-classification
# model. num_labels=2 requests a two-class output, matching the binary
# `live_music` label used in this notebook.
model_ft = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2
)

In [None]:
training_args = TrainingArguments(
    # --- where things go / what gets persisted ---
    output_dir="./",                # output directory for the run
    save_strategy="no",             # no checkpoints are written
    report_to="none",               # no external experiment tracking
    # --- optimisation ---
    learning_rate=5e-5,             # small step size for finetuning
    num_train_epochs=2,             # passes over the training set
    # --- batching (per device) ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # --- evaluation ---
    eval_strategy="epoch",          # report metrics after every epoch
)

In [None]:
# model performance metrics, loaded by name in a fixed order
metric_precision, metric_recall, metric_f1, metric_accuracy = (
    evaluate.load(metric_name)
    for metric_name in ("precision", "recall", "f1", "accuracy")
)

def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for the binary classifier.

    Args:
        eval_pred: Pair ``(logits, labels)``. The logits are arg-maxed over
            their last axis to obtain the predicted class ids.

    Returns:
        dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # average="binary" scores the positive class (label 1). Micro-averaging a
    # single-label task makes precision, recall and F1 all collapse to plain
    # accuracy, which would report the same number four times.
    scores = {}
    for name, metric in (
        ("precision", metric_precision),
        ("recall", metric_recall),
        ("f1", metric_f1),
    ):
        scores[name] = metric.compute(
            predictions=predictions, references=labels, average="binary"
        )[name]

    scores["accuracy"] = metric_accuracy.compute(
        predictions=predictions, references=labels
    )["accuracy"]

    return scores

In [None]:
# Report the size of the training job. The batch size is read from
# training_args rather than repeated as a literal, so the figure follows the
# configuration. -(-a // b) is integer ceiling division: a final partial batch
# still counts as one batch, and the count is never fractional.
print(f"Training samples: {len(tokenized_ft)}")
print(f"Batches per epoch: {-(-len(tokenized_ft) // training_args.train_batch_size)}")

In [None]:
# Assemble the Trainer from the model, its training configuration, the two
# tokenized splits and the metric callback, then run finetuning.
trainer = Trainer(
    args=training_args,
    model=model_ft,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_ft,
    eval_dataset=tokenized_test,
)

trainer.train()

Evaluate model on test data

In [None]:

# Run the finetuned model over the held-out split
results = trainer.predict(tokenized_test)
# Predicted class = position of the largest value along the last axis
predictions = np.asarray(results.predictions).argmax(axis=-1)

# Store the predicted class alongside the labels
df_test['prediction'] = predictions

df_test.head()

In [None]:
# Show mistakes
print("\nMisclassified examples:")
mistakes = df_test[df_test['labels'] != df_test['prediction']]

# Print at most the first two misclassified reviews. head(2) simply yields
# fewer rows when the model made fewer than two mistakes, whereas positional
# iloc[0] / iloc[1] lookups would raise IndexError in that case.
# NOTE (from the original run): the first example shown was not actually a
# model mistake -- there are some minor errors in the labelled dataset.
for _, mistake in mistakes.head(2).iterrows():
    print(f"Text: {mistake['text']}")
    print(f"True: {mistake['labels']}")
    print(f"Predicted: {mistake['prediction']}")

print(f"Accuracy: {1-len(mistakes)/len(df_test)}")

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Rows are true labels, columns are predicted labels
cm = confusion_matrix(df_test['labels'], df_test['prediction'])

# Draw the heatmap on an explicit Axes instead of the implicit current figure
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix')
plt.show()

Run on entire dataset

In [2]:
# Read in the full review table that the finetuned model will classify, and
# preview its first rows.
# NOTE(review): the rendered preview contains an "Unnamed: 0" column, which
# looks like a row index that was written out with the CSV -- confirm.
all_reviews = pd.read_csv("./all_restaurant_bar_reviews.csv")
all_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,city,year
0,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30,tucson,2014
1,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03,philadelphia,2015
2,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15,new orleans,2017
3,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,1,1,2,1,I am a long term frequent customer of this est...,2015-09-23 23:10:31,philadelphia,2015
4,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3,0,0,0,Had a party of 6 here for hibachi. Our waitres...,2016-07-25 07:31:06,santa barbara,2016


In [None]:
# prepare data for classifying: only the id and the text are kept
review_df = all_reviews[['review_id', 'text']]

# Split into 5 contiguous chunks whose sizes differ by at most one row
n_chunks = 5
chunk_size, remainder = divmod(len(review_df), n_chunks)

chunks = []
start_idx = 0

for chunk_no in range(1, n_chunks + 1):
    # The first `remainder` chunks absorb one extra row each
    extra_row = 1 if chunk_no <= remainder else 0
    end_idx = start_idx + chunk_size + extra_row

    chunks.append(review_df.iloc[start_idx:end_idx].copy())

    print(f"Chunk {chunk_no}: {len(chunks[-1])} rows (indices {start_idx} to {end_idx-1})")
    start_idx = end_idx

print(f"\nTotal rows: {len(review_df)}")
print(f"Sum of chunks: {sum(map(len, chunks))}")

In [None]:
# use chunked processing
import time
from tqdm import tqdm

chunk_size = 100000  # Process 100K at a time
all_predictions = []  # rebuilt from scratch each time this cell runs

for i in tqdm(range(0, len(review_df), chunk_size)):
    # Positional slice; the final chunk may be shorter than chunk_size
    chunk_df = review_df.iloc[i:i+chunk_size]

    # Tokenize this chunk here instead of selecting from a pre-tokenized
    # dataset: no such dataset is defined anywhere in this notebook, so the
    # cell must build its own model inputs to run on a fresh kernel. Doing it
    # per chunk also keeps only one chunk of padded token ids around at a time.
    # preserve_index=False stops the pandas index becoming an extra column.
    chunk_tokenized = Dataset.from_pandas(chunk_df, preserve_index=False).map(
        tokenize_function, batched=True
    )

    start = time.time()
    chunk_results = trainer.predict(chunk_tokenized)
    chunk_predictions = np.argmax(chunk_results.predictions, axis=-1)
    all_predictions.extend(chunk_predictions)

    print(f"Chunk {i//chunk_size + 1}: {len(chunk_df)} samples in {time.time()-start:.1f}s")

# Every review must have exactly one prediction before they are joined back on
assert len(all_predictions) == len(review_df), (
    f"got {len(all_predictions)} predictions for {len(review_df)} reviews"
)

In [None]:
# Attach the predicted labels to a fresh copy of the reviews and write it out
output_df = review_df.assign(live_music=all_predictions)
output_df.to_csv('reviews_with_live_music.csv', index=False)