# Fine-Tune BERT for EMPAT Categories on Pros & Cons

This notebook fine-tunes BERT to classify Glassdoor pros and cons into one or more of the five EMPAT dimensions (a review can carry several labels):

- **Application Value**
- **Social Value**
- **Interest Value**
- **Development Value**
- **Economic Value**

Then it uses K-Means clustering to extract trending topic terms for a Temporal Trends Panel on the insights dashboard.

In [None]:
#Install required libraries
!pip install -q torch transformers pandas scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m42.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader, random_split
from torch.cuda.amp import autocast
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score, precision_score, recall_score
from collections import Counter, defaultdict
import random
import os

from google.colab import drive
#Mount Google Drive, then make the Colab Notebooks folder the working directory so that
#the relative paths used later (MODEL_PATH, the reviews CSV) resolve inside Drive
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Where the fine-tuned model and tokenizer are written / reloaded from
MODEL_PATH = "./saved_empat_model"
#Tokenizer sequence length, training batch size, number of training epochs
MAX_LEN, BATCH_SIZE, EPOCHS = 128, 8, 5

#Run on the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

#Label set used for training: the five EMPAT dimensions plus a "No Value" label
empat_categories = [
    "Economic Value",
    "Interest Value",
    "Social Value",
    "Development Value",
    "Application Value",
    "No Value",
]
#Labels intended for prediction output ("No Value" excluded)
display_categories = [cat for cat in empat_categories if cat != "No Value"]

In [None]:
#Generated dataset for training and evaluation.
#Each entry is (review text, list of labels); every label is spelled exactly as in
#empat_categories so the data does not depend on downstream case normalisation.
#NOTE(review): the list is repeated 5x before shuffling, so a later random train/validation
#split will place copies of the same sentence on both sides - confirm this is intended.
training_data = [
    ("Flexible hours and remote work", ["Interest Value"]),
    ("Waste of my degree i didn't get to use any skills", ["Application Value"]),
    ("Supportive coworkers and fun atmosphere", ["Social Value"]),
    ("Clear path to promotion", ["Development Value"]),
    ("Competitive salary, long hours", ["Economic Value"]),
    ("Great salary and bonus structure.", ["Economic Value"]),
    ("Management listens to feedback, friendly team but work isn't very difficult", ["Application Value","Social Value"]),
    ("Working on cutting-edge technology and innovation", ["Application Value"]),
    ("Team-building activities and company outings", ["Social Value"]),
    ("Opportunities to take on leadership roles within the organization", ["Development Value"]),
    ("none", ["No Value"]),
    ("Ability to contribute directly to the success of major projects, salary increases based on individual contributions", ["Application Value","Economic Value"]),
    ("Generous paid time off and vacation policies but management don't read emails often", ["Economic Value","Social Value"]),
    ("Freedom to experiment and explore creative solutions", ["Interest Value", "Application Value"]),
    ("Diverse and inclusive workplace culture", ["Social Value"]),
    ("Clear mentorship structure to guide career growth.", ["Development Value"]),
    ("lots of parking space good location nice people", ["Interest Value", "Social Value"]),
    ("Using my design expertise to create impactful user experiences.", ["Application Value"]),
    ("Performance-based bonuses and incentives including promotions", ["Economic Value","Development Value"]),
    ("Company-provided housing or relocation assistance", ["Economic Value"]),
    ("Challenging tasks that keep me motivated every day", ["Application Value"]),
    ("A role that aligns with my personal values and interests", ["Interest Value"]),
    ("Supportive leadership that fosters trust and camaraderie", ["Social Value"]),
    ("Access to high-quality training programs and certifications", ["Development Value"]),
    ("Regular performance reviews with actionable feedback for improvement", ["Development Value"]),
    ("Leveraging my analytical skills in data-driven decision-making processes", ["Application Value"]),
    ("Salary is below industry standards and not competitive.", ["Economic Value"]),
    ("can't think none", ["No Value"]),
    ("No opportunities to work on innovative or meaningful tasks", ["Application Value"]),
    ("found better offer elsewhere", ["Development Value"]),
    ("Lots of cafes and good food nearby", ["Interest Value"]),
    ("Team dynamics are toxic and lack collaboration.", ["Social Value"]),
    ("No clear path for career advancement or promotion", ["Economic Value","Development Value"]),
    ("Get paid well but My skills feel underutilized in this role", ["Economic Value", "Application Value"]),
    ("Company parties perfect networking opportunity", ["Social Value"]),
    ("everyone gets their own company device to work from home. travel costs compensated within salary", ["Interest Value","Economic Value"]),
    ("I’m assigned tasks that don’t match my expertise or interests", ["Application Value"]),
    ("Healthcare benefits are inadequate and expensive limited progression opportunities", ["Economic Value", "Development Value"]),
    ("Unclear policies regarding raises and promotions", ["Economic Value", "Development Value"]),
    ("Job responsibilities are repetitive and don’t challenge me.", ["Interest Value"]),
    ("Workplace culture feels exclusive and unwelcoming leaders dont want to help improve", ["Social Value", "Development Value"]),
    ("Coworkers are competitive rather than supportive", ["Social Value"]),
    ("Nothing really", ["No Value"]),
    ("Management fails to address interpersonal conflicts effectively", ["Social Value"]),
    ("Feedback is infrequent and doesn’t provide actionable insights", ["Development Value"]),
    ("Opportunities for leadership roles are limited or nonexistent", ["Development Value"]),
    ("Lack of effective communication rubbish training left on my own to figure things out", ["Social Value", "Development Value"]),
    ("Tasks not explained well", ["Application Value"]),
    ("Office close to home", ["Interest Value"]),
    ("seniors unsupportive and unwilling to help juniors", ["Development Value"]),
    ("Met so many new friends", ["Social Value"]),
    ("i don't know nothing maybe", ["No Value"]),
    ("Great leadership with strong ideas for the future", ["Development Value"]),
    ("decent working hours work not too hard", ["Interest Value", "Application Value"]),
    ("Only internal promotions", ["Development Value"]),
    ("New, state-of-the-art technology that is constantly updated", ["Interest Value"]),
    ("Poor salary, management hard to reach", ["Economic Value", "Social Value", "Development Value"]),
    ("friendly staff and great onboarding", ["Social Value", "Development Value"]),
    ("lot of young people rude ceo no health insurance", ["Development Value","Economic Value"]),
    ("so much opportunities for growth nad progression", ["Development Value"]),
    ("idk", ["No Value"]),
    ("fantastic people great environmet", ["Social Value"]),
    ("awful pay", ["Economic Value"]),
    ("starting wage not great but reputable company", ["Economic Value", "Development Value"]),
    ("sound team and staff not flexibile with work", ["Social Value", "Application Value"]),
    ("office computers outdated and old small office space", ["Interest Value"])
] * 5
#Shuffle in place so the repeated copies are interleaved
random.shuffle(training_data)

#Multi-hot encode every example: one 0/1 column per entry of empat_categories
data = []
for sentence, raw_labels in training_data:
    #Labels are matched after trimming whitespace and title-casing
    normalised = [lbl.strip().title() for lbl in raw_labels]
    flags = [int(category in normalised) for category in empat_categories]
    data.append([sentence, *flags])

#One row per example: the text followed by its label indicator columns
df = pd.DataFrame(data, columns=["text", *empat_categories])

#Dataset class
class EmpatDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label vector."""

    def __init__(self, texts, labels, tokenizer, max_len):
        #Indexable collections of texts and per-text label vectors
        self.texts, self.labels = texts, labels
        #Tokenizer exposing encode_plus, and the fixed length every item is padded/truncated to
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 1-D 'input_ids', 'attention_mask' and float 'labels' tensors."""
        tokenised = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        #The tokenizer output is flattened to 1-D so each item is a single sequence
        sample = {key: tokenised[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.FloatTensor(self.labels[idx])
        return sample


In [None]:
#Initialise tokenizer and a BERT classifier with one output per label (multi-label head)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(empat_categories),
    problem_type="multi_label_classification",
)
model.to(device)

#Wrap the encoded DataFrame in a torch dataset
texts = df['text'].values
labels = df[empat_categories].values
dataset = EmpatDataset(texts, labels, tokenizer, MAX_LEN)

#80/20 random split into training and validation subsets
#NOTE(review): training_data was repeated 5x before this split, so identical sentences can land
#in both subsets - validation numbers may be optimistic; confirm this is acceptable
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

#Only the training loader is shuffled
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

#Per-label loss weights: inverse of each label's positive count over the whole DataFrame
#(the epsilon guards against a label that never occurs)
class_counts = df[empat_categories].sum(axis=0).values
class_weights = torch.tensor(1. / (class_counts + 1e-6), dtype=torch.float32, device=device)
loss_fn = nn.BCEWithLogitsLoss(weight=class_weights)

optimizer = AdamW(model.parameters(), lr=2e-5)

#Train function
def train():
    """Fine-tune the classifier for EPOCHS epochs, printing loss and validation metrics per epoch.

    Uses the module-level model, optimizer, loss_fn, train_dataloader, val_dataloader,
    device and EPOCHS. Validation F1 / precision / recall are micro-averaged over the
    whole validation set, with a label counted as predicted when its sigmoid
    probability exceeds 0.3.

    Returns:
        (train_losses, val_losses): per-epoch average losses, one entry per epoch.
    """
    train_losses, val_losses = [], []

    for epoch in range(EPOCHS):
        #Training pass
        model.train()
        total_train_loss = 0.0
        for batch in train_dataloader:
            optimizer.zero_grad()
            targets = batch['labels'].to(device)
            #Labels are not passed to the model: the weighted loss_fn below is the only
            #loss that is optimised, so the model's built-in loss would be wasted work
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            loss = loss_fn(outputs.logits, targets)
            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        train_losses.append(avg_train_loss)

        #Validation pass
        model.eval()
        total_val_loss = 0.0
        all_labels, all_preds = [], []

        with torch.no_grad():
            for batch in val_dataloader:
                targets = batch['labels'].to(device)
                outputs = model(
                    input_ids=batch['input_ids'].to(device),
                    attention_mask=batch['attention_mask'].to(device),
                )
                total_val_loss += loss_fn(outputs.logits, targets).item()

                #Collect predictions for EVERY batch so the metrics cover the full
                #validation set rather than only the final batch
                probs = torch.sigmoid(outputs.logits)
                all_preds.append((probs > 0.3).int().cpu().numpy())
                all_labels.append(targets.int().cpu().numpy())

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_losses.append(avg_val_loss)

        all_preds = np.concatenate(all_preds, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)

        print(f"Epoch {epoch+1}/{EPOCHS} | Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
        print(f"  F1: {f1_score(all_labels, all_preds, average='micro'):.4f}, Precision: {precision_score(all_labels, all_preds, average='micro'):.4f}, Recall: {recall_score(all_labels, all_preds, average='micro'):.4f}")

    return train_losses, val_losses

#Start training (fine-tunes the module-level model in place)
train()

#Save Model: write the fine-tuned weights/config and the tokenizer files to MODEL_PATH
#so a later cell can reload them with from_pretrained
model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 | Training Loss: 0.0085, Validation Loss: 0.0071
  F1: 0.5455, Precision: 0.3750, Recall: 1.0000
Epoch 2/5 | Training Loss: 0.0060, Validation Loss: 0.0055
  F1: 0.6667, Precision: 0.5000, Recall: 1.0000
Epoch 3/5 | Training Loss: 0.0047, Validation Loss: 0.0043
  F1: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 4/5 | Training Loss: 0.0036, Validation Loss: 0.0032
  F1: 0.7500, Precision: 0.6000, Recall: 1.0000
Epoch 5/5 | Training Loss: 0.0027, Validation Loss: 0.0024
  F1: 0.7500, Precision: 0.6000, Recall: 1.0000


('./saved_empat_model/tokenizer_config.json',
 './saved_empat_model/special_tokens_map.json',
 './saved_empat_model/vocab.txt',
 './saved_empat_model/added_tokens.json')

In [None]:
#Reload the fine-tuned classifier and tokenizer from MODEL_PATH, so prediction cells
#can run without retraining
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model.to(device)
#Switch to evaluation mode (disables dropout) before running predictions
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
#Load CSV
df_reviews = pd.read_csv("cleaned_glassdoor_reviews.csv")

#Ensure every column used by this notebook exists BEFORE touching any of them
#(an explicit raise is used because assert statements are skipped under python -O)
required_columns = {'firm', 'pros', 'cons', 'date_review'}
missing_columns = required_columns - set(df_reviews.columns)
if missing_columns:
    raise KeyError(f"cleaned_glassdoor_reviews.csv is missing columns: {sorted(missing_columns)}")

#Create year-month column for time grouping; unparseable dates are coerced to NaT
df_reviews['date_review'] = pd.to_datetime(df_reviews['date_review'], errors='coerce')
year_month = df_reviews['date_review'].dt.to_period('M').astype(str)
#Keep rows without a valid date as missing instead of the literal string "NaT",
#which would otherwise show up as its own time bucket when grouping by year_month
df_reviews['year_month'] = year_month.where(df_reviews['date_review'].notna())

#Predict empat categories for pros and cons
def predict(texts, batch_size=32, max_len=128, threshold=0.3):
    """Score texts with the fine-tuned classifier and keep the confident EMPAT labels.

    Uses the module-level model, tokenizer, device and empat_categories.

    Args:
        texts: list of strings to classify.
        batch_size: number of texts tokenized and scored per forward pass.
        max_len: maximum token length; longer inputs are truncated.
        threshold: a label is kept only when its sigmoid probability exceeds this value.

    Returns:
        One dict per input text, in input order, mapping label -> probability and
        ordered from highest to lowest probability. The "No Value" output of the model
        is never reported; a text with no label above the threshold gets {"No Value": 1.0}.
    """
    results = []
    model.eval()

    #Positions of the labels that may be reported ("No Value" is only used as the fallback)
    scored_labels = [(idx, cat) for idx, cat in enumerate(empat_categories) if cat != "No Value"]
    #Mixed precision is only requested when actually running on a CUDA device
    use_amp = device.type == 'cuda'

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        encodings = tokenizer(
            batch_texts,
            max_length=max_len,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad(), torch.amp.autocast('cuda', enabled=use_amp):
            outputs = model(**encodings)
            probs = torch.sigmoid(outputs.logits).detach().cpu().numpy()

        for row in probs:
            filtered = {
                cat: float(row[idx])
                for idx, cat in scored_labels if float(row[idx]) > threshold
            }
            if not filtered:
                #If no confident category found, assign No Value
                results.append({"No Value": 1.0})
            else:
                results.append(dict(sorted(filtered.items(), key=lambda item: item[1], reverse=True)))
    return results

#Add predictions to dataframe: missing review text is scored as an empty string
for source_col, target_col in (('pros', 'pros_cat'), ('cons', 'cons_cat')):
    df_reviews[target_col] = predict(df_reviews[source_col].fillna("").tolist(), batch_size=32)

In [None]:
#Test Prediction function
def test_predict(text, threshold=0.3):
    """Classify a single text and return its labels above threshold, highest score first.

    Uses the module-level tokenizer, model, device, MAX_LEN and empat_categories.
    Scores are returned as strings formatted to two decimals; when no label other than
    "No Value" exceeds the threshold the result is {"No Value": 1.00}.
    """
    tokens = tokenizer.encode_plus(
        text,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    model.eval()
    with torch.no_grad():
        #Inputs are moved to the same device as the model
        outputs = model(
            input_ids=tokens['input_ids'].to(device),
            attention_mask=tokens['attention_mask'].to(device),
        )

    probs = torch.sigmoid(outputs.logits).squeeze().cpu().numpy()

    #Keep every label except "No Value" whose probability clears the threshold
    scored = {}
    for idx, cat in enumerate(empat_categories):
        if cat == "No Value" or not probs[idx] > threshold:
            continue
        scored[cat] = f"{probs[idx]:.2f}"

    if not scored:
        return {"No Value": 1.00}

    #Sort by the (rounded) score, descending
    ranked = sorted(scored.items(), key=lambda pair: float(pair[1]), reverse=True)
    return dict(ranked)

#Test prediction: quick manual check of the classifier on one hand-written review
test_text = "poor salary poor training communication"
print(test_predict(test_text))

{'Economic Value': '0.78', 'Development Value': '0.63'}


In [None]:
#Extract the top empat category and the normalised review text for pros/cons
def extract_top(row, pred_col, text_col):
    """Return (top category, cleaned text) for one review row, or (None, None).

    Args:
        row: mapping-like row (e.g. a DataFrame row) holding both columns.
        pred_col: key of the prediction dict; its first key is taken as the top category,
            so the dict is expected to be ordered best-first.
        text_col: key of the raw review text.

    Returns:
        (category, text stripped and lower-cased), or (None, None) when the text is
        missing/blank, there are no predictions, or the top category is "No Value".
    """
    predictions = row[pred_col]
    text = row[text_col]
    if not isinstance(text, str) or not text.strip():
        return None, None
    #A default avoids StopIteration when the prediction mapping is empty or missing
    top_cat = next(iter(predictions), None) if predictions else None
    if top_cat is None or top_cat == "No Value":
        return None, None  #Skip it
    return top_cat, text.strip().lower()

#For pros and for cons: (prediction column, text column, the two output columns to fill)
column_plan = (
    ('pros_cat', 'pros', ['top_pros_category', 'top_pros_text']),
    ('cons_cat', 'cons', ['top_cons_category', 'top_cons_text']),
)
for pred_col, text_col, target_cols in column_plan:
    #extract_top returns a pair per row; pd.Series spreads it over two columns
    extracted = df_reviews.apply(
        lambda row: pd.Series(extract_top(row, pred_col, text_col)), axis=1)
    df_reviews[target_cols] = extracted

#Persist the enriched reviews so the trend analysis can run without re-predicting
df_reviews.to_csv('/content/drive/MyDrive/Colab Notebooks/df_reviews.csv', index=False)

In [None]:
#Temporal Trends Panel

#Tracking buzzwords/topic trends over time
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from tqdm import tqdm
import gc
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#Fetch the NLTK resources used below: the stopword lists read via stopwords.words()
#and the 'punkt_tab' tokenizer data (presumably what word_tokenize relies on - confirm for the installed NLTK version)
nltk.download('stopwords')
nltk.download('punkt_tab')

#Reload the saved reviews so this cell also works when the prediction cells were not run
df_reviews = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/df_reviews.csv')

#Join the top pros and cons text into one feedback string per review (missing parts count as empty)
pros_part = df_reviews['top_pros_text'].fillna('')
cons_part = df_reviews['top_cons_text'].fillna('')
df_reviews['feedback_text'] = (pros_part + " " + cons_part).str.strip()

#Keep only the reviews that have some feedback text left
has_feedback = df_reviews['feedback_text'].str.len() > 0
df_feedback = df_reviews.loc[has_feedback].copy()
#df_feedback = df_feedback.sample(n=200000, random_state=42)

#Custom stopwords: NLTK's English list plus generic workplace / filler words
extra_stopwords = {
    'company', 'work', 'working', 'employee', 'employer', 'manager', 'staff', 'worker', 'people',
    'job', 'place', 'great', 'good', 'well', 'one', 'get', 'also', 'would', 'really', 'lot', 'even', 'make',
    'always', 'many', 'still', 'much', 'low', 'could', 'time', 'like', 'nice', 'con', 'none'
}
custom_stopwords = list(set(stopwords.words('english')) | extra_stopwords)

#Clean and tokenize feedback
#Patterns and the stopword lookup are built once instead of on every call
_APOSTROPHES = re.compile(r"['’]")
_NON_LETTERS = re.compile(r'[^a-zA-Z\s]')
_STOPWORD_LOOKUP = frozenset(custom_stopwords)  #set membership is O(1); custom_stopwords is a list

def preprocess(text):
    """Lower-case text, strip non-letters and stopwords, and return space-joined tokens.

    Apostrophes are dropped so contractions stay one token ("don't" -> "dont"); every
    other non-letter character becomes a space so words separated only by punctuation
    ("pay/benefits", "salary,bonus") are not fused into a single token. Tokens of one
    or two characters and tokens found in custom_stopwords are removed.
    """
    text = _APOSTROPHES.sub('', str(text).lower())
    text = _NON_LETTERS.sub(' ', text)
    tokens = word_tokenize(text)
    return " ".join(w for w in tokens if len(w) > 2 and w not in _STOPWORD_LOOKUP)

#Normalise every feedback string before embedding
df_feedback['clean_text'] = df_feedback['feedback_text'].map(preprocess)

#NOTE(review): this rebinds the global `model`, replacing the BERT classifier that predict()
#and test_predict() read - those functions will not work again until the classifier cell is re-run
model = SentenceTransformer('all-MiniLM-L6-v2')
batch_size = 256
embeddings = []

#Encode in chunks of batch_size, collecting one embedding array per chunk
clean_texts = df_feedback['clean_text'].tolist()
for start in tqdm(range(0, len(clean_texts), batch_size)):
    chunk = clean_texts[start:start + batch_size]
    embeddings.append(model.encode(chunk, show_progress_bar=False))
    gc.collect() #Free memory

#Stack the per-chunk arrays into one (rows x embedding dimensions) matrix
embeddings = np.concatenate(embeddings, axis=0)
#Standardise each embedding dimension before clustering
scaler = StandardScaler()
embeddings_scaled = scaler.fit(embeddings).transform(embeddings)

#Assign every feedback item to one of k clusters (fixed seed for repeatable clusters)
k = 8
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
df_feedback['cluster'] = kmeans.fit(embeddings_scaled).labels_

def extract_keywords(texts, n=5):
    """Return up to n terms with the highest mean TF-IDF weight across texts.

    Args:
        texts: list of already-cleaned documents.
        n: number of keywords to return.

    Returns:
        List of terms, highest mean TF-IDF first; an empty list when texts is empty.
    """
    if len(texts) == 0:
        return []

    vectorizer = TfidfVectorizer(
        stop_words=list(custom_stopwords),
        max_features=5000,
        min_df=5,       #Ignore terms that appear in fewer than 5 documents
        max_df=0.7      #Ignore terms that appear in more than 70% of documents
    )
    X = vectorizer.fit_transform(texts)
    keywords = vectorizer.get_feature_names_out()
    #Average on the sparse matrix: X.toarray() would allocate documents x vocabulary floats
    mean_tfidf = np.asarray(X.mean(axis=0)).ravel()
    top_indices = mean_tfidf.argsort()[::-1][:n]
    return [keywords[i] for i in top_indices]

#Top keywords for every cluster, keyed by cluster id
cluster_keywords = {
    cluster_id: extract_keywords(
        df_feedback.loc[df_feedback['cluster'] == cluster_id, 'clean_text'].tolist(), n=5)
    for cluster_id in sorted(df_feedback['cluster'].unique())
}

#Add readable labels based on top keywords (computed once per cluster, assigned once)
topic_labels = {cluster_id: ", ".join(words) for cluster_id, words in cluster_keywords.items()}
df_feedback['topic_label'] = df_feedback['cluster'].map(topic_labels)

#Number of feedback items per (month, topic) pair, in long form
monthly_topic_counts = (
    df_feedback.groupby(['year_month', 'topic_label']).size().reset_index(name='count')
)
#Wide form: one row per month, one column per topic; absent combinations count as 0
topic_trends = monthly_topic_counts.pivot(index='year_month', columns='topic_label', values='count')
topic_trends = topic_trends.fillna(0)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 3259/3259 [28:36<00:00,  1.90it/s]
