In [1]:
# Restrict CUDA-aware libraries to GPU index 1. This is done before the
# heavier imports in later cells so the setting is in place when they load.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import sys
# Show which interpreter and working directory the kernel is using; the
# relative "dataset/..." paths in later cells resolve against this directory.
print(sys.executable)
!pwd

/cluster/datastore/ibrahimh/conda_envs/gpu_39_env1/bin/python
/cluster/datastore/ibrahimh/projects/aes_paper_dev


In [6]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# ‚úÖ Load dataset
# Read the essay dataset (path is relative to the kernel's working directory).
file_path = "dataset/train.csv"
data = pd.read_csv(file_path)

# Fail fast if the CSV lacks the id, text, or any of the six score columns.
score_columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
expected_columns = ["text_id", "full_text", *score_columns]
if not all(col in data.columns for col in expected_columns):
    raise ValueError(f"Dataset is missing required columns. Expected: {expected_columns}")

# Essays without text cannot be embedded; drop them and renumber the index.
data = data.dropna(subset=["full_text"]).reset_index(drop=True)

# Texts and score matrix come from the same filtered frame, row for row.
texts = data["full_text"].tolist()
labels = data[score_columns].values

# Load the sentence-embedding model.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
print(f"\nüîç Loading model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)

# NOTE(review): encode() presumably truncates inputs longer than the model's
# maximum sequence length, so long essays may only be partially embedded - confirm.
print("\nüöÄ Extracting MiniLM embeddings...")
embeddings = model.encode(texts, convert_to_numpy=True)

# The archive is only useful if every embedding row has a matching label row.
assert embeddings.shape[0] == labels.shape[0], "embeddings and labels are misaligned"

# np.savez does not create missing directories, so make sure the target
# folder exists before writing the archive.
output_path = "dataset/embeddings_transformer/MiniLM_embeddings.npz"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savez(output_path, embeddings=embeddings, labels=labels)

print(f"\n‚úÖ Saved MiniLM embeddings to {output_path}")
print(f"Total extracted embeddings: {embeddings.shape[0]}")
print(f"Total labels: {labels.shape[0]}")



üîç Loading model: sentence-transformers/all-MiniLM-L6-v2





üöÄ Extracting MiniLM embeddings...

‚úÖ Saved MiniLM embeddings to dataset/embeddings_transformer/MiniLM_embeddings.npz
Total extracted embeddings: 3911
Total labels: 3911


In [7]:
# Sanity-check the saved archive: list its arrays and print their shapes.
# np.load returns an NpzFile that keeps the underlying file open, so it is
# used as a context manager to release the handle once the checks are done.
with np.load("dataset/embeddings_transformer/MiniLM_embeddings.npz") as data:
    print("Keys in the file:", list(data.keys()))  # Should print: ['embeddings', 'labels']
    print("Embeddings shape:", data["embeddings"].shape)
    print("Labels shape:", data["labels"].shape)


Keys in the file: ['embeddings', 'labels']
Embeddings shape: (3911, 384)
Labels shape: (3911, 6)


In [9]:
import numpy as np
import os
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the MiniLM embedding archive (arrays "embeddings" and "labels").
EMBEDDINGS_FILE = "dataset/embeddings_transformer/MiniLM_embeddings.npz"

print("\nüîç **Loading MiniLM embeddings**")
# An NpzFile holds an open file handle; read both arrays inside a context
# manager so the handle is closed as soon as they are in memory.
with np.load(EMBEDDINGS_FILE) as data:
    embeddings, labels = data["embeddings"], data["labels"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42
)

# Score grid used for rounding: 1.0 through 5.0 in half-point steps.
VALID_GRADES = torch.tensor([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

def round_to_valid_grades(predictions):
    """Snap every value in ``predictions`` to the closest entry of VALID_GRADES.

    Args:
        predictions: Iterable of numeric scores.

    Returns:
        NumPy array holding one grid value (as a float) per input score.
    """
    snapped = []
    for score in predictions:
        # Distance from this score to every grid point; keep the nearest one.
        distances = torch.abs(VALID_GRADES - score)
        nearest_idx = torch.argmin(distances)
        snapped.append(float(VALID_GRADES[nearest_idx]))
    return np.array(snapped)

def quadratic_weighted_kappa(y_true, y_pred):
    """Quadratic weighted kappa between two sequences of on-grid grades.

    Args:
        y_true: Reference grades; every value must equal an entry of VALID_GRADES.
        y_pred: Predicted grades, under the same constraint.

    Returns:
        0.0 when either sequence falls into a single bin, otherwise the result
        of ``cohen_kappa_score`` with quadratic weights over the full grade grid.

    Raises:
        KeyError: If a value is not one of VALID_GRADES.
    """
    # Map each grade to its position on the grid so distances are ordinal.
    grade_to_bin = {float(grade): idx for idx, grade in enumerate(VALID_GRADES)}
    y_true_bins = [grade_to_bin[float(val)] for val in y_true]
    y_pred_bins = [grade_to_bin[float(val)] for val in y_pred]

    # Kappa is degenerate when one side has a single class; report 0.0 instead.
    if len(set(y_true_bins)) == 1 or len(set(y_pred_bins)) == 1:
        return 0.0

    # Pass the complete bin range explicitly. Without `labels`, the weight
    # matrix is indexed by the bins that actually occur, so a grade absent from
    # both inputs would shrink the penalty between its neighbouring grades.
    all_bins = list(range(len(grade_to_bin)))
    return cohen_kappa_score(y_true_bins, y_pred_bins, labels=all_bins, weights="quadratic")

# üöÄ Train XGBoost (much faster than Random Forest)
print("üöÄ Training XGBoost on MiniLM embeddings...")
# An earlier configuration used n_estimators=200, max_depth=10, learning_rate=0.1.
xgb_model = XGBRegressor(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)

# The regression target is the row-wise mean of the label columns.
xgb_model.fit(X_train, y_train.mean(axis=1))

# Predict on the held-out rows, then snap both predictions and reference
# means onto the grade grid so the two metrics compare like with like.
predictions = xgb_model.predict(X_test)
predictions_rounded = round_to_valid_grades(predictions)
y_test_rounded = round_to_valid_grades(y_test.mean(axis=1))

# Evaluation metrics on the rounded values.
mae = mean_absolute_error(y_test_rounded, predictions_rounded)
qwk = quadratic_weighted_kappa(y_test_rounded, predictions_rounded)

# Persist a short plain-text report (overwrites any previous file).
results_file = "benchmark_results_transformers_xgb.txt"
report_lines = [
    "Benchmarking MiniLM Embeddings with XGBoost (Train/Test Split)\n\n",
    f"XGBoost: MAE = {mae:.4f}, QWK = {qwk:.4f}\n",
]
with open(results_file, "w") as f:
    f.writelines(report_lines)

# Echo the same numbers in the notebook output.
print(f"XGBoost - MAE: {mae:.4f}, QWK: {qwk:.4f}")
print(f"\n‚úÖ Benchmarking complete! Results saved in {results_file}")



üîç **Loading MiniLM embeddings**
üöÄ Training XGBoost on MiniLM embeddings...
XGBoost - MAE: 0.3972, QWK: 0.3224

‚úÖ Benchmarking complete! Results saved in benchmark_results_transformers_xgb.txt


Embeddings shape: (3911, 384)
Labels shape: (3911, 6)


In [11]:
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# ‚úÖ Load dataset
# Read the essay dataset (path is relative to the kernel's working directory).
file_path = "dataset/train.csv"
data = pd.read_csv(file_path)

# Fail fast if the CSV lacks the id, text, or any of the six score columns.
score_columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
expected_columns = ["text_id", "full_text", *score_columns]
if not all(col in data.columns for col in expected_columns):
    raise ValueError(f"Dataset is missing required columns. Expected: {expected_columns}")

# Essays without text cannot be embedded; drop them and renumber the index.
data = data.dropna(subset=["full_text"]).reset_index(drop=True)

# Texts and score matrix come from the same filtered frame, row for row.
texts = data["full_text"].tolist()
labels = data[score_columns].values

# Load the sentence-embedding model.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
print(f"\nüîç Loading model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)

# NOTE(review): encode() presumably truncates inputs longer than the model's
# maximum sequence length, so long essays may only be partially embedded - confirm.
print("\nüöÄ Extracting MPNet embeddings...")
embeddings = model.encode(texts, convert_to_numpy=True)

# The archive is only useful if every embedding row has a matching label row.
assert embeddings.shape[0] == labels.shape[0], "embeddings and labels are misaligned"

# np.savez does not create missing directories, so make sure the target
# folder exists before writing the archive.
output_path = "dataset/embeddings_transformer/MPNet_embeddings.npz"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savez(output_path, embeddings=embeddings, labels=labels)

print(f"\n‚úÖ Saved MPNet embeddings to {output_path}")
print(f"Total extracted embeddings: {embeddings.shape[0]}")
print(f"Total labels: {labels.shape[0]}")



üîç Loading model: sentence-transformers/all-mpnet-base-v2





üöÄ Extracting MPNet embeddings...

‚úÖ Saved MPNet embeddings to dataset/embeddings_transformer/MPNet_embeddings.npz
Total extracted embeddings: 3911
Total labels: 3911


In [12]:
# Sanity-check the saved archive: list its arrays and print their shapes.
# np.load returns an NpzFile that keeps the underlying file open, so it is
# used as a context manager to release the handle once the checks are done.
with np.load("dataset/embeddings_transformer/MPNet_embeddings.npz") as data:
    print("Keys in the file:", list(data.keys()))
    print("Embeddings shape:", data["embeddings"].shape)
    print("Labels shape:", data["labels"].shape)


Keys in the file: ['embeddings', 'labels']
Embeddings shape: (3911, 768)
Labels shape: (3911, 6)


In [2]:
import numpy as np
import os
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, cohen_kappa_score
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the MPNet embedding archive (arrays "embeddings" and "labels").
EMBEDDINGS_FILE = "dataset/embeddings_transformer/MPNet_embeddings.npz"

print("\nüîç **Loading MPNet embeddings**")
# An NpzFile holds an open file handle; read both arrays inside a context
# manager so the handle is closed as soon as they are in memory.
with np.load(EMBEDDINGS_FILE) as data:
    embeddings, labels = data["embeddings"], data["labels"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42
)

# Score grid used for rounding: 1.0 through 5.0 in half-point steps.
VALID_GRADES = torch.tensor([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

def round_to_valid_grades(predictions):
    """Snap every value in ``predictions`` to the closest entry of VALID_GRADES.

    Args:
        predictions: Iterable of numeric scores.

    Returns:
        NumPy array holding one grid value (as a float) per input score.
    """
    snapped = []
    for score in predictions:
        # Distance from this score to every grid point; keep the nearest one.
        distances = torch.abs(VALID_GRADES - score)
        nearest_idx = torch.argmin(distances)
        snapped.append(float(VALID_GRADES[nearest_idx]))
    return np.array(snapped)

def quadratic_weighted_kappa(y_true, y_pred):
    """Quadratic weighted kappa between two sequences of on-grid grades.

    Args:
        y_true: Reference grades; every value must equal an entry of VALID_GRADES.
        y_pred: Predicted grades, under the same constraint.

    Returns:
        0.0 when either sequence falls into a single bin, otherwise the result
        of ``cohen_kappa_score`` with quadratic weights over the full grade grid.

    Raises:
        KeyError: If a value is not one of VALID_GRADES.
    """
    # Map each grade to its position on the grid so distances are ordinal.
    grade_to_bin = {float(grade): idx for idx, grade in enumerate(VALID_GRADES)}
    y_true_bins = [grade_to_bin[float(val)] for val in y_true]
    y_pred_bins = [grade_to_bin[float(val)] for val in y_pred]

    # Kappa is degenerate when one side has a single class; report 0.0 instead.
    if len(set(y_true_bins)) == 1 or len(set(y_pred_bins)) == 1:
        return 0.0

    # Pass the complete bin range explicitly. Without `labels`, the weight
    # matrix is indexed by the bins that actually occur, so a grade absent from
    # both inputs would shrink the penalty between its neighbouring grades.
    all_bins = list(range(len(grade_to_bin)))
    return cohen_kappa_score(y_true_bins, y_pred_bins, labels=all_bins, weights="quadratic")

# üöÄ Train XGBoost on MPNet embeddings
print("üöÄ Training XGBoost on MPNet embeddings...")
xgb_model = XGBRegressor(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
)
# The regression target is the row-wise mean of the label columns.
xgb_model.fit(X_train, y_train.mean(axis=1))

# Predict on the held-out rows, then snap both predictions and reference
# means onto the grade grid so the two metrics compare like with like.
predictions = xgb_model.predict(X_test)
predictions_rounded = round_to_valid_grades(predictions)
y_test_rounded = round_to_valid_grades(y_test.mean(axis=1))

# Evaluation metrics on the rounded values.
mae = mean_absolute_error(y_test_rounded, predictions_rounded)
qwk = quadratic_weighted_kappa(y_test_rounded, predictions_rounded)

# Persist a short plain-text report (overwrites any previous file).
results_file = "benchmark_results_mpnet_xgb.txt"
report_lines = [
    "Benchmarking MPNet Embeddings with XGBoost (Train/Test Split)\n\n",
    f"XGBoost: MAE = {mae:.4f}, QWK = {qwk:.4f}\n",
]
with open(results_file, "w") as f:
    f.writelines(report_lines)

# Echo the same numbers in the notebook output.
print(f"XGBoost - MAE: {mae:.4f}, QWK: {qwk:.4f}")
print(f"\n‚úÖ Benchmarking complete! Results saved in {results_file}")



üîç **Loading MPNet embeddings**
üöÄ Training XGBoost on MPNet embeddings...
XGBoost - MAE: 0.3787, QWK: 0.3926

‚úÖ Benchmarking complete! Results saved in benchmark_results_mpnet_xgb.txt


In [3]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel

# ‚úÖ Load dataset
# Read the essay dataset (path is relative to the kernel's working directory).
file_path = "dataset/train.csv"
data = pd.read_csv(file_path)

# Required schema: an id, the essay text, and the six score columns.
score_columns = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
expected_columns = ["text_id", "full_text", *score_columns]
if any(col not in data.columns for col in expected_columns):
    raise ValueError(f"Dataset is missing required columns. Expected: {expected_columns}")

# Discard essays with no text, then renumber the index.
data = data.dropna(subset=["full_text"]).reset_index(drop=True)

# Essay strings and their score matrix, taken from the same filtered frame.
texts = data["full_text"].tolist()
labels = data[score_columns].values

# Tokenizer and encoder are both loaded from the same checkpoint name.
MODEL_NAME = "roberta-base"
print(f"\nüîç Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# ‚úÖ Function to extract mean-pooled embeddings
def extract_embeddings(texts, tokenizer, model):
    """Embed each text as the mean of the model's last-layer token vectors.

    Texts are encoded one at a time. Inputs are moved to the device the model
    lives on and results are copied back to the CPU, so the function works for
    both CPU- and GPU-resident models.

    Args:
        texts: Iterable of strings to embed.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=True, truncation=True, return_tensors="pt")``
            that returns a mapping of tensors accepted by ``model``.
        model: Module whose output exposes ``last_hidden_state`` with shape
            [1, seq_len, hidden_dim].

    Returns:
        NumPy array with one row per text, i.e. shape [len(texts), hidden_dim].
    """
    model.eval()

    # Run on whatever device the model's weights are on (CPU if it has none).
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_embeddings = []

    with torch.no_grad():
        for text in texts:
            # NOTE(review): truncation=True presumably cuts texts at the
            # tokenizer's maximum length, so very long essays would be
            # embedded from their leading tokens only - confirm.
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            token_embeddings = outputs.last_hidden_state  # Shape: [1, seq_len, hidden_dim]
            # Average over the sequence axis, then drop only the batch axis;
            # a bare squeeze() would also remove a hidden axis of size 1.
            sentence_embedding = token_embeddings.mean(dim=1).squeeze(0).cpu().numpy()
            all_embeddings.append(sentence_embedding)

    return np.array(all_embeddings)

# ‚úÖ Extract embeddings
print("\nüöÄ Extracting RoBERTa embeddings...")
embeddings = extract_embeddings(texts, tokenizer, model)

# np.savez does not create missing directories, so make sure the target
# folder exists before writing the archive of embeddings and labels.
output_path = "dataset/embeddings_transformer/RoBERTa_embeddings.npz"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
np.savez(output_path, embeddings=embeddings, labels=labels)

# Report what was written so row counts can be compared at a glance.
print(f"\n‚úÖ Saved RoBERTa embeddings to {output_path}")
print(f"Total extracted embeddings: {embeddings.shape[0]}")
print(f"Embeddings dimension: {embeddings.shape[1]}")  # Should be 768
print(f"Total labels: {labels.shape[0]}")



üîç Loading model: roberta-base


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ Extracting RoBERTa embeddings...

‚úÖ Saved RoBERTa embeddings to dataset/embeddings_transformer/RoBERTa_embeddings.npz
Total extracted embeddings: 3911
Embeddings dimension: 768
Total labels: 3911


In [4]:
# Sanity-check the saved archive: list its arrays and print their shapes.
# np.load returns an NpzFile that keeps the underlying file open, so it is
# used as a context manager to release the handle once the checks are done.
with np.load("dataset/embeddings_transformer/RoBERTa_embeddings.npz") as data:
    print("Keys in the file:", list(data.keys()))
    print("Embeddings shape:", data["embeddings"].shape)
    print("Labels shape:", data["labels"].shape)

Keys in the file: ['embeddings', 'labels']
Embeddings shape: (3911, 768)
Labels shape: (3911, 6)
