### LLM Evaluation Project: Summary Scoring

#### Project Overview
This project develops a model that predicts content and wording scores for student-written summaries across various topics and genres. The full dataset includes roughly 24,000 summaries from students in grades 3-12, each scored on both content and wording; the training split loaded in this notebook contains 7,165 scored summaries.

#### 1. Data Analysis and Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Read the four input CSVs (train/test summaries and their prompts), in this order
train_summaries, train_prompts, test_summaries, test_prompts = (
    pd.read_csv(csv_name)
    for csv_name in (
        'summaries_train.csv',
        'prompts_train.csv',
        'summaries_test.csv',
        'prompts_test.csv',
    )
)

# Attach each summary's prompt columns by joining on the shared prompt_id key
train_data = train_summaries.merge(train_prompts, on='prompt_id')
test_data = test_summaries.merge(test_prompts, on='prompt_id')

In [2]:
# Number of missing values in each column of the merged training frame
train_data.isna().sum()

student_id         0
prompt_id          0
text               0
content            0
wording            0
prompt_question    0
prompt_title       0
prompt_text        0
dtype: int64

In [3]:
# Number of missing values in each column of the merged test frame
test_data.isna().sum()

student_id         0
prompt_id          0
text               0
prompt_question    0
prompt_title       0
prompt_text        0
dtype: int64

In [4]:
# Summary statistics of the two regression targets
train_data.loc[:, ['content', 'wording']].describe()

Unnamed: 0,content,wording
count,7165.0,7165.0
mean,-0.014853,-0.063072
std,1.043569,1.036048
min,-1.729859,-1.962614
25%,-0.799545,-0.87272
50%,-0.093814,-0.081769
75%,0.49966,0.503833
max,3.900326,4.310693


#### 2. Feature Engineering

In [5]:
import os

import nltk

# NLTK resources requested by this notebook
NLTK_PACKAGES = (
    'punkt',  # For tokenization
    'stopwords',  # For stopwords
    'averaged_perceptron_tagger',  # For POS tagging (if needed)
    'wordnet',  # For lemmatization (if needed)
)

# Download all required NLTK data
for package in NLTK_PACKAGES:
    nltk.download(package)

# Verify installation
try:
    nltk.data.find('tokenizers/punkt')
    print("Punkt tokenizer is available")
except LookupError:
    print("Punkt tokenizer not found - trying alternative download method")
    # Retry into the current user's home directory instead of a path that is
    # hard-coded to one machine, so the fallback works for whoever runs this.
    # NOTE(review): ~/nltk_data is expected to be on NLTK's default search
    # path - confirm against the NLTK data documentation.
    nltk.download('punkt', download_dir=os.path.join(os.path.expanduser('~'), 'nltk_data'))

[nltk_data] Downloading package punkt to /home/issa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/issa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/issa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/issa/nltk_data...


Punkt tokenizer is available


[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize 
# Request the tokenizer models and the stopword list before tokenizing below
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

def text_features(text):
    """Compute simple surface statistics for one summary.

    Args:
        text: The summary. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as a missing summary.

    Returns:
        dict with the keys
        ``word_count`` (number of ``word_tokenize`` tokens),
        ``char_count`` (length of the raw string),
        ``avg_word_length`` (``char_count / word_count``; since ``char_count``
        is the full string length this is characters per token, whitespace
        included, rather than a strict mean word length),
        ``lexical_diversity`` (distinct tokens / tokens, case-sensitive) and
        ``avg_sentence_length`` (tokens per ``sent_tokenize`` sentence).
        Every value is 0 for missing text or when processing a row fails.

    Raises:
        LookupError: If tokenization raises a plain ``LookupError`` (as NLTK
            does when a required resource is not installed). Swallowing it
            would silently turn every row into all-zero features.
    """
    feature_names = ('word_count', 'char_count', 'avg_word_length',
                     'lexical_diversity', 'avg_sentence_length')
    zero_features = dict.fromkeys(feature_names, 0)

    # A missing value must not be tokenized as the literal word "None"/"nan".
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return zero_features

    try:
        raw = str(text)  # Ensure text is string

        # Basic text statistics
        words = word_tokenize(raw)
        word_count = len(words)
        char_count = len(raw)
        # max(1, ...) guards the divisions for empty input
        avg_word_length = char_count / max(1, word_count)
        lexical_diversity = len(set(words)) / max(1, word_count)

        # Sentence-level statistics
        sentence_count = len(sent_tokenize(raw))
        avg_sentence_length = word_count / max(1, sentence_count)

        return {
            'word_count': word_count,
            'char_count': char_count,
            'avg_word_length': avg_word_length,
            'lexical_diversity': lexical_diversity,
            'avg_sentence_length': avg_sentence_length
        }
    except Exception as e:
        # A bare LookupError signals a setup problem affecting every row, so
        # let it surface. KeyError/IndexError are LookupError subclasses but
        # are row-level failures, so they stay best-effort like everything else.
        if isinstance(e, LookupError) and not isinstance(e, (KeyError, IndexError)):
            raise
        print(f"Error processing text: {e}")
        return zero_features

# Apply feature engineering
def _build_text_features(frame):
    """Return one row of ``text_features`` values per row of ``frame``, on the same index."""
    records = [text_features(summary) for summary in frame['text']]
    # Build the frame once from a list of dicts instead of one pd.Series per row.
    # dtype=float gives every feature column (counts included) the same float64 dtype.
    return pd.DataFrame(records, index=frame.index, dtype=float)


train_features = _build_text_features(train_data)
test_features = _build_text_features(test_data)

# Combine with original data. Feature columns left over from an earlier run of
# this cell are dropped first, so re-running the cell does not append duplicate
# columns to train_data / test_data.
train_data = pd.concat(
    [train_data.drop(columns=train_features.columns, errors='ignore'), train_features],
    axis=1,
)
test_data = pd.concat(
    [test_data.drop(columns=test_features.columns, errors='ignore'), test_features],
    axis=1,
)

[nltk_data] Downloading package punkt to /home/issa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/issa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/issa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### 3. Modeling Approach

#### A combination of traditional ML and transformer-based approaches will be used:

#### Option 1: Gradient Boosting (XGBoost/LightGBM)

In [7]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor

# Prepare data
# Identifier, raw-text and target columns that must not be fed to the model.
NON_FEATURE_COLS = ['student_id', 'prompt_id', 'text', 'content', 'wording',
                    'prompt_question', 'prompt_title', 'prompt_text']
X = train_data.drop(NON_FEATURE_COLS, axis=1)
y = train_data[['content', 'wording']]

# Train model (one LightGBM regressor per target column)
model = MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=500, random_state=42))
model.fit(X, y)

# Predict on test set
# errors='ignore' because the test frame has no 'content'/'wording' columns.
test_X = test_data.drop(columns=NON_FEATURE_COLS, errors='ignore')
# Fail early with a clear message if train and test features are not the same
# columns in the same order, rather than predicting on misaligned inputs.
if list(test_X.columns) != list(X.columns):
    raise ValueError(
        f"Test features {list(test_X.columns)} do not match training features {list(X.columns)}"
    )
predictions = model.predict(test_X)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001592 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 7165, number of used features: 5
[LightGBM] [Info] Start training from score -0.014853
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 7165, number of used features: 5
[LightGBM] [Info] Start training from score -0.063072


#### Option 2: Transformer-based Model (BERT)

In [8]:
# Custom dataset
class SummaryDataset(Dataset):
    """Dataset that tokenizes "Prompt: ... Summary: ..." strings for BERT.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (1-D tensors
    of length ``max_length``) and, when both score sequences were supplied,
    scalar float tensors ``content`` and ``wording``.

    Args:
        texts: Sequence of summaries.
        prompts: Sequence of prompt strings, aligned with ``texts``.
        content_scores: Optional sequence of content targets.
        wording_scores: Optional sequence of wording targets.
        tokenizer: Optional tokenizer to use; defaults to the
            ``bert-base-uncased`` tokenizer loaded via ``AutoTokenizer``.
        max_length: Padded/truncated sequence length (default 512).

    Raises:
        ValueError: If the supplied sequences differ in length.
    """

    def __init__(self, texts, prompts, content_scores=None, wording_scores=None,
                 tokenizer=None, max_length=512):
        # Store plain lists so __getitem__ always indexes by position. Indexing
        # a pandas Series with an integer is label-based, which raises KeyError
        # or returns the wrong row once the index is no longer 0..n-1 (e.g.
        # after a shuffle or a train/validation split).
        self.texts = list(texts)
        self.prompts = list(prompts)
        self.content_scores = None if content_scores is None else list(content_scores)
        self.wording_scores = None if wording_scores is None else list(wording_scores)

        expected = len(self.texts)
        for label, values in (('prompts', self.prompts),
                              ('content_scores', self.content_scores),
                              ('wording_scores', self.wording_scores)):
            if values is not None and len(values) != expected:
                raise ValueError(
                    f"{label} has {len(values)} items but texts has {expected}"
                )

        self.max_length = max_length
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        prompt = self.prompts[idx]

        # Combine prompt and text
        # NOTE(review): the summary comes last and truncation=True is applied to
        # the single combined string, so a long prompt presumably pushes the
        # summary past max_length and it gets cut off - confirm what `prompts`
        # holds and consider putting the summary first.
        combined = f"Prompt: {prompt} Summary: {text}"

        encoding = self.tokenizer(
            combined,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis; flatten to 1-D per item
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

        # Targets are attached only when both score sequences were provided
        if self.content_scores is not None and self.wording_scores is not None:
            item['content'] = torch.tensor(self.content_scores[idx], dtype=torch.float)
            item['wording'] = torch.tensor(self.wording_scores[idx], dtype=torch.float)

        return item

# Model architecture
class SummaryScorer(nn.Module):
    """Pretrained ``bert-base-uncased`` encoder with a small MLP regression head.

    The head maps the encoder's pooled output to two values per input
    (presumably the content and wording scores, in that order - confirm
    against the training loop).
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        encoder_width = self.bert.config.hidden_size
        head_layers = [
            nn.Linear(encoder_width, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 2),
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        """Encode the batch and apply the regression head to ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.regressor(encoded.pooler_output)

#### 4. Ensemble Approach
Combine predictions from both models for potentially better performance:

In [9]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Assuming you have predictions from two models:
# 1. LightGBM predictions (lgb_predictions)
# 2. BERT predictions (bert_predictions)

# Sample predictions (replace with your actual predictions)
# For demonstration, let's create dummy predictions.
# A seeded generator keeps these placeholder values identical on every run.
num_samples = len(test_data)
rng = np.random.default_rng(42)
lgb_predictions = rng.random((num_samples, 2))  # Random values in [0, 1)
bert_predictions = rng.random((num_samples, 2))  # Random values in [0, 1)

# Weighted ensemble (adjust weights based on validation performance)
ensemble_weights = {'lgb': 0.6, 'bert': 0.4}  # These should be tuned
# The blend below is only a weighted average if the weights sum to 1
if not np.isclose(sum(ensemble_weights.values()), 1.0):
    raise ValueError("ensemble_weights must sum to 1")

# Calculate weighted average
final_predictions = (
    ensemble_weights['lgb'] * lgb_predictions +
    ensemble_weights['bert'] * bert_predictions
)

# If you want to validate the ensemble performance (on validation set);
# note that mcrmse is defined in the "Evaluation Strategy" cell further down.
# y_val_true = ...  # Your true validation labels
# print(f"Ensemble MCRMSE: {mcrmse(y_val_true, final_predictions)}")

#### 5. Submission Preparation

In [10]:
# Prepare submission: student ids plus one column per predicted target
content_pred, wording_pred = predictions[:, 0], predictions[:, 1]
submission = test_data[['student_id']].assign(
    content=content_pred,
    wording=wording_pred,
)

# Write the CSV without the DataFrame index column
submission.to_csv('submission.csv', index=False)

#### 6. Evaluation Strategy

In [11]:
def mcrmse(y_true, y_pred):
    """Mean column-wise root mean squared error (MCRMSE).

    The RMSE is computed separately for each target column and the per-column
    values are averaged.

    Args:
        y_true: Array-like of true values, shape (n_samples, n_targets). Lists,
            NumPy arrays and pandas DataFrames are accepted; a 1-D input is
            treated as a single target column.
        y_pred: Array-like of predictions with the same shape as ``y_true``.

    Returns:
        float: The average of the per-column RMSE values.

    Raises:
        ValueError: If the shapes differ, the input is empty, or it has more
            than two dimensions.
    """
    # np.asarray lets DataFrames and nested lists be used directly
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("mcrmse requires at least one sample")
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
        y_pred = y_pred.reshape(-1, 1)
    elif y_true.ndim != 2:
        raise ValueError(f"Expected 1-D or 2-D input, got {y_true.ndim} dimensions")

    # axis=0 averages over samples, leaving one RMSE per target column
    column_rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return float(column_rmse.mean())

# Example usage
from sklearn.base import clone

# Score on the training data the model was fitted on. This is an optimistic
# number (the model has already seen these rows); keep it for reference only.
y_true = train_data[['content', 'wording']].values
y_pred = model.predict(X)
print(f"MCRMSE: {mcrmse(y_true, y_pred)}")

# Held-out estimate: refit an unfitted copy of the same model on 80% of the
# training rows and score it on the remaining 20%.
# NOTE(review): the split is random over rows, so summaries written for the
# same prompt can land on both sides - consider a prompt-grouped split if
# performance on unseen prompts is what matters.
X_fit, X_val, y_fit, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
val_model = clone(model).fit(X_fit, y_fit)
print(f"Validation MCRMSE: {mcrmse(y_val.values, val_model.predict(X_val))}")

MCRMSE: 0.37828621394977974
