# Assignment 5: Model Training and Evaluation
## Text Classification for arXiv Paper Relevance

This notebook trains an embedding-based text classification model to predict the relevance of arXiv papers.

### Approach:
- **Embedding Model**: SentenceTransformer (all-MiniLM-L6-v2) for text embedding
- **Classification Model**: Logistic Regression on top of embeddings
- **Task**: Binary classification (relevant=1, not relevant=0)
- **Evaluation Metrics**: Accuracy, F1-score, Precision, Recall

## 1. Setup and Data Loading

In [1]:
# Install required packages
!pip install sentence-transformers scikit-learn pandas numpy matplotlib seaborn -q

In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Pin NumPy's global random state so NumPy-driven randomness repeats across runs
np.random.seed(seed=42)

# Reaching this line means every import above resolved
print("✓ All imports successful!")

✓ All imports successful!


In [3]:
# Load the labelled samples from the JSON file.
data_path = Path('./data/data.json')

# Decode as UTF-8 explicitly: open()'s default encoding is platform-dependent
# and can mis-decode (or fail on) non-ASCII characters in the abstracts.
with open(data_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Fail early with a clear message; the statistics below divide by the sample
# count and take min/max, neither of which is meaningful for an empty dataset.
if not data:
    raise ValueError(f"No samples found in {data_path}")

print(f"✓ Loaded {len(data)} samples")

# Extract texts and labels.
# Each record is read through its 'abstract' and 'relevance' keys.
texts = [item['abstract'] for item in data]
labels = np.array([item['relevance'] for item in data])

# Count each class once and reuse the counts for both the totals and percentages.
n_relevant = (labels == 1).sum()
n_not_relevant = (labels == 0).sum()

print(f"\nDataset Statistics:")
print(f"  Total samples: {len(texts)}")
print(f"  Relevant (1): {n_relevant} ({n_relevant/len(labels)*100:.1f}%)")
print(f"  Not relevant (0): {n_not_relevant} ({n_not_relevant/len(labels)*100:.1f}%)")

# Check text lengths (measured in whitespace-separated words)
text_lengths = [len(text.split()) for text in texts]
print(f"\nText Length Statistics:")
print(f"  Mean: {np.mean(text_lengths):.1f} words")
print(f"  Min: {np.min(text_lengths)} words")
print(f"  Max: {np.max(text_lengths)} words")

✓ Loaded 300 samples

Dataset Statistics:
  Total samples: 300
  Relevant (1): 53 (17.7%)
  Not relevant (0): 247 (82.3%)

Text Length Statistics:
  Mean: 176.0 words
  Min: 68 words
  Max: 276 words


## 2. Generate Embeddings

In [4]:
# Instantiate the sentence-embedding model
print("Loading SentenceTransformer model...")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✓ Model loaded successfully")

# Encode all abstracts in one call, with a progress bar
print(f"\nGenerating embeddings for {len(texts)} texts...")
embeddings = embedding_model.encode(texts, show_progress_bar=True)

# Report the shape of the resulting array and the size of its second axis
print(
    "✓ Embeddings generated",
    f"  Shape: {embeddings.shape}",
    f"  Dimension: {embeddings.shape[1]}",
    sep="\n",
)

Loading SentenceTransformer model...
✓ Model loaded successfully

Generating embeddings for 300 texts...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✓ Embeddings generated
  Shape: (300, 384)
  Dimension: 384


## 3. Data Splitting

In [5]:
# Partition into train (70%), validation (15%) and test (15%) in two stratified steps.
# Step 1: hold out 30% of the samples; the remaining 70% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    embeddings,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)

# Step 2: cut the held-out portion in half to obtain validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report every partition from one table so the three stanzas cannot drift apart.
split_total = len(embeddings)
split_table = (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
)

print("Data Split:")
for split_name, split_X, split_y in split_table:
    print(f"  {split_name}: {len(split_X)} samples ({len(split_X)/split_total*100:.1f}%)")

for split_name, split_X, split_y in split_table:
    print(f"\nLabel Distribution in {split_name}:")
    print(f"  Class 0: {(split_y == 0).sum()}")
    print(f"  Class 1: {(split_y == 1).sum()}")

Data Split:
  Train: 210 samples (70.0%)
  Validation: 45 samples (15.0%)
  Test: 45 samples (15.0%)

Label Distribution in Train:
  Class 0: 173
  Class 1: 37

Label Distribution in Validation:
  Class 0: 37
  Class 1: 8

Label Distribution in Test:
  Class 0: 37
  Class 1: 8


## 4. Model Training

In [6]:
# Fit the classifier on the training embeddings
print("Training Logistic Regression classifier...")
classifier = LogisticRegression(
    class_weight='balanced',  # reweight classes to counter the label imbalance
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)  # fit() returns the fitted estimator itself
print("✓ Model trained successfully")

Training Logistic Regression classifier...
✓ Model trained successfully


## 5. Validation Performance

In [7]:
# Score the trained classifier on the validation split
y_val_pred_proba = classifier.predict_proba(X_val)[:, 1]  # second column of the probability matrix
y_val_pred = classifier.predict(X_val)

# Every metric takes (true labels, predicted labels); evaluate them in one pass
val_accuracy, val_f1, val_precision, val_recall = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(
    "Validation Performance:",
    f"  Accuracy:  {val_accuracy:.4f}",
    f"  F1-score:  {val_f1:.4f}",
    f"  Precision: {val_precision:.4f}",
    f"  Recall:    {val_recall:.4f}",
    sep="\n",
)

Validation Performance:
  Accuracy:  0.8222
  F1-score:  0.6000
  Precision: 0.5000
  Recall:    0.7500


## 6. Save Model and Embeddings

In [8]:
# Make sure the output folder is present (no error if it already exists)
model_dir = Path('./models')
model_dir.mkdir(exist_ok=True)

# Both artifacts are written inside the output folder
classifier_path = model_dir / 'classifier.pkl'
config_path = model_dir / 'config.json'

# Persist the fitted classifier with pickle
with classifier_path.open('wb') as f:
    pickle.dump(classifier, f)
print(f"✓ Classifier saved to {classifier_path}")

# Record the embedding model name and the run's settings next to the classifier
config = {
    'embedding_model': 'all-MiniLM-L6-v2',
    'embedding_dimension': embeddings.shape[1],
    'classifier_type': 'LogisticRegression',
    'training_samples': len(X_train),
    'validation_samples': len(X_val),
    'test_samples': len(X_test),
    'class_weight': 'balanced'
}

with config_path.open('w') as f:
    json.dump(config, f, indent=2)
print(f"✓ Config saved to {config_path}")

✓ Classifier saved to models/classifier.pkl
✓ Config saved to models/config.json


## 7. Training Summary

In [9]:
# Print a recap of the run.
# Every figure is derived from the objects created in the earlier cells rather
# than typed in by hand, so the summary cannot go stale when the dataset, the
# split, the classifier settings or the output paths change.
n_samples = len(texts)
pct_negative = (labels == 0).sum() / len(labels) * 100
pct_positive = (labels == 1).sum() / len(labels) * 100
split_pcts = [len(part) / n_samples * 100 for part in (X_train, X_val, X_test)]

print("="*50)
print("TRAINING SUMMARY")
print("="*50)
print(f"\nDataset:")
print(f"  Total samples: {n_samples}")
print(f"  Train/Val/Test split: {split_pcts[0]:.0f}%/{split_pcts[1]:.0f}%/{split_pcts[2]:.0f}%")
print(f"  Class imbalance ratio: {pct_negative:.1f}% negative, {pct_positive:.1f}% positive")

print(f"\nModel:")
print(f"  Embedding: SentenceTransformer (all-MiniLM-L6-v2)")
print(f"  Embedding dimension: {embeddings.shape[1]}")
# !r reproduces the quoted form ('balanced') and stays readable for None or a dict
print(f"  Classifier: Logistic Regression with class_weight={classifier.class_weight!r}")

print(f"\nValidation Results:")
print(f"  Accuracy:  {val_accuracy:.4f}")
print(f"  F1-score:  {val_f1:.4f}")
print(f"  Precision: {val_precision:.4f}")
print(f"  Recall:    {val_recall:.4f}")

print(f"\nSaved Files:")
# Report the paths that were actually written by the save cell
print(f"  Classifier: {classifier_path}")
print(f"  Config: {config_path}")
print("="*50)

TRAINING SUMMARY

Dataset:
  Total samples: 300
  Train/Val/Test split: 70%/15%/15%
  Class imbalance ratio: 82.3% negative, 17.7% positive

Model:
  Embedding: SentenceTransformer (all-MiniLM-L6-v2)
  Embedding dimension: 384
  Classifier: Logistic Regression with class_weight='balanced'

Validation Results:
  Accuracy:  0.8222
  F1-score:  0.6000
  Precision: 0.5000
  Recall:    0.7500

Saved Files:
  Classifier: models/classifier.pkl
  Config: models/config.json
