Import statements

In [3]:
import os
import re
import threading

import findspark
import pyspark
import torch

from pyspark.sql.functions import udf, concat_ws, col
from pyspark.sql.types import StringType, StructType, StructField
from pyspark.streaming import StreamingContext

Import the CSV file

In [2]:
import pandas as pd

import os

# Load the CSV file.
# The location can be overridden with the ARXIV_CSV_PATH environment variable so the
# notebook is not tied to one machine; the default is the original hard-coded path.
file_path = os.environ.get(
    "ARXIV_CSV_PATH",
    "C:/Users/siebe/Dropbox/KUL/1ste master/Advanced Analytics in Business/assignements/assignement 3/output (1).csv",
)
df = pd.read_csv(file_path)

# Show basic info and the first few rows.
# info() prints by itself and returns None, so it is called on its own line instead of
# being bundled into a tuple (which rendered as "(None, <frame>)").
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   value   8763 non-null   object
dtypes: object(1)
memory usage: 68.6+ KB


(None,
                                                value
 0  {"aid": "http://arxiv.org/abs/2504.09896v1", "...
 1  {"aid": "http://arxiv.org/abs/2504.09912v1", "...
 2  {"aid": "http://arxiv.org/abs/2504.04742v1", "...
 3  {"aid": "http://arxiv.org/abs/2504.04758v1", "...
 4  {"aid": "http://arxiv.org/abs/2504.04721v1", "...)




---
Prepare the dataset

In [6]:
import json
import re

import sklearn
from sklearn.preprocessing import LabelEncoder

# Parse the JSON strings in the 'value' column and flatten them into one column per key
parsed_rows = df['value'].apply(json.loads)
structured_df = pd.json_normalize(parsed_rows)
# Show the structure (info() prints directly; it has no useful return value)
structured_df.info()

# Drop incomplete rows BEFORE normalizing: the normalization below uses `in` on the
# category string and would raise TypeError on a missing value.
structured_df = structured_df.dropna(subset=['title', 'summary', 'main_category']).copy()

# Normalize all main_category values to their top-level prefix (before '-' or '.').
# 'q-fin' and 'q-bio' contain a '-' themselves, so they are matched explicitly first.
structured_df['main_category'] = structured_df['main_category'].apply(lambda x: 'q-fin' if 'q-fin' in x else ('q-bio' if 'q-bio' in x else re.split(r'[-\.]', x)[0]))

# The model input is the title followed by the abstract
structured_df['text'] = structured_df['title'] + " " + structured_df['summary']
structured_df = structured_df[['text', 'main_category']].copy()
print(len(structured_df))

# Encode main_category labels as integers
label_encoder = LabelEncoder()
structured_df['label'] = label_encoder.fit_transform(structured_df['main_category'])

# Summary of the label distribution (kept in a variable for later inspection)
label_distribution = structured_df['main_category'].value_counts()

# Get the number of unique main categories
num_unique_categories = structured_df['main_category'].nunique()
print(f'Number of unique main categories: {num_unique_categories}')

# Get the unique main categories --> this checks that every main_category value was
# reduced to its top-level prefix, since that prefix is what we want to predict
unique_categories = structured_df['main_category'].unique()
print(f'unique categories: {unique_categories}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8763 entries, 0 to 8762
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   aid            8763 non-null   object
 1   title          8763 non-null   object
 2   summary        8763 non-null   object
 3   main_category  8763 non-null   object
 4   categories     8763 non-null   object
 5   published      8763 non-null   object
dtypes: object(6)
memory usage: 410.9+ KB
8763
Number of unique main categories: 15
unique categories: ['cs' 'eess' 'astro' 'physics' 'quant' 'math' 'cond' 'nucl' 'stat' 'hep'
 'q-bio' 'gr' 'q-fin' 'econ' 'nlin']


---
Set up and train the DistilBERT base uncased model. This is the transformer-based language model we will use for prediction.



In [None]:
import pandas as pd
import numpy as np

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Hugging Face Datasets & Transformers
from datasets import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# PyTorch
import torch

# Step 1: Recover the integer <-> category-name mapping from the already encoded frame.
# The 'label' column was produced by the fitted LabelEncoder in the preparation cell, so
# that encoder is NOT re-created here (doing so would replace it with an unfitted one).
label_pairs = (
    structured_df[["label", "main_category"]]
    .drop_duplicates()
    .sort_values("label")
)
id2label = {int(idx): name for idx, name in zip(label_pairs["label"], label_pairs["main_category"])}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

# Step 2: Split the dataset
train_df, test_df = train_test_split(structured_df, test_size=0.2, random_state=42)

# Step 3: Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Step 4: Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize a batch of rows, padding/truncating each text to the model's max length."""
    return tokenizer(batch["text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Step 5: Load model.
# id2label / label2id are stored in the model config and therefore in every saved
# checkpoint; without them a reloaded checkpoint only exposes "LABEL_<i>" placeholders,
# which cannot be compared against real category names at inference time.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Step 6: Define metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

def compute_metrics(p):
    """Return accuracy and weighted precision/recall/F1 for a Trainer evaluation result."""
    preds = torch.argmax(torch.tensor(p.predictions), axis=1)
    labels = torch.tensor(p.label_ids)
    # zero_division=0 keeps the default numeric result for classes that are never
    # predicted, without emitting a warning at every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Step 7: Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    # Mixed precision needs a CUDA device; fall back to full precision on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Step 8: Train
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

---
Checkpoint: When fine-tuning a Hugging Face model with the Trainer API (e.g. trainer.train()), checkpoints are saved automatically at regular intervals during training. Each checkpoint-* directory is a snapshot of the model at a certain training step. To load the model in the streaming environment, we therefore first need to determine which checkpoint to use. That is the purpose of the code below.

In [8]:
import os

# Directory the Trainer writes its checkpoint-* folders to
model_path = "./results"
checkpoint_entries = os.listdir(model_path)
print(checkpoint_entries)

['checkpoint-1374', 'checkpoint-1754', 'checkpoint-2631', 'checkpoint-458', 'checkpoint-877', 'checkpoint-916']


In [None]:
# Find the best checkpoint, so that we know which checkpoint to load in the deployed (streaming) setting
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np
import os

# Imported here so this cell does not depend on the training cell having been run.
from sklearn.model_selection import train_test_split

# Reproduce the exact train/test split used for training (same test_size and seed).
# Only the test part is needed; the fitted label_encoder and the tokenized training
# set from earlier cells are deliberately left untouched.
train_df, test_df = train_test_split(structured_df, test_size=0.2, random_state=42)

test_dataset = Dataset.from_pandas(test_df)

def compute_metrics(pred):
    """Return accuracy and weighted F1 for a Trainer evaluation result."""
    preds = np.argmax(pred.predictions, axis=1)
    labels = pred.label_ids
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
def tokenize(batch):
    """Tokenize a batch of rows, padding to the longest text in the batch."""
    return tokenizer(batch["text"], padding=True, truncation=True)

test_dataset = test_dataset.map(tokenize, batched=True)
# The model's forward() expects the target column to be called "labels"
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Loop over all checkpoints and evaluate against the defined metrics
results = []
checkpoints_dir = "./results"
checkpoints = [os.path.join(checkpoints_dir, d) for d in os.listdir(checkpoints_dir) if d.startswith("checkpoint")]
if not checkpoints:
    # Fail with a clear message instead of an IndexError on the empty result list below
    raise FileNotFoundError(f"No checkpoint-* directories found in {checkpoints_dir}")

for checkpoint in checkpoints:
    print(f"Evaluating {checkpoint}...")
    model = DistilBertForSequenceClassification.from_pretrained(checkpoint)
    trainer = Trainer(model=model, tokenizer=tokenizer, compute_metrics=compute_metrics)
    eval_result = trainer.evaluate(eval_dataset=test_dataset)
    results.append((checkpoint, eval_result))

# Lowest evaluation loss first; the first entry is the checkpoint to deploy
sorted_results = sorted(results, key=lambda x: x[1]["eval_loss"])
best_checkpoint, best_result = sorted_results[0]

print(f"\nBeste checkpoint: {best_checkpoint}")
print(f"Resultaten: {best_result}")



---
Set up the environment

In [2]:
# Resolve the Spark / Hadoop locations: environment variables win, otherwise fall back to
# directories next to the current working directory.
spark_home = os.environ.get("SPARK_HOME") or os.path.abspath(os.path.join(os.getcwd(), "..", "spark-3.5.5-bin-hadoop3"))
hadoop_home = os.environ.get("HADOOP_HOME") or os.path.abspath(os.path.join(os.getcwd(), "..", "winutils"))

if not os.path.exists(spark_home):
    # Raise instead of exit(1): in a notebook exit() shuts the kernel down, whereas an
    # exception stops this cell and shows the reason.
    raise FileNotFoundError(f"ERROR: SPARK_HOME does not exist: {spark_home}")

# On Windows, Spark needs winutils on the PATH
if os.name == "nt" and os.path.exists(hadoop_home):
    os.environ["HADOOP_HOME"] = hadoop_home
    os.environ["PATH"] = f"{os.path.join(hadoop_home, 'bin')};{os.environ.get('PATH', '')}"

print(f"Using SPARK_HOME: {spark_home}")
print(f"Using HADOOP_HOME: {hadoop_home}")

findspark.init(spark_home)

# Build the session FIRST and take the SparkContext from it. Memory and JVM options are
# only honoured when they are set before the JVM is launched; creating a SparkContext
# beforehand makes getOrCreate() reuse it and the settings below arrive too late.
# The application name matches the one the original SparkContext was created with.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("StreamingPaperClassifier")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.extraJavaOptions", "-verbose:gc")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("WARN")

Using SPARK_HOME: C:\Users\topsj\Desktop\spark\spark-3.5.5-bin-hadoop3
Using HADOOP_HOME: C:\Users\topsj\Desktop\spark\winutils


---
Global variables and model load function

In [3]:
# -------------------- Global Variables --------------------

# Module-level state shared by the model loader, the inference function and the
# per-batch processing function. Everything starts out "not loaded".
models_loaded = False
tokenizer = None
my_model = None
category_labels = []

# -------------------- Model Load Function --------------------
def _labels_in_index_order(id2label):
    """Return the label names of an id->label mapping ordered by numeric class index.

    Keys may be ints or digit strings. If a key cannot be interpreted as an integer,
    the values are returned in the mapping's own order.
    """
    try:
        ordered = sorted(id2label.items(), key=lambda item: int(item[0]))
    except (TypeError, ValueError):
        return list(id2label.values())
    return [name for _, name in ordered]


def load_model_and_labels():
    """Load the fine-tuned DistilBERT checkpoint and publish it via module globals.

    Does nothing when the global ``models_loaded`` flag is already set. On success the
    globals ``tokenizer``, ``my_model`` and ``models_loaded`` are updated, plus
    ``category_labels`` when the model config carries an ``id2label`` mapping.
    A missing model directory or any loading error is printed and leaves
    ``models_loaded`` unchanged; nothing is raised to the caller for those cases.
    """
    from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

    if globals()['models_loaded']:
        return

    print("Loading model and tokenizer...")
    try:
        model_path = "results/checkpoint-2631"
        if not os.path.exists(model_path):
            print(f"Model path not found: {model_path}")
            return

        tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
        model = DistilBertForSequenceClassification.from_pretrained(model_path)
        model.eval()

        id2label = getattr(model.config, 'id2label', None) or {}
        if id2label:
            # Order by numeric index so that position i in the list is class i. Looking
            # keys up as str(i) fails when the keys are ints, which previously sent every
            # call into the unordered fallback.
            sorted_labels = _labels_in_index_order(id2label)

            if all(str(name).startswith("LABEL_") for name in sorted_labels):
                # Placeholder names mean the checkpoint was saved without a real mapping;
                # they will never equal a category name in the accuracy comparison.
                print("WARNING: id2label only contains placeholder names (LABEL_<i>); "
                      "predicted names will not match real category names.")

            globals().update({
                'tokenizer': tokenizer,
                'my_model': model,
                'category_labels': sorted_labels,
                'models_loaded': True
            })
            print(f"Model loaded with labels: {sorted_labels}")
        else:
            print("WARNING: No id2label found in config. Predictions will be indices.")
            globals().update({
                'tokenizer': tokenizer,
                'my_model': model,
                'models_loaded': True
            })

    except Exception as e:
        print(f"Error loading model: {e}")
        import traceback
        traceback.print_exc()

---
Inference function

In [4]:
def run_inference(text_list, batch_size=32):
    """Predict a class index for every text using the globally loaded model.

    Args:
        text_list: list of input strings.
        batch_size: number of texts tokenized and forwarded through the model at once.
            Chunking bounds the size of the padded input tensor, so peak memory no
            longer grows with the size of the whole streaming batch.

    Returns:
        A list with one predicted class index per input text, in input order.
        An empty list when the model is not loaded, when ``text_list`` is empty, or
        when any error occurs during tokenization or inference (the error is printed).
    """
    if not globals()['models_loaded']:
        print("Model not loaded. Skipping inference.")
        return []

    if not text_list:
        # Nothing to predict; avoid calling the tokenizer with an empty batch
        return []

    try:
        tokenize_fn = globals()['tokenizer']
        classifier = globals()['my_model']
        predictions = []
        with torch.no_grad():
            for start in range(0, len(text_list), batch_size):
                chunk = text_list[start:start + batch_size]
                tokens = tokenize_fn(chunk, padding=True, truncation=True, return_tensors="pt", max_length=512)
                outputs = classifier(**tokens)
                predictions.extend(torch.argmax(outputs.logits, dim=1).tolist())
        return predictions
    except Exception as e:
        print(f"Inference error: {e}")
        import traceback
        traceback.print_exc()
        return []

---
Normalize category function

In [5]:
def normalize_category_func(category_str):
    """Reduce a category string to its top-level prefix.

    Returns None for None. Any string containing "q-fin" maps to "q-fin" and, failing
    that, any string containing "q-bio" maps to "q-bio". Every other string is cut at
    the first '-' or '.'.
    """
    if category_str is None:
        return None

    # These two prefixes contain a '-' themselves, so they are matched before splitting
    for special_prefix in ("q-fin", "q-bio"):
        if special_prefix in category_str:
            return special_prefix

    top_level, *_rest = re.split(r'[-\.]', category_str)
    return top_level

# Spark UDF wrapper around normalize_category_func, returning a string column
normalize_category_udf = udf(f=normalize_category_func, returnType=StringType())

---
JSON schema we follow

In [6]:
# -------------------- Schema --------------------
# Every field of an incoming record is read as a nullable string.
json_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("aid", "categories", "main_category", "published", "summary", "title")
])

---
Process function

In [7]:
def process(time, rdd):
    """Classify one streaming micro-batch and print its accuracy.

    Args:
        time: batch timestamp handed over by ``foreachRDD``; only used in the log line.
        rdd: RDD of JSON strings for this batch.

    The model is loaded on first use. Each record is parsed with ``json_schema``, its
    category is normalized, title and summary are joined into one text, and the
    predictions are compared with the normalized category. Every early exit prints its
    reason. Errors raised while handling the batch are printed and not re-raised.
    """
    print(f"\n=== Processing batch at {time} ===")

    state = globals()
    if not state['models_loaded']:
        load_model_and_labels()
        if not state['models_loaded']:
            print("Model not loaded. Skipping batch.")
            return

    try:
        if rdd.isEmpty():
            print("Empty RDD. Skipping.")
            return

        parsed = spark.read.json(rdd, schema=json_schema)
        if parsed.rdd.isEmpty():
            print("Parsed DataFrame is empty.")
            return

        # Same feature construction as in training: normalized label plus "title summary"
        prepared = (
            parsed
            .withColumn("normalized_main_category", normalize_category_udf(col("main_category")))
            .withColumn("text", concat_ws(" ", col("title"), col("summary")))
            .select("text", "normalized_main_category")
            .dropna()
        )

        if prepared.rdd.isEmpty():
            print("No valid rows after preprocessing.")
            return

        batch_pdf = prepared.toPandas()
        if batch_pdf.empty:
            print("Empty pandas DataFrame.")
            return

        predicted_indices = run_inference(batch_pdf["text"].tolist())
        if not predicted_indices or len(predicted_indices) != len(batch_pdf):
            print("Mismatch or no predictions.")
            return

        batch_pdf["predicted_category_idx"] = predicted_indices
        label_names = state['category_labels']

        def index_to_name(idx):
            # Indices outside the known label list get a recognisable placeholder
            if 0 <= idx < len(label_names):
                return label_names[idx]
            return f"unknown_idx_{idx}"

        if label_names:
            batch_pdf["predicted_category_name"] = batch_pdf["predicted_category_idx"].apply(index_to_name)
        else:
            # No label names available: fall back to the index rendered as a string
            batch_pdf["predicted_category_name"] = batch_pdf["predicted_category_idx"].astype(str)

        is_hit = batch_pdf["normalized_main_category"] == batch_pdf["predicted_category_name"]
        correct = is_hit.sum()
        total = len(batch_pdf)
        accuracy = correct / total if total else 0

        print(f"Accuracy: {accuracy:.4f} ({correct}/{total})")
        print("Sample Predictions:")
        print(batch_pdf[["normalized_main_category", "predicted_category_name"]].head())

    except Exception as e:
        print(f"Batch processing error: {e}")
        import traceback
        traceback.print_exc()

---
Streaming

In [8]:
# Length of one streaming micro-batch in seconds. Named so that the shutdown code, which
# derives its join timeout from BATCH_INTERVAL_SECONDS, has a defined value to read.
BATCH_INTERVAL_SECONDS = 30
ssc = StreamingContext(sc, BATCH_INTERVAL_SECONDS)




=== Processing batch at 2025-05-16 20:47:30 ===


In [9]:
# Read newline-delimited text from the remote socket and hand every micro-batch to process()
lines = ssc.socketTextStream(hostname="seppe.net", port=7778)
lines.foreachRDD(func=process)

---
Threading for streaming

In [10]:
class StreamingThread(threading.Thread):
    """Background thread that starts a StreamingContext and keeps it alive until stopped.

    The context is stopped (gracefully, leaving the SparkContext running) either when
    ``stop_stream()`` is called or when ``run()`` exits. Both paths go through
    ``_stop_context``, which guarantees the context is stopped at most once.
    """

    def __init__(self, ssc_instance):
        super().__init__()
        self.ssc_instance = ssc_instance
        self._stop_event = threading.Event()
        # Serializes the two possible stop paths (caller thread and this thread)
        self._stop_lock = threading.Lock()
        self._context_stopped = False

    def _stop_context(self, only_if_active):
        """Stop the streaming context once; later or concurrent calls are no-ops.

        When ``only_if_active`` is true the context is only stopped if it currently
        reports the active state.
        """
        with self._stop_lock:
            if self._context_stopped:
                return
            if only_if_active and self.ssc_instance.getState() != StreamingContext.STATE_ACTIVE:
                return
            self.ssc_instance.stop(stopSparkContext=False, stopGraceFully=True)
            # Only mark as stopped after stop() returned without raising
            self._context_stopped = True

    def run(self):
        """Start the context and wait, polling once per second, until a stop is requested."""
        try:
            print("Starting streaming context...")
            self.ssc_instance.start()
            while not self._stop_event.is_set():
                self._stop_event.wait(1)
        except Exception as e:
            print(f"Streaming error: {e}")
            import traceback
            traceback.print_exc()
        finally:
            self._stop_context(only_if_active=False)

    def stop_stream(self):
        """Ask the thread to finish and stop the streaming context if it is active."""
        print("Stopping streaming context...")
        self._stop_event.set()
        self._stop_context(only_if_active=True)

In [None]:
if __name__ == "__main__":
    ssc_t = StreamingThread(ssc)
    # Allow two batch intervals plus a small margin for the graceful shutdown.
    # Falls back to 30 seconds (the interval the StreamingContext is created with) when
    # BATCH_INTERVAL_SECONDS is not defined, instead of failing with a NameError.
    shutdown_timeout = globals().get("BATCH_INTERVAL_SECONDS", 30) * 2 + 5
    try:
        ssc_t.start()
        print("Streaming started. Press Ctrl+C to stop.")
        # Join in short slices so a KeyboardInterrupt is noticed promptly
        while ssc_t.is_alive():
            ssc_t.join(timeout=1.0)
    except KeyboardInterrupt:
        print("\nKeyboardInterrupt received. Stopping...")
    finally:
        if ssc_t.is_alive():
            ssc_t.stop_stream()
            ssc_t.join(timeout=shutdown_timeout)
        print("Shutdown complete.")

Starting streaming context...Streaming started. Press Ctrl+C to stop.



In [None]:
# StreamingThread exposes stop_stream(); threading.Thread has no stop() method
ssc_t.stop_stream()