In [3]:
import json
import re
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

from scipy.sparse import hstack


In [4]:
import os

# Resolve the dataset relative to the current working directory
# (assumed to be the project root when the notebook is launched).
BASE_DIR = os.getcwd()   # project root
DATA_PATH = os.path.join(BASE_DIR, "data", "problems_data.jsonl")


# Parse the JSON Lines file: one JSON object per non-empty line.
records = []
with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # A blank or whitespace-only line (for example an extra newline at the
        # end of the file) is not valid JSON and would make json.loads raise.
        if not line.strip():
            continue
        records.append(json.loads(line))

df = pd.DataFrame(records)
df.head()


Unnamed: 0,title,description,input_description,output_description,sample_io,problem_class,problem_score,url
0,Uuu,Unununium (Uuu) was the name of the chemical\n...,The input consists of one line with two intege...,The output consists of $M$ lines where the $i$...,"[{'input': '7 10', 'output': '1 2 2 3 1 3 3 4 ...",hard,9.7,https://open.kattis.com/problems/uuu
1,House Building,A number of eccentrics from central New York h...,"The input consists of $10$ test cases, which a...",Print $K$ lines with\n the positions of the...,"[{'input': '0 2 3 2 50 60 50 30 50 40', 'outpu...",hard,9.7,https://open.kattis.com/problems/husbygge
2,Mario or Luigi,Mario and Luigi are playing a game where they ...,,,"[{'input': '', 'output': ''}]",hard,9.6,https://open.kattis.com/problems/marioorluigi
3,The Wire Ghost,Å½ofka is bending a copper wire. She starts wit...,The first line contains two integers $L$ and $...,The output consists of a single line consistin...,"[{'input': '4 3 3 C 2 C 1 C', 'output': 'GHOST...",hard,9.6,https://open.kattis.com/problems/thewireghost
4,Barking Up The Wrong Tree,"Your dog Spot is let loose in the park. Well, ...",The first line of input consists of two intege...,Write a single line containing the length need...,"[{'input': '2 0 10 0 10 10', 'output': '14.14'...",hard,9.6,https://open.kattis.com/problems/barktree


In [5]:
# Quick sanity check of the loaded frame: dimensions first, then column labels.
frame_shape = df.shape
print("Shape:", frame_shape)

frame_columns = df.columns
print(frame_columns)


Shape: (4112, 8)
Index(['title', 'description', 'input_description', 'output_description',
       'sample_io', 'problem_class', 'problem_score', 'url'],
      dtype='object')


In [6]:
# Free-text fields that are merged into one document per problem.
text_cols = ["title", "description", "input_description", "output_description"]

# Create any absent text column as empty strings so the merge below
# never fails on a missing key.
absent_cols = [name for name in text_cols if name not in df.columns]
for name in absent_cols:
    df[name] = ""

# Missing values become empty strings so string concatenation stays defined.
df[text_cols] = df[text_cols].fillna("")

# Join the fields left to right, separated by single spaces.
first_col, *remaining_cols = text_cols
merged_text = df[first_col]
for name in remaining_cols:
    merged_text = merged_text + " " + df[name]
df["full_text"] = merged_text

# Preview the merged text next to both targets.
df[["full_text", "problem_class", "problem_score"]].head(100)


Unnamed: 0,full_text,problem_class,problem_score
0,Uuu Unununium (Uuu) was the name of the chemic...,hard,9.7
1,House Building A number of eccentrics from cen...,hard,9.7
2,Mario or Luigi Mario and Luigi are playing a g...,hard,9.6
3,The Wire Ghost Å½ofka is bending a copper wire....,hard,9.6
4,Barking Up The Wrong Tree Your dog Spot is let...,hard,9.6
...,...,...,...
95,Adding Up the Votes That last point is of part...,hard,9.0
96,Evading a Monster A monster is chasing you in ...,hard,9.0
97,Eccentric Excursion Eddy is planning a cross-c...,hard,8.9
98,Permutation CFG Consider a permutation of the ...,hard,8.9


In [7]:
def count_math_symbols(text):
    """Count the arithmetic and comparison characters in ``text``.

    Every single occurrence of ``+``, ``-``, ``*``, ``/``, ``=``, ``<`` or
    ``>`` adds one to the total, so a two-character operator such as ``<=``
    counts as two.

    Parameters
    ----------
    text : str
        String to scan.

    Returns
    -------
    int
        Number of matching characters; 0 when there are none.
    """
    symbol_hits = re.finditer(r"[+\-*/=<>]", text)
    return sum(1 for _ in symbol_hits)

# Two hand-crafted numeric features derived from the merged problem text.
full_text = df["full_text"]
df["text_length"] = full_text.map(len)                   # characters per problem
df["math_symbols"] = full_text.map(count_math_symbols)   # operator/comparison characters

# Summary statistics for the two new columns.
df.loc[:, ["text_length", "math_symbols"]].describe()


Unnamed: 0,text_length,math_symbols
count,4112.0,4112.0
mean,1625.10749,3.649319
std,756.724479,5.112129
min,10.0,0.0
25%,1114.0,1.0
50%,1515.0,2.0
75%,2001.5,5.0
max,7582.0,119.0


In [8]:
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler and the vectorizer are both fitted on every row of
# `df`, i.e. before the train/test splits performed in later cells, so the
# held-out rows contribute to the scaling statistics and to the TF-IDF
# vocabulary/IDF weights. Fitting on training rows only would require the
# split to happen first.

# Standardize the two hand-crafted numeric features.
scaler = StandardScaler()
X_extra_scaled = scaler.fit_transform(df[["text_length", "math_symbols"]])

# TF-IDF over unigrams and bigrams, capped at 5000 terms, with English
# stop words removed.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf = tfidf.fit_transform(df["full_text"])

# Column-wise concatenation of the text and numeric features. hstack returns
# COO by default, which cannot be row-indexed; request CSR explicitly so X
# can be sliced by row.
X = hstack([X_tfidf, X_extra_scaled], format="csr")


In [9]:
# Classification target: the categorical difficulty label.
y_class = df["problem_class"]

# 80/20 split with a fixed seed, stratified on the label.
clf_split = train_test_split(
    X, y_class, test_size=0.2, random_state=42, stratify=y_class
)
X_train, X_test, y_train, y_test = clf_split


In [10]:
from sklearn.svm import LinearSVC

# Linear SVM for the difficulty classes, with class weights balanced.
svm_clf = LinearSVC(
    C=1.0,
    max_iter=10000,      # higher iteration cap to fix convergence
    class_weight="balanced",
    # Seed the solver's data shuffling (used by the dual coordinate-descent
    # solver) so that re-running the notebook reproduces the same model.
    random_state=42
)

svm_clf.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [11]:
# Score the classifier on the held-out split.
y_pred = svm_clf.predict(X_test)

accuracy = 100 * accuracy_score(y_test, y_pred)   # fraction -> percent
class_confusion = confusion_matrix(y_test, y_pred)

print(f"Classification Accuracy: {accuracy:.2f}%")
print("Confusion Matrix:\n", class_confusion)

Classification Accuracy: 48.48%
Confusion Matrix:
 [[ 61  44  48]
 [ 48 238 103]
 [ 41 140 100]]


In [12]:
# Regression target: the numeric difficulty score.
y_score = df["problem_score"]

# 80/20 split with a fixed seed (no stratification). This rebinds
# X_train / X_test / y_train / y_test, replacing whatever split was
# previously held under those names.
reg_split = train_test_split(X, y_score, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = reg_split


In [13]:
# Random forest for the numeric score; hyperparameters gathered in one place.
rf_params = {
    "n_estimators": 200,
    "max_depth": 20,
    "random_state": 42,
    "n_jobs": -1,
}
rf_reg = RandomForestRegressor(**rf_params)

rf_reg.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Evaluate the regressor on the held-out split.
y_pred = rf_reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)   # square root of the mean squared error

print("MAE:", mae)
print("RMSE:", rmse)


MAE: 1.6975286162959766
RMSE: 2.041002000782361


In [15]:
import os
import joblib

# Destination folder for the fitted artifacts; created if it does not exist.
SAVE_DIR = "models"
os.makedirs(SAVE_DIR, exist_ok=True)


def _artifact_path(file_name):
    """Return the path of ``file_name`` inside SAVE_DIR."""
    return os.path.join(SAVE_DIR, file_name)


# Persist the vectorizer and both models.
joblib.dump(tfidf, _artifact_path("tfidf.pkl"))
joblib.dump(svm_clf, _artifact_path("svm_classifier.pkl"))
joblib.dump(rf_reg, _artifact_path("rf_regressor.pkl"))

# Only if you used scaler
joblib.dump(scaler, _artifact_path("scaler.pkl"))

print("Models saved successfully to:", SAVE_DIR)


Models saved successfully to: models
