In [1]:
import json
import re
import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import mean_absolute_error, mean_squared_error

from scipy.sparse import hstack


In [2]:
import json
import pandas as pd

DATA_PATH = "data/problems_data.jsonl"

with open(DATA_PATH, "r", encoding="utf-8-sig") as f:
    content = f.read().strip()

# Case 1: JSON array
if content.startswith("["):
    records = json.loads(content)

# Case 2: line-delimited JSON
else:
    records = []
    for i, line in enumerate(content.splitlines(), start=1):
        line = line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping bad JSON at line {i}: {e}")

df = pd.DataFrame(records)
print("Loaded records:", len(df))
df.head()


Loaded records: 0


In [3]:
print("Shape:", df.shape)
print(df.columns)


Shape: (0, 0)
RangeIndex(start=0, stop=0, step=1)


In [4]:
text_cols = ["title", "description", "input_description", "output_description"]

for col in text_cols:
    if col not in df.columns:
        df[col] = ""

df[text_cols] = df[text_cols].fillna("")

df["full_text"] = (
    df["title"] + " " +
    df["description"] + " " +
    df["input_description"] + " " +
    df["output_description"]
)

df[["full_text", "problem_class", "problem_score"]].head(100)


KeyError: "['problem_class', 'problem_score'] not in index"

In [None]:
def count_math_symbols(text):
    return len(re.findall(r"[+\-*/=<>]", text))

df["text_length"] = df["full_text"].apply(len)
df["math_symbols"] = df["full_text"].apply(count_math_symbols)

df[["text_length", "math_symbols"]].describe()


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_extra_scaled = scaler.fit_transform(df[["text_length", "math_symbols"]])
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_tfidf = tfidf.fit_transform(df["full_text"])

X = hstack([X_tfidf, X_extra_scaled])


In [None]:
y_class = df["problem_class"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    random_state=42,
    stratify=y_class
)


In [None]:
from sklearn.svm import LinearSVC

svm_clf = LinearSVC(
    C=1.0,
    max_iter=10000,      # ðŸ”¥ fix convergence
    class_weight="balanced"
)

svm_clf.fit(X_train, y_train)


In [None]:
y_pred = svm_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred) * 100

print(f"Classification Accuracy: {accuracy:.2f}%")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
y_score = df["problem_score"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_score,
    test_size=0.2,
    random_state=42
)


In [None]:
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

rf_reg.fit(X_train, y_train)


In [None]:
y_pred = rf_reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("MAE:", mae)
print("RMSE:", rmse)


In [None]:
import os
import joblib

SAVE_DIR = "models"
os.makedirs(SAVE_DIR, exist_ok=True)


joblib.dump(tfidf, os.path.join(SAVE_DIR, "tfidf.pkl"))
joblib.dump(svm_clf, os.path.join(SAVE_DIR, "svm_classifier.pkl"))
joblib.dump(rf_reg, os.path.join(SAVE_DIR, "rf_regressor.pkl"))

# Only if you used scaler
joblib.dump(scaler, os.path.join(SAVE_DIR, "scaler.pkl"))

print("Models saved successfully to:", SAVE_DIR)


In [None]:
with open("data/problems_data.jsonl", "r", encoding="utf-8", errors="replace") as f:
    for i in range(20):
        print(f"{i+1}: {repr(f.readline())}")


In [None]:
import os
os.path.getsize("data/problems_data.jsonl")



In [None]:
import json
import pandas as pd

records = []
with open("data/problems_data.jsonl", "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        records.append(json.loads(line))

df = pd.DataFrame(records)
print("Loaded records:", len(df))


In [None]:
import joblib
joblib.load("models/rf_regressor.pkl")


In [5]:
print(df.columns.tolist())


['title', 'description', 'input_description', 'output_description', 'full_text']
