Using Hugging Face datasets

1. Loading dataset

In [None]:
!pip install datasets

from datasets import load_dataset

dataset = load_dataset("open-r1/codeforces", split="train")
print(dataset)


In [None]:
import re
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    mean_absolute_error,
    mean_squared_error
)

from scipy.sparse import hstack

In [None]:
# Look at a single raw record to see which fields each problem carries.
first_record = dataset[0]
print(first_record)

2. Creating a dataframe from the dataset

In [None]:
# Materialise the dataset as a pandas DataFrame and keep only the columns
# needed downstream: the four text fields plus the numeric rating.
# (A `df.head()` in the middle of a cell is never displayed, so only the
# final preview is kept.)
df = pd.DataFrame(dataset)[
    ['title', 'description', 'input_format', 'output_format', 'rating']
]

df.head()


3. Cleaning and transforming the data as needed

In [None]:
# Bucket a numeric rating into a coarse difficulty label.
def rating_to_class(rating):
    """Return the difficulty label for a numeric rating.

    Parameters
    ----------
    rating : number or missing value
        The problem rating. Missing values (anything `pd.isna` accepts,
        e.g. None or NaN) are passed through as NaN.

    Returns
    -------
    str or float
        "Easy" for ratings up to and including 1200, "Medium" for ratings
        above 1200 up to and including 1800, "Hard" above 1800, and
        `np.nan` when the rating is missing.
    """
    # Missing ratings stay missing so they can be dropped later.
    if pd.isna(rating):
        return np.nan
    # Check from the hardest bucket down; each guard returns immediately.
    if rating > 1800:
        return "Hard"
    if rating > 1200:
        return "Medium"
    return "Easy"

# Derive both prediction targets from the raw rating in one step:
#   - problem_class: categorical label produced by rating_to_class
#   - problem_score: the numeric rating itself
# The original 'rating' column is then removed, since 'problem_score'
# carries the same values.
df = df.assign(
    problem_class=df['rating'].apply(rating_to_class),
    problem_score=df['rating'],
).drop(columns=['rating'])

df.head()


In [None]:
# Rename the two "*_format" columns to the "*_description" names used by
# the rest of the notebook.
column_renames = {
    'input_format': 'input_description',
    'output_format': 'output_description',
}
df = df.rename(columns=column_renames)

df.head()


In [None]:
# Drop rows whose description is missing, empty, or only whitespace.
# A missing description turns into NaN after `.str.strip()`, and `NaN != ''`
# evaluates to True, so without the fillna('') those rows would slip through
# the "empty description" filter.
has_description = df['description'].fillna('').str.strip() != ''
df = df[has_description]

# Drop rows with missing labels, then renumber the index once at the end.
df = df.dropna(subset=['problem_class', 'problem_score']).reset_index(drop=True)


print("Cleaned dataset shape:", df.shape)


In [None]:
# Print the number of missing values left in each label column.
for label_col in ('problem_class', 'problem_score'):
    print(df[label_col].isna().sum())


In [None]:
# Sanity-check the model inputs for missing values.
# X, y_class and y_score are only created in later cells, so referencing them
# here raised NameError on a fresh "Restart Kernel -> Run All". The checks are
# therefore done on the DataFrame columns those objects are built from.
text_cols = ['title', 'description', 'input_description', 'output_description']
print("NaN in text columns:", df[text_cols].isna().sum().sum())
print("NaN in y_class:", df['problem_class'].isna().sum())
print("NaN in y_score:", df['problem_score'].isna().sum())


In [None]:
# Expected columns at this point:
#   'title', 'description', 'input_description', 'output_description',
#   'problem_class', 'problem_score'

# Merge the four text fields into a single space-separated string per row.
# Missing fields are replaced by '' first so the join never sees NaN.
text_columns = ['title', 'description', 'input_description', 'output_description']
df['combined_text'] = (
    df[text_columns]
    .fillna('')
    .apply(lambda row: ' '.join(row), axis=1)
)

# ====== Step 2: Clean the combined text ======
def clean_text(text):
    """Normalise a text string for feature extraction.

    The text is lower-cased, every run of whitespace is turned into a single
    space, and every character other than a-z, 0-9, the math symbols
    ``+ - * / = ^ < > %`` and the space is removed. Whitespace is then
    normalised a second time, because removing a character that sat between
    two spaces (e.g. the comma in "a , b") would otherwise leave a double
    space, and removing a trailing one (e.g. "end .") a trailing space.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text, with single spaces between tokens and no leading
        or trailing whitespace.
    """
    text = text.lower()
    # Turn tabs/newlines into spaces BEFORE filtering, so they separate words
    # instead of being deleted and gluing neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    # Keep letters, digits, math symbols and spaces only.
    text = re.sub(r"[^a-z0-9+\-*/=^<>% ]", "", text)
    # Only plain spaces are left as whitespace; collapse the gaps opened up by
    # the removed characters and trim the ends.
    return re.sub(r' {2,}', ' ', text).strip()

# Replace the combined text with its cleaned form.
df['combined_text'] = df['combined_text'].map(clean_text)

# ====== Step 3: Create additional features ======
# 1. Size of the cleaned text: number of characters and number of
#    whitespace-separated words.
df['text_length'] = df['combined_text'].map(len)
df['word_count'] = df['combined_text'].map(lambda text: len(text.split()))

# 2. Total number of occurrences of any of the math symbols below.
math_symbols = '+-*/=^<>%'
df['math_symbols'] = df['combined_text'].map(
    lambda text: sum(text.count(symbol) for symbol in math_symbols)
)

# 3. Keyword frequency
# One feature column per keyword, named "kw_<keyword with spaces as underscores>".
keywords = [
    # Core paradigms
    'dp', 'dynamic programming', 'recursion', 'backtracking',
    'greedy', 'divide and conquer',

    # Graphs & trees
    'graph', 'tree', 'dfs', 'bfs',
    'shortest path', 'dijkstra', 'bellman ford',
    'topological', 'mst', 'lca',

    # Data structures
    'segment tree', 'fenwick', 'binary indexed tree',
    'heap', 'priority queue', 'deque',

    # Searching & optimization
    'binary search', 'two pointers', 'sliding window',

    # Advanced / hard signals
    'bitmask', 'bit manipulation',
    'flow', 'max flow', 'min cost',
    'matching',
    'combinatorics', 'probability',
    'modulo',
]

# NOTE(review): str.count matches substrings, not whole words, so e.g. 'tree'
# is also counted inside 'street' and 'flow' inside 'overflow'.
for kw in keywords:
    df["kw_" + kw.replace(' ', '_')] = df['combined_text'].map(
        lambda text, needle=kw: text.count(needle)
    )


# ====== Step 4: TF-IDF vectors ======
# Unigrams and bigrams, English stop words removed, at most 5000 features.
# NOTE(review): the vectorizer is fitted on the whole DataFrame, i.e. before
# the train/test split done in a later cell, so the test rows also contribute
# to the fitted vocabulary and weights.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
)

X_tfidf = tfidf.fit_transform(df['combined_text'])

# ====== Step 5: Combine TF-IDF with extra features ======
from sklearn.preprocessing import StandardScaler

# The hand-crafted columns: three size/symbol counts followed by one
# "kw_*" column per keyword, in the same order as the `keywords` list.
extra_feature_cols = ['text_length', 'word_count', 'math_symbols'] + [
    f"kw_{kw.replace(' ', '_')}" for kw in keywords
]
extra_features = df[extra_feature_cols].to_numpy()

# Standardise the hand-crafted columns before joining them to the TF-IDF
# matrix.
scaler = StandardScaler()
extra_features_scaled = scaler.fit_transform(extra_features)

# Final design matrix: TF-IDF columns first, scaled extra features after.
X = hstack([X_tfidf, extra_features_scaled])

print("Feature matrix shape:", X.shape)
print("Preview dataframe with extra features:")
df.head()


In [None]:
import os
import joblib

# Persist the fitted preprocessing objects (TF-IDF vectorizer and scaler)
# into the "models" directory, creating it if it does not exist yet.
model_dir = "models"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(tfidf, "models/tfidf.pkl")
joblib.dump(scaler, "models/scaler.pkl")



In [None]:
from sklearn.model_selection import train_test_split

# Targets: the categorical difficulty label and the numeric score.
y_class = df['problem_class']
y_score = df['problem_score']

# Split the features and BOTH targets in a single call so that all arrays are
# cut along the same rows. Previously the score targets came from a second,
# non-stratified split; because the first split was stratified, the two calls
# selected different rows and y_score_train / y_score_test did not correspond
# to X_train / X_test.
(
    X_train, X_test,
    y_class_train, y_class_test,
    y_score_train, y_score_test,
) = train_test_split(
    X, y_class, y_score,
    test_size=0.2, random_state=42, stratify=y_class
)


CLASSIFICATION

In [None]:
import os

import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a logistic-regression classifier on the training rows.
clf = LogisticRegression(max_iter=1000, n_jobs=-1)
clf.fit(X_train, y_class_train)

# Evaluate on the held-out rows.
preds = clf.predict(X_test)
accuracy = accuracy_score(y_class_test, preds)
conf_matrix = confusion_matrix(y_class_test, preds)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

# Save the fitted classifier next to the other model artifacts.
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/classifier.pkl")



RIDGE REGRESSION

In [None]:
from sklearn.linear_model import Ridge
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# The regressor is trained on log1p(score); predictions are mapped back to
# the original scale with expm1 below.
y_train_log = np.log1p(y_score_train)
y_test_log = np.log1p(y_score_test)

reg = Ridge(alpha=10.0)
reg.fit(X_train, y_train_log)

# Back-transform to the score scale and clamp to [800, 3500]
# (presumably the valid rating range -- TODO confirm).
preds = np.clip(np.expm1(reg.predict(X_test)), 800, 3500)

# Errors are measured on the original (non-log) score scale.
mae = mean_absolute_error(y_score_test, preds)
rmse = np.sqrt(mean_squared_error(y_score_test, preds))
print("MAE:", mae)
print("RMSE:", rmse)
joblib.dump(reg, "models/regressor.pkl")


