In [3]:
pip install pandas matplotlib seaborn

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
   ---------------------------------------- 0.0/294.9 kB ? eta -:--:--
   - -------------------------------------- 10.2/294.9 kB ? eta -:--:--
   ----- --------------------------------- 41.0/294.9 kB 653.6 kB/s eta 0:00:01
   --------------------------------- ------ 245.8/294.9 kB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 294.9/294.9 kB 2.3 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd

# load data
train = pd.read_csv("train.csv", nrows=200000)   # sample first 200k
questions = pd.read_csv("questions.csv")
print("train shape (sample):", train.shape)
print(train.columns)
print(train.head())
print(train['answered_correctly'].value_counts(normalize=True).round(3))
print(questions.head())

train shape (sample): (200000, 10)
Index(['row_id', 'timestamp', 'user_id', 'content_id', 'content_type_id',
       'task_container_id', 'user_answer', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation'],
      dtype='object')
   row_id  timestamp  user_id  content_id  content_type_id  task_container_id  \
0       0          0      115        5692                0                  1   
1       1      56943      115        5716                0                  2   
2       2     118363      115         128                0                  0   
3       3     131167      115        7860                0                  3   
4       4     137965      115        7922                0                  4   

   user_answer  answered_correctly  prior_question_elapsed_time  \
0            3                   1                          NaN   
1            2                   1                      37000.0   
2            0                   1         

In [13]:
import json
with open("train_mathqa.json","r") as f:
    data = json.load(f)   # or json.loads per file format
print(type(data), len(data))
print(data[0].keys())
# print one question to inspect format
print(data[0])

<class 'list'> 29837
dict_keys(['Problem', 'Rationale', 'options', 'correct', 'annotated_formula', 'linear_formula', 'category'])
{'Problem': "the banker ' s gain of a certain sum due 3 years hence at 10 % per annum is rs . 36 . what is the present worth ?", 'Rationale': '"explanation : t = 3 years r = 10 % td = ( bg × 100 ) / tr = ( 36 × 100 ) / ( 3 × 10 ) = 12 × 10 = rs . 120 td = ( pw × tr ) / 100 ⇒ 120 = ( pw × 3 × 10 ) / 100 ⇒ 1200 = pw × 3 pw = 1200 / 3 = rs . 400 answer : option a"', 'options': 'a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d ) rs . 350 , e ) none of these', 'correct': 'a', 'annotated_formula': 'divide(multiply(const_100, divide(multiply(36, const_100), multiply(3, 10))), multiply(3, 10))', 'linear_formula': 'multiply(n2,const_100)|multiply(n0,n1)|divide(#0,#1)|multiply(#2,const_100)|divide(#3,#1)|', 'category': 'gain'}


In [16]:
# create smaller sample dataset with up to 2000 unique users
unique_users = train['user_id'].nunique()
print("Unique users in sample:", unique_users)

# pick min of 2000 or available users
users = train['user_id'].drop_duplicates().sample(n=min(2000, unique_users), random_state=1)

sample_df = train[train['user_id'].isin(users)]
sample_df.to_csv("sample_train.csv", index=False)

Unique users in sample: 719


In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import joblib
from sklearn.preprocessing import LabelEncoder
import os

# Load sample training data
train = pd.read_csv("sample_train.csv")
questions = pd.read_csv("questions.csv")

# Merge question metadata
train = train.merge(questions[['question_id', 'part']], left_on="content_id", right_on="question_id", how="left")
train = train[train["answered_correctly"] != -1]

# --- Handle Missing Values ---
# prior_question_elapsed_time: fill NaN with median
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(train['prior_question_elapsed_time'].median())

# prior_question_had_explanation: map True/False, NaN → 0
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].map({True:1, False:0})
train['prior_question_had_explanation'] = train['prior_question_had_explanation'].fillna(0)

# part: categorical (fill NaN with -1)
train['part'] = train['part'].fillna(-1)

# --- 3. Build simple aggregated features (content and user statistics) ---
# content_avg_correct: how often this question is answered correctly (proxy difficulty)
content_avg = train.groupby("content_id")["answered_correctly"].mean().to_dict()
train["content_avg_correct"] = train["content_id"].map(content_avg).fillna(train["answered_correctly"].mean())

# user_avg_correct: user's historical accuracy (use current data as proxy; in full pipeline compute cumulatively)
user_avg = train.groupby("user_id")["answered_correctly"].mean().to_dict()
train["user_avg_correct"] = train["user_id"].map(user_avg).fillna(train["answered_correctly"].mean())

# Save processed sample (with content_avg and user_avg) to use in app for lookups
# os.makedirs("data/riiid/processed", exist_ok=True)
train.to_csv("processed_sample.csv", index=False)

print("Saved processed sample -> processed_sample.csv")

Saved processed sample -> processed_sample.csv


In [21]:
# Convert to DataFrame
df = pd.DataFrame(data)
print(df.head())
print("Columns:", df.columns)

# Keep useful fields
# Problem (question), options, correct answer, rationale
df = df[['Problem', 'options', 'correct', 'Rationale', 'category']]

# Map to your 10 topics (basic keyword mapping function)
def map_topic(text):
    q = str(text).lower()
    if any(w in q for w in ["algebra", "equation", "linear"]):
        return "Algebra"
    if any(w in q for w in ["triangle", "circle", "geometry", "angle"]):
        return "Geometry"
    if any(w in q for w in ["sin", "cos", "tan", "trig"]):
        return "Trigonometry"
    if any(w in q for w in ["derivative", "integral", "limit", "calculus"]):
        return "Calculus"
    if "probability" in q or "chance" in q:
        return "Probability"
    if any(w in q for w in ["mean", "median", "mode", "variance", "statistics"]):
        return "Statistics"
    if any(w in q for w in ["prime", "gcd", "lcm", "number theory"]):
        return "Number Theory"
    if any(w in q for w in ["matrix", "vector", "linear algebra"]):
        return "Linear Algebra"
    if "differential" in q or "dy/dx" in q:
        return "Differential Equations"
    if "set" in q or "venn" in q:
        return "Set Theory"
    return "Other"

df['mapped_topic'] = df['Problem'].apply(map_topic)

# Create quiz bank
quiz_bank = df[['mapped_topic', 'Problem', 'options', 'correct', 'Rationale']]
quiz_bank = quiz_bank.rename(columns={
    'mapped_topic': 'topic',
    'Problem': 'question',
    'options': 'options',
    'correct': 'answer',
    'Rationale': 'explanation'
})

# Save quiz bank CSV
quiz_bank.to_csv("quiz_bank.csv", index=False)

print("Saved quiz bank with shape:", quiz_bank.shape)
print(quiz_bank.head(5))


                                             Problem  \
0  the banker ' s gain of a certain sum due 3 yea...   
1  average age of students of an adult school is ...   
2  sophia finished 2 / 3 of a book . she calculat...   
3                        120 is what percent of 50 ?   
4  there are 10 girls and 20 boys in a classroom ...   

                                           Rationale  \
0  "explanation : t = 3 years r = 10 % td = ( bg ...   
1  "explanation : let the original no . of studen...   
2  let xx be the total number of pages in the boo...   
3  "50 * x = 120 - - > x = 2.4 - - > 2.4 expresse...   
4  if girls is 10 and boys is 20 , then 10 / 20 ....   

                                             options correct  \
0  a ) rs . 400 , b ) rs . 300 , c ) rs . 500 , d...       a   
1  a ) 1200 , b ) 120 , c ) 360 , d ) 240 , e ) n...       d   
2    a ) 229 , b ) 270 , c ) 877 , d ) 266 , e ) 281       b   
3  a ) 5 % , b ) 240 % , c ) 50 % , d ) 2 % , e )...       b   
4  a )

In [None]:
df = pd.read_csv("processed_sample.csv")
print(df.isna().sum())

user_id                              0
content_id                           0
part                              2303
tags                              2303
answered_correctly                   0
prior_question_elapsed_time       4434
prior_question_had_explanation     718
user_total_attempts                  0
user_accuracy_so_far                 0
user_part_accuracy_so_far         2303
dtype: int64


In [6]:
# train_model.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# ====== Paths ======
PROCESSED_SAMPLE = "processed_sample.csv"
MODEL_PATH = "model/model.joblib"
ENC_PATH = "model/encoders.joblib"

# ====== Load processed sample ======
df = pd.read_csv(PROCESSED_SAMPLE)

# Check necessary columns
required_cols = [
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
    "content_avg_correct",
    "user_avg_correct",
    "part",
    "answered_correctly"
]
# preprocessing
if 'content_avg_correct' not in df.columns:
    df['content_avg_correct'] = np.random.uniform(0.5, 0.9, size=len(df))  # simulate some topic-level avg
if 'user_avg_correct' not in df.columns:
    df['user_avg_correct'] = np.random.uniform(0.4, 0.95, size=len(df))   # simulate user performance

missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in dataset: {missing}")

# ====== Encode categorical columns ======
le_part = LabelEncoder()
df["part_le"] = le_part.fit_transform(df["part"].astype(str))

# ====== Features & Target ======
feature_cols = [
    "prior_question_elapsed_time",
    "prior_question_had_explanation",
    "content_avg_correct",
    "user_avg_correct",
    "part_le"
]
X = df[feature_cols]
y = df["answered_correctly"]

# ====== Train-Test Split ======
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ====== Train RandomForest ======
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42
)
model.fit(X_train, y_train)

# ====== Evaluate (for report) ======
acc = model.score(X_test, y_test)
print(f"✅ Random Forest trained successfully. Test accuracy: {acc:.3f}")

# ====== Save model and encoders ======
joblib.dump(model, MODEL_PATH)
joblib.dump({
    "le_part": le_part,
    "feature_cols": feature_cols
}, ENC_PATH)
print("✅ Model and encoders saved!")


✅ Random Forest trained successfully. Test accuracy: 0.688
✅ Model and encoders saved!


In [None]:
import pandas as pd
import random

df = pd.read_csv("processed_sample.csv")

# Take last record per user_id + part to get latest accuracy so far
latest_performance = df.sort_values('content_id').groupby(['user_id', 'part']).tail(1)

# Reward: topics with low accuracy are prioritized
latest_performance['bandit_reward'] = 1 - latest_performance['user_part_accuracy_so_far']

# Adjust reward for number of attempts
latest_performance['bandit_reward'] *= (latest_performance['user_total_attempts'] / latest_performance['user_total_attempts'].max())

def pick_next_topic(user_id, epsilon=0.2):
    user_data = latest_performance[latest_performance['user_id'] == user_id]
    if user_data.empty:
        return random.choice(df['part'].unique().tolist())  # no history yet

    # Exploration
    if random.random() < epsilon:
        return random.choice(user_data['part'].unique().tolist())

    # Exploitation: pick part with highest "bandit_reward" (i.e. lowest accuracy)
    best_row = user_data.sort_values('bandit_reward', ascending=False).iloc[0]
    return best_row['part']