In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

## Baseline Model

In [6]:
# Load data and split features/target
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

TARGET = train.columns[-1]

X = train.drop(columns=[TARGET])
y = train[TARGET]

In [7]:
# Identify categorical vs numerical columns
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=np.number).columns

cat_cols, num_cols

(Index(['gender', 'course', 'internet_access', 'sleep_quality', 'study_method',
        'facility_rating', 'exam_difficulty'],
       dtype='object'),
 Index(['id', 'age', 'study_hours', 'class_attendance', 'sleep_hours'], dtype='object'))

In [8]:
# Build preprocessing + model pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

model = LinearRegression()

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ]
)

In [11]:
# Train / validation split + evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)

val_preds = pipeline.predict(X_val)
rmse = mean_squared_error(y_val, val_preds)

rmse


78.96969218600805

In [12]:
# Train on full data & predict test set
pipeline.fit(X, y)

test_preds = pipeline.predict(test)
submission = pd.DataFrame({
    "id": test["id"],
    TARGET: test_preds
})
submission.head()

Unnamed: 0,id,exam_score
0,630000,71.773053
1,630001,69.472746
2,630002,87.428876
3,630003,54.918094
4,630004,47.299713


In [13]:
submission.to_csv('../submissions/submission_v1.csv', index=False)