In [6]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
sample_sub = pd.read_csv("sample_submission.csv")

train.head()

Unnamed: 0,id,Sex,Length,Diameter,Height,Whole weight,Whole weight.1,Whole weight.2,Shell weight,Rings
0,0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11
1,1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11
2,2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6
3,3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10
4,4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9


In [4]:
# Split features and target
X = train.drop(columns=["Rings", "id"])
y = train["Rings"]

# Column types
cat_cols = ["Sex"]
num_cols = [c for c in X.columns if c not in cat_cols]


In [9]:

# Numeric preprocessing
num_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

# Categorical preprocessing
cat_pipe = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore"))
])

# Combine preprocessing
preprocess = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols)
])

preprocess


ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='median')),
                                                 ('scale', StandardScaler())]),
                                 ['Length', 'Diameter', 'Height',
                                  'Whole weight', 'Whole weight.1',
                                  'Whole weight.2', 'Shell weight']),
                                ('cat',
                                 Pipeline(steps=[('impute',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encode',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Sex'])])

In [10]:
from sklearn.linear_model import Ridge

# Ridge regression model
ridge_model = Pipeline([
    ("prep", preprocess),
    ("model", Ridge(alpha=10))
])

ridge_model

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['Length', 'Diameter',
                                                   'Height', 'Whole weight',
                                                   'Whole weight.1',
                                                   'Whole weight.2',
                                                   'Shell weight']),
                                                 ('cat',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(str

In [12]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_log_error, make_scorer

# Log-transform target for RMSLE
y_log = np.log1p(y)

# RMSLE scorer (expects log predictions)
def rmsle(y_true, y_pred_log):
    y_pred = np.expm1(y_pred_log)
    y_pred = np.clip(y_pred, 0, None)  # prevent negatives
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

# Cross-validation
cv = KFold(n_splits=5, shuffle=True, random_state=42)

ridge_scores = cross_val_score(
    ridge_model,
    X,
    y,
    cv=cv,
    scoring=rmsle_scorer
)

ridge_scores


array([-7.70289174, -7.68794395, -7.70463585, -7.71083367, -7.70806541])

In [14]:
# Convert negative CV scores to RMSLE
ridge_rmsle = -ridge_scores
ridge_rmsle

array([7.70289174, 7.68794395, 7.70463585, 7.71083367, 7.70806541])

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting regression model
gbr_model = Pipeline([
    ("prep", preprocess),
    ("model", GradientBoostingRegressor(
        learning_rate=0.05,
        max_depth=3,
        n_estimators=300,
        random_state=42
    ))
])

gbr_model

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['Length', 'Diameter',
                                                   'Height', 'Whole weight',
                                                   'Whole weight.1',
                                                   'Whole weight.2',
                                                   'Shell weight']),
                                                 ('cat',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(str

In [19]:
gbr_scores = cross_val_score(
    gbr_model,
    X,
    y,
    cv=cv,
    scoring=rmsle_scorer
)

# Convert to positive RMSLE
gbr_rmsle = -gbr_scores

gbr_rmsle.mean()


7.713339230209383

In [20]:
# Fit Ridge model on full training data
ridge_model.fit(X, y_log)


Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['Length', 'Diameter',
                                                   'Height', 'Whole weight',
                                                   'Whole weight.1',
                                                   'Whole weight.2',
                                                   'Shell weight']),
                                                 ('cat',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(str

In [22]:
# Predict on test data (exclude id)
ridge_pred_log = ridge_model.predict(test.drop(columns=["id"]))

# Convert predictions back from log scale
ridge_pred = np.expm1(ridge_pred_log)

# Ensure no negative values (required for Kaggle / RMSLE)
ridge_pred = np.clip(ridge_pred, 0, None)

ridge_pred 

array([ 8.84413962,  9.97650716, 10.05361266, ..., 10.333411  ,
       13.41596353,  8.37162679])

In [23]:
submission = pd.DataFrame({
    "id": test["id"],
    "Rings": ridge_pred
})

submission.to_csv("submission.csv", index=False)


In [26]:
# Fit Gradient Boosting model on full training data
gbr_model.fit(X, y_log)


Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  ['Length', 'Diameter',
                                                   'Height', 'Whole weight',
                                                   'Whole weight.1',
                                                   'Whole weight.2',
                                                   'Shell weight']),
                                                 ('cat',
                                                  Pipeline(steps=[('impute',
                                                                   SimpleImputer(str

In [27]:
# Predict using Gradient Boosting model
gbr_pred_log = gbr_model.predict(test.drop(columns=["id"]))

# Convert back from log scale
gbr_pred = np.expm1(gbr_pred_log)

# Ensure non-negative values
gbr_pred = np.clip(gbr_pred, 0, None)


In [28]:
submission_gbr = pd.DataFrame({
    "id": test["id"],
    "Rings": gbr_pred
})

submission_gbr.to_csv("submission_gbr.csv", index=False)
