Install the required libraries.

In [3]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install optuna
!pip install -U cupy-cuda12x
!pip install catboost -U



Load the data, ordinally encode the categorical features, and add engineered features.

In [4]:
import numpy as np
import pandas as pd

# To load the data from Google Drive instead, uncomment the lines below:
# from google.colab import drive
# drive.mount('/content/drive')
# train_file_path = "/content/drive/My Drive/MLCW1/CW1_train.csv"
train_file_path = "/data/CW1_train.csv"
df = pd.read_csv(train_file_path)

# Ordinal encoding for each categorical feature; every list is ordered from
# worst to best so that a larger integer code means a better grade.
cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]  # Ideal is the best
color_order = ["J", "I", "H", "G", "F", "E", "D"]  # D is the best, J is the worst
clarity_order = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]  # IF is the best

# label -> integer rank lookups (also reused when encoding the test set)
cut_mapping = {cut: i for i, cut in enumerate(cut_order)}
color_mapping = {color: i for i, color in enumerate(color_order)}
clarity_mapping = {clarity: i for i, clarity in enumerate(clarity_order)}

# Encode on a copy so the raw frame `df` stays untouched.
df_encoded = df.copy()
for column, mapping in (("cut", cut_mapping),
                        ("color", color_mapping),
                        ("clarity", clarity_mapping)):
    # `.map()` turns any label missing from the mapping into NaN without
    # warning; fail loudly instead so a typo or new grade cannot slip through.
    unknown_labels = set(df_encoded[column].dropna().unique()) - set(mapping)
    if unknown_labels:
        raise ValueError(
            f"Unexpected values in column '{column}': {sorted(unknown_labels)}"
        )
    df_encoded[column] = df_encoded[column].map(mapping)

# Feature engineering:
# log transformations for skewed distributions.
df_encoded["carat_log"] = np.log1p(df_encoded["carat"])
df_encoded["price_log"] = np.log1p(df_encoded["price"])
# pairwise interaction terms
df_encoded["depth_x_table"] = df_encoded["depth"] * df_encoded["table"]
df_encoded["depth_x_carat"] = df_encoded["depth"] * df_encoded["carat"]

# `info()` prints its report itself and returns None, so it must not be
# wrapped in `print()` (that would append a stray "None" to the output).
df_encoded.info()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   outcome        10000 non-null  float64
 1   carat          10000 non-null  float64
 2   cut            10000 non-null  int64  
 3   color          10000 non-null  int64  
 4   clarity        10000 non-null  int64  
 5   depth          10000 non-null  float64
 6   table          10000 non-null  float64
 7   price          10000 non-null  int64  
 8   x              10000 non-null  float64
 9   y              10000 non-null  float64
 10  z              10000 non-null  float64
 11  a1             10000 non-null  float64
 12  a2             10000 non-null  float64
 13  a3             10000 non-null  float64
 14  a4             10000 non-null  float64
 15  a5             10000 non-null  float64
 16  b1             10000 non-null  float64
 17  b2             10000 non-

Cross-validate the tuned CatBoost model (the fitted `model` is reused for prediction below).

In [5]:
import pandas as pd
import numpy as np
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score

# Keep `a1-a5, b1-b5` (since they rank high in the random forest analysis and correlation matrix)
a_b_features = [f"a{i}" for i in range(1, 6)] + [f"b{i}" for i in range(1, 6)]

# Final feature set: raw numerical + interaction terms + log transforms
# + `a1-b5` + ordinally encoded categoricals
selected_features = ["carat", "depth", "table", "x", "y",
                     "depth_x_table", "depth_x_carat", "carat_log", "price_log"] + a_b_features + ["cut", "color", "clarity"]

X = df_encoded[selected_features]
y = df_encoded["outcome"]

# Train-Test Split
# NOTE(review): this hold-out split is not used by the cross-validation or the
# final fit below; it is kept only so the names remain available.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameters of the selected Optuna trial.
trial_18_params = {
    "iterations": 4284,
    "depth": 3,
    "learning_rate": 0.017181970020733888,
    "l2_leaf_reg": 1.1871493461339557,
    "border_count": 173,
    "random_strength": 1.5232946917610097,
    "task_type": "GPU",  # Use GPU acceleration
    "loss_function": "RMSE",
    "random_seed": 42,  # Fix randomness
    "verbose": 100,
    "use_best_model": True  # Use best iteration from training
}

# Use Cross-Validation (Same as Optuna)
cv = KFold(n_splits=17, shuffle=True, random_state=42)
r2_scores = []
best_iterations = []

for train_idx, test_idx in cv.split(X):
    X_train_cv, X_test_cv = X.iloc[train_idx], X.iloc[test_idx]
    y_train_cv, y_test_cv = y.iloc[train_idx], y.iloc[test_idx]

    # Train one model per fold.
    # NOTE(review): the validation fold doubles as the `eval_set` that
    # `use_best_model` selects the best iteration on, so the R² reported below
    # is somewhat optimistic.
    fold_model = CatBoostRegressor(**trial_18_params)
    fold_model.fit(X_train_cv, y_train_cv, eval_set=(X_test_cv, y_test_cv), verbose=0)

    # Evaluate
    y_pred_cv = fold_model.predict(X_test_cv)
    r2_scores.append(r2_score(y_test_cv, y_pred_cv))
    best_iterations.append(fold_model.get_best_iteration())

# Compute Final Cross-Validation R²
final_cv_r2 = np.mean(r2_scores)
print(f"✅ Cross-Validated Model R² Score: {final_cv_r2:.5f} ")

# Fit the model that is used for prediction on ALL training rows. Previously
# `model` was simply whatever the last CV fold left behind, i.e. a model that
# never saw 1/17 of the data.
# `use_best_model` needs an eval_set, which a full-data fit does not have, so
# it is switched off and the tree count is fixed to the median best iteration
# observed across the folds (+1 because iteration indices are zero-based).
observed_iterations = [it for it in best_iterations if it is not None]
final_iterations = (
    int(np.median(observed_iterations)) + 1
    if observed_iterations
    else trial_18_params["iterations"]
)
final_params = {**trial_18_params, "use_best_model": False, "iterations": final_iterations}
model = CatBoostRegressor(**final_params)
model.fit(X, y, verbose=0)

✅ Cross-Validated Model R² Score: 0.48188 


Make predictions on the test set.

In [9]:
import pandas as pd
import numpy as np

# Load test set
# To load the test data from Google Drive instead, uncomment the lines below:
# from google.colab import drive
# drive.mount('/content/drive')
# test_file_path = "/content/drive/My Drive/MLCW1/CW1_test.csv"
#
# This must point at the TEST file; it previously pointed at CW1_train.csv,
# which would have produced predictions for the training rows.
test_file_path = "/data/CW1_test.csv"

X_test = pd.read_csv(test_file_path)

# Apply the same categorical encoding as for the training data.
for column, mapping in (("cut", cut_mapping),
                        ("color", color_mapping),
                        ("clarity", clarity_mapping)):
    # `.map()` silently yields NaN for labels missing from the mapping; raise
    # instead so the model is never fed an unintended missing value.
    unknown_labels = set(X_test[column].dropna().unique()) - set(mapping)
    if unknown_labels:
        raise ValueError(
            f"Unexpected values in column '{column}': {sorted(unknown_labels)}"
        )
    X_test[column] = X_test[column].map(mapping)

# Apply the same feature engineering as for the training data.
X_test["carat_log"] = np.log1p(X_test["carat"])
X_test["price_log"] = np.log1p(X_test["price"])  # `price` must be present: price_log is a selected feature
X_test["depth_x_table"] = X_test["depth"] * X_test["table"]
X_test["depth_x_carat"] = X_test["depth"] * X_test["carat"]

# Keep exactly the training features, in the training column order.
X_test = X_test[selected_features]
# `info()` prints its report itself and returns None, so it is not wrapped in print().
X_test.info()

# Predict test outcomes
predictions = model.predict(X_test)

# Save predictions in required format (single `yhat` column, no index)
submission = pd.DataFrame({'yhat': predictions})
submission_filename = "/content/CW1_submission_k21172604.csv"  # file name carries the student's K-number
submission.to_csv(submission_filename, index=False)

# Download submission file (only possible when running inside Google Colab)
try:
    from google.colab import files
except ImportError:
    print(f"Not running in Colab; submission saved to {submission_filename}")
else:
    files.download(submission_filename)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          1000 non-null   float64
 1   depth          1000 non-null   float64
 2   table          1000 non-null   float64
 3   x              1000 non-null   float64
 4   y              1000 non-null   float64
 5   depth_x_table  1000 non-null   float64
 6   depth_x_carat  1000 non-null   float64
 7   carat_log      1000 non-null   float64
 8   price_log      1000 non-null   float64
 9   a1             1000 non-null   float64
 10  a2             1000 non-null   float64
 11  a3             1000 non-null   float64
 12  a4             1000 non-null   float64
 13  a5             1000 non-null   float64
 14  b1             1000 non-null   float64
 15  b2          

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>