Install the required libraries.

In [1]:
!pip install pandas
!pip install numpy
!pip install scikit-learn
!pip install optuna
!pip install -U cupy-cuda12x
!pip install catboost -U

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

Load the data, ordinally encode the categorical features, and add engineered features.

In [2]:
import numpy as np
import pandas as pd

# Load the training data from the local data directory.
# To load it from Google Drive instead, uncomment the lines below and
# comment out the local `train_file_path` assignment that follows them:
# from google.colab import drive
# drive.mount('/content/drive')
# train_file_path = "/content/drive/My Drive/MLCW1/CW1_train.csv"
train_file_path = "/data/CW1_train.csv"
df = pd.read_csv(train_file_path)


# Ordinal scales for the three categorical columns; position in each list
# becomes the integer code.
cut_order = ["Fair", "Good", "Very Good", "Premium", "Ideal"]  # Ideal is the best
color_order = ["J", "I", "H", "G", "F", "E", "D"]  # J is the worst, D is the best
clarity_order = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]  # IF is the best

# Category label -> integer rank lookups (0 for the first entry of each list).
cut_mapping = dict(zip(cut_order, range(len(cut_order))))
color_mapping = dict(zip(color_order, range(len(color_order))))
clarity_mapping = dict(zip(clarity_order, range(len(clarity_order))))

# Build the encoded frame as a new object so the raw `df` stays untouched.
# Labels missing from a mapping become NaN.
df_encoded = df.assign(
    cut=df["cut"].map(cut_mapping),
    color=df["color"].map(color_mapping),
    clarity=df["clarity"].map(clarity_mapping),
)

# Feature engineering.
# log1p transforms of carat and price (log transformations for skewed distributions).
df_encoded["carat_log"] = np.log1p(df_encoded["carat"])
df_encoded["price_log"] = np.log1p(df_encoded["price"])
# Multiplicative interaction terms between depth and table / carat.
df_encoded["depth_x_table"] = df_encoded["depth"] * df_encoded["table"]
df_encoded["depth_x_carat"] = df_encoded["depth"] * df_encoded["carat"]

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df_encoded.info()

Mounted at /content/drive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   outcome        10000 non-null  float64
 1   carat          10000 non-null  float64
 2   cut            10000 non-null  int64  
 3   color          10000 non-null  int64  
 4   clarity        10000 non-null  int64  
 5   depth          10000 non-null  float64
 6   table          10000 non-null  float64
 7   price          10000 non-null  int64  
 8   x              10000 non-null  float64
 9   y              10000 non-null  float64
 10  z              10000 non-null  float64
 11  a1             10000 non-null  float64
 12  a2             10000 non-null  float64
 13  a3             10000 non-null  float64
 14  a4             10000 non-null  float64
 15  a5             10000 non-null  float64
 16  b1             10000 non-null  float64
 17  b2             10000 non-

Train the final model.

In [20]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Columns a1..a5 followed by b1..b5.
a_b_features = [f"{prefix}{idx}" for prefix in ("a", "b") for idx in range(1, 6)]

# Full model input: raw numeric columns, engineered columns, the a/b columns,
# then the ordinally encoded categoricals. Order matters for later alignment.
selected_features = [
    "carat", "depth", "table", "x", "y",
    "depth_x_table", "depth_x_carat", "carat_log", "price_log",
    *a_b_features,
    "cut", "color", "clarity",
]

# Design matrix and target (requires df_encoded from the preprocessing cell).
X = df_encoded[selected_features]
y = df_encoded["outcome"]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameters for the final CatBoost regressor.
best_catboost_params = dict(
    iterations=4284,
    depth=2,  # reduced depth from 3 to 2 to reduce overfitting and it worked.
    learning_rate=0.017181970020733888,
    l2_leaf_reg=1.23,
    border_count=173,
    random_strength=1.5232946917610097,
    task_type="GPU",  # train on the GPU
    loss_function="RMSE",
    random_seed=42,  # fixed seed for repeatable training
    # NOTE(review): fit() below passes verbose=0 while this sets verbose=100;
    # confirm which logging level is intended.
    verbose=100,
    # Keep the iteration that scores best on the eval_set given to fit().
    # NOTE(review): that eval_set is the same hold-out split later reported as
    # the validation score, so that score is likely optimistic.
    use_best_model=True,
)

# Fit on the training split, monitoring the hold-out split as eval_set.
model = CatBoostRegressor(**best_catboost_params)
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    verbose=0,
)

# Predict on both splits with the fitted model.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# Training-split metrics.
mse_train = mean_squared_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

# Hold-out-split metrics.
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Report R² first, then MSE, each for training and hold-out.
report_lines = (
    f"✅ Training R² Score: {r2_train:.5f}",
    f"✅ Validation (Test) R² Score: {r2_test:.5f}",
    f"✅ Training MSE: {mse_train:.5f}",
    f"✅ Validation (Test) MSE: {mse_test:.5f}",
)
print("\n".join(report_lines))


✅ Training R² Score: 0.51590
✅ Validation (Test) R² Score: 0.46709
✅ Training MSE: 78.20365
✅ Validation (Test) MSE: 86.86698


Make predictions on the test set.

In [21]:
import pandas as pd
import numpy as np

# Load the test data from the local data directory.
# To load it from Google Drive instead, uncomment the lines below and
# comment out the local `test_file_path` assignment that follows them:
# from google.colab import drive
# drive.mount('/content/drive')
# test_file_path = "/content/drive/My Drive/MLCW1/CW1_test.csv"
# Must be the test file: pointing this at CW1_train.csv would produce a
# submission built from the training rows.
test_file_path = "/data/CW1_test.csv"

# NOTE: this rebinds X_test, replacing the hold-out split created in the
# training cell; re-run that cell before recomputing validation metrics.
X_test = pd.read_csv(test_file_path)

# Apply the same ordinal encoding used for the training data; the mappings
# come from the preprocessing cell. Labels missing from a mapping become NaN.
X_test["cut"] = X_test["cut"].map(cut_mapping)
X_test["color"] = X_test["color"].map(color_mapping)
X_test["clarity"] = X_test["clarity"].map(clarity_mapping)

# Recreate the engineered features exactly as they were built for training.
X_test["carat_log"] = np.log1p(X_test["carat"])
X_test["price_log"] = np.log1p(X_test["price"])
X_test["depth_x_table"] = X_test["depth"] * X_test["table"]
X_test["depth_x_carat"] = X_test["depth"] * X_test["carat"]

# Keep only the training features, in training order; a missing column raises
# KeyError here rather than silently mis-aligning the model input.
X_test = X_test[selected_features]

# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
X_test.info()

# Predict test outcomes with the model fitted in the training cell.
predictions = model.predict(X_test)

# Output file for the submission, written to the current working directory.
submission_filename = "CW1_submission_k21172604.csv"

# Single-column frame named 'yhat' holding the predictions, saved without the
# index column.
submission = pd.DataFrame({'yhat': predictions})
submission.to_csv(submission_filename, index=False)

print(f"✅ Submission file saved as: {submission_filename}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          1000 non-null   float64
 1   depth          1000 non-null   float64
 2   table          1000 non-null   float64
 3   x              1000 non-null   float64
 4   y              1000 non-null   float64
 5   depth_x_table  1000 non-null   float64
 6   depth_x_carat  1000 non-null   float64
 7   carat_log      1000 non-null   float64
 8   price_log      1000 non-null   float64
 9   a1             1000 non-null   float64
 10  a2             1000 non-null   float64
 11  a3             1000 non-null   float64
 12  a4             1000 non-null   float64
 13  a5             1000 non-null   float64
 14  b1             1000 non-null   float64
 15  b2          

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>