<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 5px; background-color: #1e3d59  ; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #f1c40f ; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 36px;"> Importing The Libraries </h1>
</div>

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import re
import time
from optuna.samplers import TPESampler
from catboost import CatBoostRegressor
from sklearn.base import clone
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from IPython.display import display, HTML
import warnings

# Suppress all warnings for the rest of the session. This keeps cell output
# compact, but it also hides deprecation and other library warnings.
warnings.filterwarnings("ignore")

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 5px; background-color: #1e3d59  ; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #f1c40f ; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 36px;"> Loading The Dataset</h1>
</div>

In [81]:
%%time

# Competition files (sample submission, train, test) followed by the original
# used-car dataset that is appended to the training data further down.
df_sbb = pd.read_csv("/kaggle/input/playground-series-s4e9/sample_submission.csv")
df_tr = pd.read_csv("/kaggle/input/playground-series-s4e9/train.csv")
df_ts = pd.read_csv("/kaggle/input/playground-series-s4e9/test.csv")
df_o = pd.read_csv("/kaggle/input/used-car-price-prediction-dataset/used_cars.csv")


CPU times: user 987 ms, sys: 15 ms, total: 1 s
Wall time: 1.01 s


In [82]:
# (rows, columns) of the sample submission; the submission cell asserts that
# the number of predictions equals this row count.
df_sbb.shape

(125690, 2)

In [83]:
# Column dtypes and non-null counts of the competition training data.
df_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [84]:
# In the original dataset 'milage' and 'price' are parsed with string methods:
# every non-digit character is removed and the remainder is cast to int.
# Columns that are already numeric are skipped so the cell can be re-run
# safely (the .str accessor raises on an int column).
for numeric_text_col in ('milage', 'price'):
    if not pd.api.types.is_numeric_dtype(df_o[numeric_text_col]):
        df_o[numeric_text_col] = (
            df_o[numeric_text_col]
            .str.replace(r'\D', '', regex=True)
            .astype(int)
        )


In [85]:
# Remove the row identifier from both frames so it is not used as a feature.
# Reassigning with errors='ignore' (instead of an in-place drop) keeps the
# cell re-runnable: a second execution no longer raises KeyError.
df_tr = df_tr.drop(columns=['id'], errors='ignore')
df_ts = df_ts.drop(columns=['id'], errors='ignore')

In [86]:
# Append the rows of the original dataset (df_o) to the competition training
# data; ignore_index=True renumbers the combined rows from 0.
# NOTE: the next cell rebinds df_tr to this result, so re-running this cell
# afterwards appends df_o a second time.
df_tr1 = pd.concat([df_tr, df_o], ignore_index=True)

In [87]:
# From here on df_tr refers to the combined (competition + original) frame.
df_tr = df_tr1

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 5px; background-color: #1e3d59  ; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #f1c40f ; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 36px;"> A Quick Overview Of The Data</h1>
</div>

In [88]:
# Reusable HTML building blocks for the notebook's branded report theme
def branded_heading(text, icon="🌟"):
    """Build the HTML snippet for a centered section banner.

    Args:
        text: Heading text; inserted into the markup as-is (no HTML escaping).
        icon: Text placed in front of the heading (an emoji by default).

    Returns:
        str: A ``<div>`` with inline CSS (gold text on a dark gradient),
        suitable for wrapping in ``IPython.display.HTML``.
    """
    banner_html = f"""
    <div style="
        text-align: center;
        background: linear-gradient(90deg, #0d1f33, #394867);  /* Darker gradient for more premium feel */
        font-family: 'Poppins', sans-serif;  /* Professional and modern font */
        color: #f1c40f;  /* Luxurious gold for the text */
        padding: 8px;  /* Reduced padding for compactness */
        font-size: 22px;  /* Slightly smaller font for modern design */
        font-weight: 600;  /* Medium weight for boldness */
        border-radius: 8px;  /* Softer corners */
        margin-bottom: 18px;
        box-shadow: 0px 5px 12px rgba(0, 0, 0, 0.2);  /* Softer shadow for premium feel */
    ">
        {icon} {text}
    </div>
    """
    return banner_html

# Function to apply branded theme with soft gradient and gold borders
def apply_branded_theme(df):
    styled_df = df.style.set_table_styles([
        {"selector": "th", "props": [("color", "white"), ("background-color", "#1e3d59"), ("border", "1px solid #f1c40f")]},  # Dark blue headers with gold borders
        {"selector": "td", "props": [("border", "1px solid #f1c40f"), ("padding", "6px"), ("background-color", "#f4f7f9"), ("color", "#34495e")]},  # Softer color scheme for cells
    ]).set_properties(**{
        "background-color": "#f4f7f9",  # Light grayish-blue background for readability
        "color": "#2c3e50",  # Deep gray text color for modern look
        "text-align": "center",
        "border": "1px solid #f1c40f"  # Luxurious gold border for elegance
    }).hide(axis="index")  # Hide index for clean design
    return styled_df.to_html()

# Main function for branded dataset analysis
def branded_dataset_analysis(train_dataset, n_top=5):
    """Display a themed overview of ``train_dataset`` in the notebook.

    Renders, in order: the first ``n_top`` rows, the ``describe()`` summary,
    the columns containing nulls with their counts, the number of duplicated
    rows, and the dataset dimensions.

    Args:
        train_dataset: DataFrame to summarise.
        n_top: Number of leading rows to show.

    Returns:
        None. Everything is emitted through ``display``.
    """
    # Top rows display with branded theme
    display(HTML(branded_heading(f"Top {n_top} Rows of Training Dataset", icon="📊")))
    display(HTML(apply_branded_theme(train_dataset.head(n_top))))

    # Summary of dataset. describe() keeps the statistic names (count, mean,
    # std, ...) in its index, and the theme hides the index, so move them into
    # a regular column; otherwise the rows of the summary table are unlabeled.
    summary = train_dataset.describe().rename_axis("Statistic").reset_index()
    display(HTML(branded_heading("Summary of Dataset", icon="📈")))
    display(HTML(apply_branded_theme(summary)))

    # Null values display with column names and counts
    null_counts = train_dataset.isnull().sum()
    null_counts = null_counts[null_counts > 0]  # keep only columns with nulls

    display(HTML(branded_heading("Null Values in Dataset", icon="❌")))
    if null_counts.empty:
        display(HTML("<p style='text-align:center; color: #27ae60;'>No null values in the training dataset.</p>"))
    else:
        # One row per affected column
        null_df = pd.DataFrame({
            'Column Name': null_counts.index.tolist(),
            'Null Value Count': null_counts.values
        })
        display(HTML(apply_branded_theme(null_df)))

    # Duplicate values
    train_duplicates = train_dataset.duplicated().sum()
    display(HTML(branded_heading("Duplicate Values in Dataset", icon="♻️")))
    display(HTML(f"<p style='text-align:center;'> {train_duplicates} duplicate rows found.</p>"))

    # Rows and columns count
    n_rows, n_cols = train_dataset.shape[0], train_dataset.shape[1]
    display(HTML(branded_heading("Dataset Dimensions", icon="📏")))
    display(HTML(f"<p style='text-align:center;'>Rows: {n_rows}, Columns: {n_cols}</p>"))

def _is_consecutive_integers(values):
    """Return True when the distinct, non-null ``values`` are exactly min..max in steps of 1."""
    arr = np.sort(np.asarray(values, dtype=float))
    if arr.size == 0:
        return False
    # Every value must be a whole number ...
    if not np.all(arr == np.floor(arr)):
        return False
    # ... and, being distinct, they fill the range iff the span equals the count.
    return bool(arr[-1] - arr[0] + 1 == arr.size)


# Function for branded unique values analysis
def branded_unique_values(train_dataset):
    """Display per-column unique-value information for ``train_dataset``.

    First a themed table with one row per column: its dtype and either the
    list of unique values (20 or fewer) or just their count. Then, for each
    numeric column with more than 20 unique values, a line stating whether
    those values form an unbroken run of consecutive integers (reported as
    "continuous" in the output).

    Args:
        train_dataset: DataFrame to inspect.

    Returns:
        None. Everything is emitted through ``display``.
    """
    display(HTML(branded_heading("Unique Values in Training Dataset", icon="🔢")))

    unique_values_details = []
    for col in train_dataset.columns:
        unique_vals = train_dataset[col].nunique()
        if unique_vals > 20:
            shown = f"More than 20 unique values ({unique_vals} total)"
        else:
            shown = ', '.join(map(str, train_dataset[col].unique()))
        unique_values_details.append({
            'Column Name': col,
            'Data Type': train_dataset[col].dtype,
            'Unique Values': shown
        })

    unique_values_table = pd.DataFrame(unique_values_details)
    display(HTML(apply_branded_theme(unique_values_table)))

    # Continuous numeric values check
    display(HTML(branded_heading("Continuous Numeric Values Check", icon="📈")))

    for col in train_dataset.select_dtypes(include='number').columns:
        unique_vals = train_dataset[col].nunique()
        if unique_vals > 20:
            # Numeric check instead of comparing against range(min, max + 1):
            # range() raises TypeError for float columns.
            if _is_consecutive_integers(train_dataset[col].dropna().unique()):
                verdict = "which are continuous"
            else:
                verdict = "but they are not continuous"
            display(HTML(f"<p style='text-align:center;'><strong>{col}:</strong> The column has {unique_vals} unique numeric values {verdict}.</p>"))


In [89]:
# Overview of the combined training frame: head, describe(), nulls, duplicates, shape
branded_dataset_analysis(df_tr, n_top=5)  # Branded design for analysis
# Unique values per column, plus the consecutive-integer check on numeric columns
branded_unique_values(df_tr)  # Branded design for unique values

brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capability,A/T,Blue,Gray,None reported,Yes,13900
Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


model_year,milage,price
192542.0,192542.0,192542.0
2015.823452,65684.728927,43892.07427
5.670724,49851.51298,78817.111436
1974.0,100.0,2000.0
2013.0,24115.0,17000.0
2017.0,57550.0,30825.0
2020.0,95400.0,49900.0
2024.0,405000.0,2954083.0


Column Name,Null Value Count
fuel_type,5253
accident,2565
clean_title,22015


Column Name,Data Type,Unique Values
brand,object,More than 20 unique values (57 total)
model,object,More than 20 unique values (1898 total)
model_year,int64,More than 20 unique values (34 total)
milage,int64,More than 20 unique values (6652 total)
fuel_type,object,"Gasoline, E85 Flex Fuel, nan, Hybrid, Diesel, Plug-In Hybrid, –, not supported"
engine,object,More than 20 unique values (1146 total)
transmission,object,More than 20 unique values (62 total)
ext_col,object,More than 20 unique values (319 total)
int_col,object,More than 20 unique values (156 total)
accident,object,"None reported, At least 1 accident or damage reported, nan"


<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 5px; background-color: #1e3d59  ; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #f1c40f ; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 36px;"> Basic Feature Engineering + Handling Missing Values </h1>
</div>

In [90]:
# Handling the missing values and creating more columns
df_tr['fuel_type'].unique()

array(['Gasoline', 'E85 Flex Fuel', nan, 'Hybrid', 'Diesel',
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

In [91]:
# Distinct fuel_type values in the test data (including NaN), for comparison with train.
df_ts['fuel_type'].unique()

array(['Gasoline', 'Hybrid', 'Diesel', 'E85 Flex Fuel', nan,
       'Plug-In Hybrid', '–', 'not supported'], dtype=object)

In [92]:
# One-hot encode fuel_type on the test frame.
# The guard keeps the cell re-runnable: after the first run the 'fuel_type'
# column has been dropped and there is nothing left to encode.
if 'fuel_type' in df_ts.columns:
    # Standardize np.nan and '–' to 'unknown'; rename 'Plug-In Hybrid' to 'Plug_In Hybrid'
    df_ts['fuel_type'] = df_ts['fuel_type'].replace({np.nan: 'unknown', '–': 'unknown','Plug-In Hybrid':'Plug_In Hybrid'})

    # Unique values from the 'fuel_type' column, in order of first appearance
    unique_fuel_types = df_ts['fuel_type'].unique()

    # One 0/1 indicator column per fuel type, built with a vectorized
    # comparison rather than a row-wise apply(lambda ...)
    for fuel in unique_fuel_types:
        df_ts[fuel] = (df_ts['fuel_type'] == fuel).astype(int)

    # Drop the original 'fuel_type' column
    df_ts = df_ts.drop(columns=['fuel_type'])

print(df_ts.head())

  brand                 model  model_year  milage  \
0  Land        Rover LR2 Base        2015   98000   
1  Land     Rover Defender SE        2020    9142   
2  Ford    Expedition Limited        2022   28121   
3  Audi         A6 2.0T Sport        2016   61258   
4  Audi  A6 2.0T Premium Plus        2018   59000   

                                              engine        transmission  \
0       240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel         6-Speed A/T   
1  395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...         8-Speed A/T   
2                    3.5L V6 24V PDI DOHC Twin Turbo  10-Speed Automatic   
3                                     2.0 Liter TFSI           Automatic   
4       252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel                 A/T   

           ext_col int_col       accident clean_title  Gasoline  Hybrid  \
0            White   Beige  None reported         Yes         1       0   
1           Silver   Black  None reported         Yes         0       1   

In [93]:
# One-hot encode fuel_type on the training frame.
# The guard keeps the cell re-runnable: after the first run the 'fuel_type'
# column has been dropped and there is nothing left to encode.
if 'fuel_type' in df_tr.columns:
    # Standardize np.nan and '–' to 'unknown'; rename 'Plug-In Hybrid' to 'Plug_In Hybrid'
    df_tr['fuel_type'] = df_tr['fuel_type'].replace({np.nan: 'unknown', '–': 'unknown','Plug-In Hybrid':'Plug_In Hybrid'})

    # Unique values from the 'fuel_type' column, in order of first appearance
    unique_fuel_types = df_tr['fuel_type'].unique()

    # One 0/1 indicator column per fuel type, built with a vectorized
    # comparison rather than a row-wise apply(lambda ...)
    for fuel in unique_fuel_types:
        df_tr[fuel] = (df_tr['fuel_type'] == fuel).astype(int)

    # Drop the original 'fuel_type' column
    df_tr = df_tr.drop(columns=['fuel_type'])

print(df_tr.head())

           brand              model  model_year  milage  \
0           MINI      Cooper S Base        2007  213000   
1        Lincoln              LS V8        2002  143250   
2      Chevrolet  Silverado 2500 LT        2002  136731   
3        Genesis   G90 5.0 Ultimate        2017   19500   
4  Mercedes-Benz        Metris Base        2021    7388   

                                              engine  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col  \
0                             A/T  Yellow    Gray   
1                             A/T  Silver   Beige   
2                             A/T    Blue    Gray   
3  Transmission w/Dual Shift Mode   Black   Black   
4                     7-Speed A/T   Black  

In [94]:
# Remaining null counts per column after the fuel_type encoding.
df_tr.isnull().sum()

brand                 0
model                 0
model_year            0
milage                0
engine                0
transmission          0
ext_col               0
int_col               0
accident           2565
clean_title       22015
price                 0
Gasoline              0
E85 Flex Fuel         0
unknown               0
Hybrid                0
Diesel                0
Plug_In Hybrid        0
not supported         0
dtype: int64

In [95]:
# A missing clean_title is filled with the literal 'no' in both frames.
for frame in (df_tr, df_ts):
    frame['clean_title'] = frame['clean_title'].fillna('no')

In [96]:
# A missing accident entry is filled with 'None reported' in both frames.
for frame in (df_tr, df_ts):
    frame['accident'] = frame['accident'].fillna('None reported')

In [97]:
# Split a raw transmission description into (speeds, type code, control class)
def extract_transmission_details(transmission):
    """Parse one transmission description into three features.

    Args:
        transmission: Description string such as ``'8-Speed A/T'``, or a
            missing value.

    Returns:
        pd.Series of length 3:
            0. number of speeds taken from ``'<digits>-Speed'`` (int), else NaN
            1. first of ``A/T``, ``M/T``, ``DCT``, ``CVT`` found, else NaN
            2. ``'Automatic'`` if the text contains ``A/T``, ``CVT`` or ``DCT``;
               ``'Electronically Controlled'`` if it contains ``E/C``; else NaN
    """
    missing = np.nan
    if pd.isna(transmission):
        return pd.Series([missing] * 3)

    speed_hit = re.search(r'(\d+)-Speed', transmission)
    type_hit = re.search(r'(A/T|M/T|DCT|CVT)', transmission)

    speeds = missing if speed_hit is None else int(speed_hit.group(1))
    trans_type = missing if type_hit is None else type_hit.group(1)

    # The automatic markers take precedence over the E/C marker.
    if any(marker in transmission for marker in ('A/T', 'CVT', 'DCT')):
        control_type = 'Automatic'
    elif 'E/C' in transmission:
        control_type = 'Electronically Controlled'
    else:
        control_type = missing

    return pd.Series([speeds, trans_type, control_type])

# Parse every test-set transmission description into the three derived features
ts_transmission_details = df_ts['transmission'].apply(extract_transmission_details)
df_ts[['speeds', 'transmission_type', 'control_type']] = ts_transmission_details

# The raw description is no longer needed once the features exist
df_ts = df_ts.drop('transmission', axis=1)

print(df_ts.head())

  brand                 model  model_year  milage  \
0  Land        Rover LR2 Base        2015   98000   
1  Land     Rover Defender SE        2020    9142   
2  Ford    Expedition Limited        2022   28121   
3  Audi         A6 2.0T Sport        2016   61258   
4  Audi  A6 2.0T Premium Plus        2018   59000   

                                              engine          ext_col int_col  \
0       240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel            White   Beige   
1  395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...           Silver   Black   
2                    3.5L V6 24V PDI DOHC Twin Turbo            White   Ebony   
3                                     2.0 Liter TFSI  Silician Yellow   Black   
4       252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel             Gray   Black   

        accident clean_title  Gasoline  Hybrid  Diesel  E85 Flex Fuel  \
0  None reported         Yes         1       0       0              0   
1  None reported         Yes         0       1  

In [98]:
# Parse every training transmission description into the three derived features
tr_transmission_details = df_tr['transmission'].apply(extract_transmission_details)
df_tr[['speeds', 'transmission_type', 'control_type']] = tr_transmission_details

# The raw description is no longer needed once the features exist
df_tr = df_tr.drop('transmission', axis=1)

print(df_tr.head())

           brand              model  model_year  milage  \
0           MINI      Cooper S Base        2007  213000   
1        Lincoln              LS V8        2002  143250   
2      Chevrolet  Silverado 2500 LT        2002  136731   
3        Genesis   G90 5.0 Ultimate        2017   19500   
4  Mercedes-Benz        Metris Base        2021    7388   

                                              engine ext_col int_col  \
0       172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel  Yellow    Gray   
1       252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel  Silver   Beige   
2  320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...    Blue    Gray   
3       420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel   Black   Black   
4       208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel   Black   Beige   

                                 accident clean_title  price  Gasoline  \
0                           None reported         Yes   4200         1   
1  At least 1 accident or damage reported         Yes   4999    

In [50]:
# Mark transmission features that could not be parsed with the placeholder 'un'
# (this also turns 'speeds' into a column that mixes numbers and strings).
for frame in (df_tr, df_ts):
    for feature in ('speeds', 'transmission_type', 'control_type'):
        frame[feature] = frame[feature].fillna('un')

<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 5px; background-color: #1e3d59  ; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #f1c40f ; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 36px;"> Modeling </h1>
</div>

In [58]:
# Target is the price; every other column is a feature
y = df_tr['price']
X = df_tr.drop(columns='price')

# Columns handed to CatBoost as categorical features
object_columns = [
    'brand', 'model', 'speeds', 'transmission_type', 'control_type',
    'engine', 'ext_col', 'int_col', 'accident', 'clean_title',
]

# Convert object columns to string and handle missing values
def preprocess_object_columns(df, object_columns):
    """Return a copy of ``df`` whose object-dtype categorical columns are strings.

    Args:
        df: Input DataFrame; it is not modified.
        object_columns: Names of the columns to process. Columns whose dtype
            is not ``object`` are left untouched.

    Returns:
        pd.DataFrame: Copy of ``df`` in which each processed column has its
        missing values replaced by ``'unknown'`` and every value cast to str.
    """
    df_processed = df.copy()
    for col in object_columns:
        if df_processed[col].dtype == 'object':
            # Fill before casting: astype(str) turns NaN into the literal
            # string 'nan', after which fillna has nothing left to replace.
            df_processed[col] = df_processed[col].fillna('unknown').astype(str)
    return df_processed

# Preprocess train and test data (both names are rebound to processed copies)
X = preprocess_object_columns(X, object_columns)
df_ts = preprocess_object_columns(df_ts, object_columns)

# Define model parameters
# SEED: used for the KFold shuffle and as CatBoost's random_seed
# n_splits: number of CV folds; also the divisor when averaging test predictions
SEED = 42
n_splits = 5

def Train_ML(model):
    """Cross-validate ``model`` and return fold-averaged test predictions.

    For each of the ``n_splits`` shuffled K-fold splits of the module-level
    ``X``/``y``, a fresh clone of ``model`` is fitted with ``object_columns``
    passed as ``cat_features``, its validation RMSE is printed, and its
    predictions for the module-level ``df_ts`` are accumulated.

    Args:
        model: Unfitted estimator that supports ``sklearn.base.clone`` and
            whose ``fit`` accepts a ``cat_features`` argument.

    Returns:
        np.ndarray: Mean of the per-fold predictions for ``df_ts``.

    Raises:
        KeyError: If ``df_ts`` lacks a column that is present in ``X``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    # The fuel indicator columns were created separately for train and test in
    # order of first appearance, so the two frames do not share a column order.
    # Align the test features to the training layout explicitly instead of
    # relying on the model to match columns by name.
    test_features = df_ts[X.columns]

    test_preds = np.zeros(test_features.shape[0])
    val_rmse_list = []

    start_time = time.time()  # Start timing

    for fold_idx, (train_index, val_index) in enumerate(kf.split(X)):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit a fresh clone so no state carries over between folds
        model_clone = clone(model)
        model_clone.fit(X_train, y_train, cat_features=object_columns)

        val_preds = model_clone.predict(X_val)
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        val_rmse_list.append(val_rmse)

        print(f"Fold {fold_idx + 1}:")
        print(f"Validation RMSE: {val_rmse:.4f}")

        # Accumulate this fold's test predictions; averaged after the loop
        test_preds += model_clone.predict(test_features)

    mean_test_preds = test_preds / n_splits
    mean_val_rmse = np.mean(val_rmse_list)

    end_time = time.time()  # End timing
    elapsed_time = end_time - start_time

    print('\n')
    print('==========================================')
    print(f"Mean Validation RMSE: {mean_val_rmse:.4f}")
    print(f"Elapsed Time: {elapsed_time:.2f} seconds")
    print('==========================================')

    return mean_test_preds

# CatBoost configuration: 1000 boosting iterations, depth-6 trees, RMSE objective
catboost_params = {
    'iterations': 1000,
    'depth': 6,
    'learning_rate': 0.1,
    'random_seed': SEED,
    'loss_function': 'RMSE',
    'verbose': False,
}
catboost_model = CatBoostRegressor(**catboost_params)

# Cross-validate and collect the fold-averaged test predictions
test_predictions = Train_ML(catboost_model)

Fold 1:
Validation RMSE: 75420.5152
Fold 2:
Validation RMSE: 68260.3137
Fold 3:
Validation RMSE: 69888.5155
Fold 4:
Validation RMSE: 74948.4595
Fold 5:
Validation RMSE: 75491.6364


Mean Validation RMSE: 72801.8881
Elapsed Time: 710.99 seconds


<div style="border-radius: 15px 0 15px 0px; border: 2px solid #f1c40f; padding: 5px; background-color: #1e3d59  ; text-align: center; box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.2);">
    <h1 style="color: #f1c40f ; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5); font-weight: bold; margin-bottom: 10px; font-size: 36px;"> Submission</h1>
</div>

In [59]:
# Make sure the number of predictions matches the number of rows in df_sbb
assert len(test_predictions) == len(df_sbb), "Mismatch between number of predictions and submission file rows."

# Assign the predictions to the 'price' column
df_sbb['price'] = test_predictions

# Save the updated submission file (written without the DataFrame index)
df_sbb.to_csv("saman_sb5.csv", index=False)

# Check the first few rows of the updated DataFrame
df_sbb.head()


Unnamed: 0,id,price
0,188533,16648.743554
1,188534,80301.70169
2,188535,53036.150801
3,188536,28236.930562
4,188537,31700.667949
