In [172]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
import warnings 
warnings.filterwarnings('ignore')
import lightgbm as lgb
from xgboost import XGBRegressor

In [173]:
train = pd.read_csv("./datasets/train.csv")
test = pd.read_csv("./datasets/test.csv")
submission = pd.read_csv('./datasets/sample_submission.csv')

In [174]:
train.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [175]:
numerical_column_names = train.select_dtypes(include=['number']).columns
print("Numerical Column Names:", numerical_column_names.tolist())

Numerical Column Names: ['id', 'model_year', 'milage', 'price']


In [176]:
object_column_names = train.select_dtypes(include=['object']).columns
print("Object Column Names:", object_column_names.tolist())

Object Column Names: ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']


In [177]:
categorical_columns = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

In [178]:
train = train.drop("id", axis=1)

In [179]:
train['clean_title'] = train['clean_title'].fillna('No')

train['clean_title']=train['clean_title'].map({'Yes': 0,'No': 1})

In [180]:
train['accident'] = train['accident'].fillna('None reported')

train['accident'] = train['accident'].map({
    'None reported': 0,
    'At least 1 accident or damage reported': 1
}).astype(int) 

In [181]:
train['fuel_type'] = train['fuel_type'].fillna('Electric')

train['fuel_type'] = train['fuel_type'].replace({'–': 'none', 'not supported': 'none'})

train['fuel_type'] = train['fuel_type'].astype('category').cat.codes

In [182]:
Year = 2024

In [183]:
train['Age'] = Year - train['model_year']

train['Mileage_per_Year'] = train['milage'] / train['Age']

train.drop('model_year',axis=1,inplace=True)

In [184]:
def data_from_engine(df):
    df['horsepower'] = df['engine'].str.extract(r'(\d+\.\d+)(?=HP)').astype(float)
    df['engine_size'] = df['engine'].str.extract(r'(\d+\.\d+)(?=L)').astype(float)
    df['cylinders'] = df['engine'].str.extract(r'(\d+)\s(Cylinder|V\d|Straight)')[0].astype(float)
    return df

In [185]:
train = data_from_engine(train)
train = train.drop("engine", axis=1)

In [186]:
int_replacements = {
    'Medium Earth Gray': 'Gray',
    'Diesel Gray / Black': 'Gray',
    'Dark Ash': 'Gray',
    'Graphite': 'Gray',
    'Dark Galvanized': 'Charcoal',
    'Light Gray': 'Gray',
    'Ash': 'Gray',
    'Jet Black': 'Black',
    'Global Black': 'Black',
    'Black Onyx': 'Black',
    'Parchment.': 'Beige',
    'Sardar Brown': 'Brown',
    'Black/Gun Metal': 'Black',
    'Charcoal Black': 'Charcoal',
    'Ebony': 'Brown',
    'Ebony Black': 'Black',
    'Carbon Black': 'Black',
    'Obsidian Black': 'Black',
    'Black / Saddle Brown': 'Black',
    'Black/Saddle Brown': 'Black',
    'Black / Brown': 'Black',
    'Saddle Brown': 'Brown',
    'Sand Beige': 'Beige',
    'Camel': 'Beige',
    'Parchment': 'Beige',
    'Macchiato Beige/Black': 'Beige',
    'Silk Beige/Espresso Brown': 'Beige',
    'Canberra Beige': 'Beige',
    'Macchiato': 'Beige',
    'Almond Beige': 'Beige',
    'Grace White': 'White',
    'Ivory / Ebony': 'White',
    'Bianco Polar': 'White',
    'White / Brown': 'White',
    'Platinum': 'White',
    'Cloud': 'Blue',
    'Rift Metallic': 'White',
    'Light Platinum / Jet Black': 'Silver',
    'Billet Clearcoat Metallic': 'Silver',
    'Sakhir Orange': 'Orange',
    'Pimento Red w/Ebony': 'Red',
    'Adrenaline Red': 'Red',
    'Rioja Red': 'Red',
    'Classic Red': 'Red',
    'Magma Red': 'Red',
    'Cobalt Blue': 'Blue',
    'Tempest': 'Blue',
    'Stormy Sea': 'Blue',
    'Navy Pier': 'Blue',
    'Charles Blue': 'Blue',
    'Rhapsody Blue': 'Blue',
    'Kyalami Orange': 'Orange',
    'Sakhir Orange/Black': 'Orange',
    'Dark Gray': 'Gray',
    'Deep Garnet': 'Red',
    'Scarlet Ember': 'Red',
    'Beluga': 'Blue',
    'Chestnut': 'Brown',
    'Boulder': 'Gray',
    'Macchiato/Magmagrey': 'Beige',
    'Medium Stone': 'Gray',
    'BLACK': 'Black',
    'Portland': 'Gray',
    'Sandstone': 'Beige',
    'Slate': 'Gray',
    'Cappuccino': 'Brown',
    'Oyster W/Contrast': 'Beige',
    'Nero Ade': 'Black',
    'Light Titanium': 'Silver',
    'Tan': 'Beige',
    'Brandy': 'Brown',
    'Amber': 'Yellow',
    'Hotspur': 'Blue',
    'Chateau': 'Green',
    'Ice': 'Blue',
    'Blk': 'Black',
    'Mesa': 'Brown',
    'Espresso': 'Brown',
    'Ceramic': 'White',
    'Medium Dark Slate': 'Gray',
    'Graphite w/Gun Metal': 'Gray',
    'Cocoa / Dune': 'Brown',
    'Roast': 'Brown',
    'Hotspur Hide': 'Brown',
    'ORANGE': 'Orange',
    'Walnut': 'Brown',
    'Caramel': 'Beige',
    'Giallo Taurus / Nero Ade': 'Yellow',
    'Medium Pewter': 'Gray',
    'Camel Leather': 'Brown',
    'Anthracite': 'Gray',
    'Mocha': 'Brown',
    'Sahara Tan': 'Beige',
    'Porpoise': 'Beige',
    'Deep Cypress': 'Green',
    'Light Slate': 'Gray',
    'Beluga Hide': 'Black',
    'Tupelo': 'Green',
    'Gideon': 'Beige',
    'Medium Light Camel': 'Beige',
    'Nero': 'Black',
    'Deep Chestnut': 'Red',
    'Dark Auburn': 'Brown',
    'Shale': 'Gray',
    'BEIGE': 'Beige',
    'Linen': 'Beige',
    'WHITE': 'White'
}

In [187]:
def int_color_clean(int_color):
    int_color = int_color.lower()  
    if 'gray' in int_color:
        return 'Gray'
    elif 'black' in int_color:
        return 'Black'
    elif 'charcoal' in int_color:
        return 'Charcoal'
    elif 'beige' in int_color:
        return 'Beige'
    elif 'brown' in int_color:
        return 'Brown'
    elif 'blue' in int_color:
        return 'Blue'
    elif 'red' in int_color:
        return 'Red'
    elif 'orange' in int_color:
        return 'Orange'
    elif 'green' in int_color:
        return 'Green'
    elif 'yellow' in int_color:
        return 'Yellow'
    elif 'white' in int_color:
        return 'White'
    elif 'silver' in int_color:
        return 'Silver'
    else:
        return 'Other'

In [188]:
train['int_col'] = train['int_col'].astype(str).replace(int_replacements)

train['int_col'] = train['int_col'].apply(int_color_clean)

In [189]:
ext_replacements = {
    'Blu': 'Blue',
    'BLUE': 'Blue',
    'Glacier': 'Blue',
    'BLU ELEOS': 'Blue',
    'Dark Sapphire': 'Navy',
    'Tangerine': 'Orange',
    'Pumpkin': 'Orange',
    'Clementine': 'Orange',
    'Granite': 'Gray',
    'Go Mango!': 'Yellow',
    'Onyx': 'Black',
    'Gecko Pearlcoat': 'Green',
    'Obsidian': 'Black',
    'Metallic': 'Silver',
    'Grigio Nimbus': 'Silver',
    'Chalk': 'White',
    'Bianco Monocerus': 'White',
    'Verde': 'Green',
    'Dark Graphite Metallic': 'Gray',
    'BLACK': 'Black',
    'Dark Moss': 'Green',
    'Granite Crystal Clearcoat Metallic': 'Gray',
    'Ebony Twilight Metallic': 'Black',
    'Satin Steel Metallic': 'Silver',
    'Magnetic Metallic': 'Gray',
    'Dark Matter Metallic': 'Gray',
    'Dark Ash Metallic': 'Gray',
    'Iridium Metallic': 'Gray',
    'Nightfall Mica': 'Navy',
    'Sandstone Metallic': 'Beige',
    'Rift Metallic': 'White',
    'Billet Clearcoat Metallic': 'Silver',
    'Tan': 'Beige',
    'Ice': 'Blue',
    'Hellayella': 'Yellow',
    'Granite': 'Gray',
    'Yulong': 'White',
    'Blueprint': 'Navy',
    'Arancio Borealis': 'Orange',
    'Hellayella Clearcoat': 'Yellow',
    'Moonlight Cloud': 'Navy',
    'Liquid Platinum': 'Silver',
    'Gun Metallic': 'Gray',
    'Manhattan Noir Metallic': 'Gray',
    'Lavender': 'Purple',
    'Violet': 'Purple',
    'Pink': 'Purple',
    'Mauve': 'Plum',
    'Tempest': 'Blue',
    'Nero Daytona': 'Black',
    'Scarlet Ember': 'Red',
    'Infrared Tintcoat': 'Red',
    'Maximum Steel Metallic': 'Gray',
    'Ember Pearlcoat': 'Brown',
    'Rich Garnet Metallic': 'Brown',
    'Tungsten Metallic': 'Gray',
    'Nero Noctis': 'Black',
    'Platinum Quartz Metallic': 'White',
    'Ruby Flare Pearl': 'Red',
    'Bianco Icarus Metallic': 'White',
    'Stormy Sea': 'Blue',
    'Mountain Air Metallic': 'Blue',
    'Wind Chill Pearl': 'White',
    'Iridescent Pearl Tricoat': 'White',
    'Black Cherry': 'Plum',
    'Black Forest Green': 'Green',
    'Maroon': 'Red',
    'Rosso': 'Red',
    'Rosso Corsa': 'Red',
    'Rosso Mars Metallic': 'Red',
    'Quicksilver Metallic': 'Silver',
    'Designo Magno Matte': 'Gray',
    'Granite Crystal Metallic Clearcoat': 'Gray',
    'Bianco Isis': 'White',
    'Sunset Drift Chromaflair': 'Orange',
    'Ametrin Metallic': 'Plum',
    'GT SILVER': 'Silver',
    'Caviar': 'Black'
}

In [190]:
def ext_color_clean(ext_color):
    ext_color = ext_color.lower()  
    if 'gray' in ext_color or 'grey' in ext_color:
        return 'Gray'
    elif 'black' in ext_color:
        return 'Black'
    elif 'silver' in ext_color:
        return 'Silver'
    elif 'blue' in ext_color:
        return 'Blue'
    elif 'red' in ext_color:
        return 'Red'
    elif 'green' in ext_color:
        return 'Green'
    elif 'beige' in ext_color or 'tan' in ext_color:
        return 'Beige'
    elif 'brown' in ext_color:
        return 'Brown'
    elif 'white' in ext_color:
        return 'White'
    elif 'yellow' in ext_color:
        return 'Yellow'
    elif 'orange' in ext_color:
        return 'Orange'
    elif 'purple' in ext_color:
        return 'Purple'
    elif 'plum' in ext_color:
        return 'Plum'
    elif 'navy' in ext_color:
        return 'Navy'
    else:
        return 'Other'

In [191]:
train['ext_col'] = train['ext_col'].astype(str).replace(ext_replacements)

train['ext_col'] = train['ext_col'].apply(ext_color_clean)

In [192]:
object_column_names = train.select_dtypes(include=['object']).columns
print("Object Column Names:", object_column_names.tolist())

Object Column Names: ['brand', 'model', 'transmission', 'ext_col', 'int_col']


In [193]:
categorical_columns = ['brand', 'model', 'transmission', 'ext_col', 'int_col']

In [194]:
label_encoder = LabelEncoder()

for column in categorical_columns:
    train[column] = label_encoder.fit_transform(train[column])

In [195]:
train.head()

Unnamed: 0,brand,model,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,price,Age,Mileage_per_Year,horsepower,engine_size,cylinders
0,31,495,213000,3,38,14,5,0,0,4200,17,12529.411765,172.0,1.6,4.0
1,28,930,143250,3,38,12,0,1,0,4999,22,6511.363636,252.0,3.9,8.0
2,9,1575,136731,1,38,2,5,0,0,13900,22,6215.045455,320.0,5.3,8.0
3,16,758,19500,3,49,1,1,0,0,45000,7,2785.714286,420.0,5.0,8.0
4,36,1077,7388,3,23,1,0,0,0,97500,3,2462.666667,208.0,2.0,4.0


In [196]:
X = train.drop('price', axis=1)  
y = train['price']

In [197]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [198]:
best_params = {
    'n_estimators': 1225,
    'num_leaves': 137,
    'max_depth': 14,
    'cat_smooth': 96,
    'learning_rate': 0.0023941644225363256,
    'subsample': 0.9082095260228584,
    'colsample_bytree': 0.6165900236226695,
    'min_split_gain': 0.0308677316309982,
    'min_child_weight': 68,
    'lambda_l2': 1.7319600391087514e-07,
    'lambda_l1': 8.761594422544116e-07,
    'max_bin': 749,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': 42,
    'boosting_type': 'gbdt',
}

In [199]:
lgb_model = lgb.LGBMRegressor(**best_params)

callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(100)
]

In [200]:
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric='rmse',
    callbacks=callbacks
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003892 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2827
[LightGBM] [Info] Number of data points in the train set: 169679, number of used features: 14
[LightGBM] [Info] Start training from score 43888.560718
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 76918.1	valid_1's rmse: 73543.2
[200]	training's rmse: 75307.8	valid_1's rmse: 72088.9
[300]	training's rmse: 74162.2	valid_1's rmse: 71125.6
[400]	training's rmse: 73333.2	valid_1's rmse: 70491.4
[500]	training's rmse: 72717.3	valid_1's rmse: 70076.2
[600]	training's rmse: 72230.3	valid_1's rmse: 69788.9
[700]	training's rmse: 71838.7	valid_1's rmse: 69598.6
[800]	training's rmse: 71518	valid_1's rmse: 69473.8
[900]	training's rmse: 71239.6	valid_1's rmse: 69377.5
[1000]	training's rmse: 70995.9	valid_1's rmse: 69314.8
[1100]	training's rmse: 70778.1	valid_1's 

In [201]:
y_pred_lgb = lgb_model.predict(X_test)
rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
print(f"LightGBM RMSE: {rmse_lgb:.4f}")

LightGBM RMSE: 69230.2124


In [202]:
params3 = {
    'iterations': 396,
    'depth': 3,
    'learning_rate': 0.205197734145011,
    'random_strength': 0.00013766770363584716,
    'bagging_temperature': 0.8685572414400111,
    'border_count': 241,
    'l2_leaf_reg': 61.120073566627156
}
catboost_model = CatBoostRegressor(**params3)

In [203]:
catboost_model.fit(X_train, y_train)

0:	learn: 77700.1194006	total: 9.64ms	remaining: 3.81s
1:	learn: 76720.6387635	total: 17.6ms	remaining: 3.47s
2:	learn: 76008.4663409	total: 25.6ms	remaining: 3.36s
3:	learn: 75508.5955884	total: 32.6ms	remaining: 3.2s
4:	learn: 75158.8195997	total: 39.7ms	remaining: 3.1s
5:	learn: 74914.0858597	total: 45.6ms	remaining: 2.97s
6:	learn: 74737.7199165	total: 52.5ms	remaining: 2.92s
7:	learn: 74580.3248199	total: 58.4ms	remaining: 2.83s
8:	learn: 74467.7882276	total: 64.2ms	remaining: 2.76s
9:	learn: 74316.8880876	total: 69.9ms	remaining: 2.7s
10:	learn: 74247.9527502	total: 75.3ms	remaining: 2.64s
11:	learn: 74192.7925875	total: 81.1ms	remaining: 2.6s
12:	learn: 74097.6755454	total: 87.3ms	remaining: 2.57s
13:	learn: 74018.7941972	total: 94.1ms	remaining: 2.57s
14:	learn: 73937.5577849	total: 99.8ms	remaining: 2.53s
15:	learn: 73898.8925394	total: 105ms	remaining: 2.5s
16:	learn: 73871.0637703	total: 111ms	remaining: 2.47s
17:	learn: 73850.5699884	total: 117ms	remaining: 2.46s
18:	learn:

<catboost.core.CatBoostRegressor at 0x23225a1eb10>

In [204]:
test = test.drop("id", axis=1)

In [205]:
test['clean_title'] = test['clean_title'].fillna('No')

test['clean_title'] = test['clean_title'].map({'Yes': 0,'No': 1})

In [206]:
test['accident'] = test['accident'].fillna('None reported')

test['accident'] = test['accident'].map({
    'None reported': 0,
    'At least 1 accident or damage reported': 1
}).astype(int) 

In [207]:
test['fuel_type'] = test['fuel_type'].fillna('Electric')

test['fuel_type'] = test['fuel_type'].replace({'–': 'none', 'not supported': 'none'})

test['fuel_type'] = test['fuel_type'].astype('category').cat.codes

In [208]:
test['Age'] = Year - test['model_year']

test['Mileage_per_Year'] = test['milage'] / test['Age']

test.drop('model_year',axis=1,inplace=True)

In [209]:
test = data_from_engine(test)
test = test.drop("engine", axis=1)

In [210]:
test['int_col'] = test['int_col'].astype(str).replace(int_replacements)

test['int_col'] = test['int_col'].apply(int_color_clean)

In [211]:
test['ext_col'] = test['ext_col'].astype(str).replace(ext_replacements)

test['ext_col'] = test['ext_col'].apply(ext_color_clean)

In [212]:
label_encoder = LabelEncoder()

for column in categorical_columns:
    test[column] = label_encoder.fit_transform(test[column])

In [213]:
test.head()

Unnamed: 0,brand,model,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,Age,Mileage_per_Year,horsepower,engine_size,cylinders
0,26,1388,98000,3,16,13,0,0,0,9,10888.888889,240.0,2.0,4.0
1,26,1375,9142,4,31,12,1,0,0,4,2285.5,395.0,3.0,6.0
2,14,636,28121,3,3,13,3,0,1,2,14060.5,,3.5,
3,3,182,61258,3,39,14,1,0,1,8,7657.25,,,
4,3,181,59000,3,38,4,1,0,0,6,9833.333333,252.0,2.0,4.0


In [214]:
test1 = lgb_model.predict(test)

test2 = catboost_model.predict(test)



In [215]:
pred1 = pd.DataFrame(test1, columns=['LGB_Pred'])

pred2 = pd.DataFrame(test2, columns=['CatBoost_Pred'])

In [216]:
pred1.head()

Unnamed: 0,LGB_Pred
0,18761.728784
1,76123.946911
2,59321.217351
3,33792.11381
4,31271.230678


In [217]:
pred2.head()

Unnamed: 0,CatBoost_Pred
0,17516.178597
1,77011.65656
2,55072.467287
3,27405.382318
4,30304.401007


In [218]:
submission["price"] =  test2
submission.to_csv('submission_file.csv',index=False)