## Import Dependency

In [106]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

import os
import pickle

## Load Test Data

In [107]:
test = pd.read_csv(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\playground-series-s4e9\test.csv")

test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


## Clean Data

Will be using the pickle for pipeline!

In [108]:
missing_values = test.isna().sum()

for column, values in missing_values.items():
    if values > 0:
        print(f"{column}: {values}")

fuel_type: 3383
accident: 1632
clean_title: 14239


In [109]:
id = test['id']

In [110]:
test.drop(columns='id', inplace=True)

In [111]:
def clean_fuel_type(data):
    electric_brands = ['Tesla', 'Lucid', 'Rivian']

    # Fill in the 'fuel_type' with 'Electric' if the brand is in the electric_brands list.
    data.loc[data['brand'].isin(electric_brands), 'fuel_type'] = 'Electric'

    # Fill in missing values or update to the correct value using the 'engine' column.
    data.loc[((data['engine'].str.contains('Electric|Battery'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Electric'
    data.loc[((data['engine'].str.contains('Hybrid'))&(data['fuel_type'].isna()|(data['fuel_type']=='–'))), 'fuel_type'] = 'Hybrid'
    data.loc[((data['engine'].str.contains('Gasoline'))&(data['fuel_type'].isna()|(data['fuel_type']=='not supported')|(data['fuel_type']=='–'))), 'fuel_type'] = 'Gasoline'
    data.loc[((data['engine'].str.contains('Diesel'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Diesel'
    data.loc[((data['engine'].str.contains('Hydrogen'))&(data['fuel_type']=='not supported')), 'fuel_type'] = 'Hydrogen' # Toyota has a Hydrogen Car!

    # Fill in missing values or update to the correct value using the 'model' column.
    data.loc[((data['model'].str.contains('EV|Electric'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Electric' # EV = Electric Vehicle

    # Rest are all filled with the value 'Gasoline'ArithmeticError
    data.loc[((data['fuel_type'].isna())|(data['fuel_type'].str.contains('not supported|–'))), 'fuel_type'] = 'Gasoline'

    return data

In [112]:
test = clean_fuel_type(test)

In [113]:
test['accident'].fillna('Unknown', inplace=True)
test['clean_title'].fillna('Unknown', inplace=True)

In [114]:
test.apply(lambda x: (x == '–').sum())

brand              0
model              0
model_year         0
milage             0
fuel_type          0
engine           617
transmission      40
ext_col          223
int_col         2953
accident           0
clean_title        0
dtype: int64

In [115]:
test.replace('–', np.nan, inplace=True)

In [116]:
test.loc[test['brand']=='Land', 'brand'] = 'Land Rover'

In [117]:
test['ext_col'].fillna('Unknown', inplace=True)
test['int_col'].fillna('Unknown', inplace=True)

test['ext_col'] = test['ext_col'].str.title()
test['int_col'] = test['int_col'].str.title()

In [118]:
transmission_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\transmission_dict.pickle", "rb"))
engine_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\engine_dict.pickle", "rb"))
brand_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\brand_dict.pickle", "rb"))

In [119]:
test['transmission'] = test.apply(lambda row: transmission_dict[row['model']] if pd.isna(row['transmission']) else row['transmission'], axis=1)
test['engine'] = test.apply(lambda row: engine_dict[row['model']] if pd.isna(row['engine']) else row['engine'], axis=1)
test['brand'] = test['model'].map(brand_dict)

In [120]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         125689 non-null  object
 1   model         125690 non-null  object
 2   model_year    125690 non-null  int64 
 3   milage        125690 non-null  int64 
 4   fuel_type     125690 non-null  object
 5   engine        125680 non-null  object
 6   transmission  125690 non-null  object
 7   ext_col       125690 non-null  object
 8   int_col       125690 non-null  object
 9   accident      125690 non-null  object
 10  clean_title   125690 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.5+ MB


## Feature Engineering

In [121]:
test['age'] = datetime.now().year - test['model_year']
test['age_bins'] = pd.cut(test['age'], bins=[-np.inf, 2, 5, 10, 15, 20, np.inf], labels=['Fairly New', '3-5 Years', '6-10 Years', '11-15 Years', '16-20 Years', 'Old'])

In [122]:
def extract_horsepower(hp):
    try:
        pattern = r'\d+\.\d+HP'
        match = re.search(pattern, hp)
        hp_string = match.group()
        horsepower = hp_string.replace('.0HP', '')
        horsepower = int(horsepower)
        return horsepower
    except:
        return np.nan
    
def extract_engine_liter(liter):
    try:
        pattern = r'\d+\.\d+L'
        match = re.search(pattern, liter)
        liter_string = match.group()
        liter = liter_string.replace('L', '')
        liter = float(liter)
        return liter
    except:
        return np.nan
    
def extract_cylinder(cyn):
    try:
        pattern = r'\d+ Cylinder'
        match = re.search(pattern, cyn)
        cyn_string = match.group()
        cylinder = cyn_string.replace(' Cylinder', '')
        cylinder = int(cylinder)
        return cylinder
    except:
        return np.nan

In [123]:
test['hp'] = test['engine'].apply(extract_horsepower)
test['engine_liter'] = test['engine'].apply(extract_engine_liter)
test['cylinder'] = test['engine'].apply(extract_cylinder)

In [124]:
# Will be pickled later, right now just number from the train data.

hp_median = 328
engine_liter_median = 3.5
cylinder_median = 6

def fill_missing_values(row):
    row['hp'] = row['hp'] if not pd.isna(row['hp']) else hp_median
    row['engine_liter'] = row['engine_liter'] if not pd.isna(row['engine_liter']) else engine_liter_median
    row['cylinder'] = row['cylinder'] if not pd.isna(row['cylinder']) else cylinder_median
    return row

In [125]:
test = test.apply(fill_missing_values, axis=1)

In [126]:
hp_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\hp_dict.pickle", "rb"))
test['hp'] = test.apply(lambda row: hp_dict[row['model']] if row['model'] in hp_dict and row['hp'] != hp_dict[row['model']] else row['hp'], axis=1)

In [127]:
def extract_gear(gear):
    try:
        pattern = r'^\d+'
        match = re.search(pattern, gear)
        num_gear = match.group()
        return int(num_gear)
    except:
        return np.nan

In [128]:
test['num_gear'] = test['transmission'].apply(extract_gear)

In [129]:
test.loc[((test['transmission']=='Single-Speed Fixed Gear')&(test['num_gear'].isna())), 'num_gear'] = 1

In [130]:
def remove_speed(trans):
    pattern = r'\d+-[Ss][Pp][Ee][Ee][Dd]\s*'
    return re.sub(pattern, '', trans).strip()

test['transmission'] = test['transmission'].apply(remove_speed)

In [131]:
def replace_transmission(trans):
    if 'A/T' in trans:
        return trans.replace('A/T', 'Automatic')
    elif 'M/T' in trans:
        return trans.replace('M/T', 'Manual')
    elif 'Variable' in trans:
        return trans.replace('Variable', 'CVT')
    else:
        return trans

# Apply the function to the 'transmission' column
test['transmission'] = test['transmission'].apply(replace_transmission)

In [132]:
correct_transmission = {
    'CVT Transmission': 'CVT',
    'Automatic CVT': 'CVT',
    'Electronically Controlled Automatic with O':'Automatic with Overdrive',
    'F': np.nan,
    'CVT-F':'CVT',
    '2': np.nan,
    '6 Speed At/Mt': 'Automated Manual',
    '': np.nan,
    'AT':'Automatic',
    'SCHEDULED FOR OR IN PRODUCTION': np.nan,
    '6 Speed Mt': 'Manual'}

test['transmission'] = test['transmission'].replace(correct_transmission)

In [133]:
test['transmission'].fillna('Automatic', inplace=True)

test['num_gear'].fillna(6, inplace=True)

In [134]:
test.drop(columns=['model_year','engine'], inplace=True)

In [135]:
test.head()

Unnamed: 0,brand,model,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,age,age_bins,hp,engine_liter,cylinder,num_gear
0,Land Rover,Rover LR2 Base,98000,Gasoline,Automatic,White,Beige,None reported,Yes,9,6-10 Years,240.0,2.0,4.0,6.0
1,Land Rover,Rover Defender SE,9142,Hybrid,Automatic,Silver,Black,None reported,Yes,4,3-5 Years,395.0,3.0,6.0,8.0
2,Ford,Expedition Limited,28121,Gasoline,Automatic,White,Ebony,None reported,Unknown,2,Fairly New,328.0,3.5,6.0,10.0
3,Audi,A6 2.0T Sport,61258,Gasoline,Automatic,Silician Yellow,Black,None reported,Unknown,8,6-10 Years,252.0,3.5,6.0,6.0
4,Audi,A6 2.0T Premium Plus,59000,Gasoline,Automatic,Gray,Black,None reported,Yes,6,6-10 Years,252.0,2.0,4.0,6.0


In [136]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   brand         125689 non-null  object 
 1   model         125690 non-null  object 
 2   milage        125690 non-null  int64  
 3   fuel_type     125690 non-null  object 
 4   transmission  125690 non-null  object 
 5   ext_col       125690 non-null  object 
 6   int_col       125690 non-null  object 
 7   accident      125690 non-null  object 
 8   clean_title   125690 non-null  object 
 9   age           125690 non-null  int64  
 10  age_bins      125690 non-null  object 
 11  hp            125690 non-null  float64
 12  engine_liter  125690 non-null  float64
 13  cylinder      125690 non-null  float64
 14  num_gear      125690 non-null  float64
dtypes: float64(4), int64(2), object(9)
memory usage: 14.4+ MB


In [137]:
test['brand'].fillna('Ford', inplace=True) # <- mode brand from train data

In [138]:
test.isna().sum().sum()

0

## Preprocessing

In [139]:
# test.drop(columns=['num_gear'], inplace=True)

In [140]:
pre_scale_data = test[test.select_dtypes(['float', 'int']).columns]

pre_scale_data

Unnamed: 0,milage,age,hp,engine_liter,cylinder,num_gear
0,98000,9,240.0,2.0,4.0,6.0
1,9142,4,395.0,3.0,6.0,8.0
2,28121,2,328.0,3.5,6.0,10.0
3,61258,8,252.0,3.5,6.0,6.0
4,59000,6,252.0,2.0,4.0,6.0
...,...,...,...,...,...,...
125685,83315,10,362.0,3.0,6.0,7.0
125686,29336,5,335.0,3.5,6.0,6.0
125687,77634,12,333.0,3.0,6.0,6.0
125688,112000,12,333.0,3.0,6.0,6.0


In [141]:
scaler = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\scaler.pickle", "rb"))

In [142]:
scaled_data = pd.DataFrame(scaler.fit_transform(pre_scale_data), columns=pre_scale_data.columns)

scaled_data

Unnamed: 0,milage,age,hp,engine_liter,cylinder,num_gear
0,0.636302,0.140564,-0.960290,-1.361337,-1.636428,-0.422991
1,-1.132944,-0.740684,0.502172,-0.597274,-0.212620,0.991856
2,-0.755054,-1.093183,-0.129989,-0.215243,-0.212620,2.406703
3,-0.095265,-0.035686,-0.847067,-0.215243,-0.212620,-0.422991
4,-0.140224,-0.388185,-0.847067,-1.361337,-1.636428,-0.422991
...,...,...,...,...,...,...
125685,0.343910,0.316813,0.190809,-0.597274,-0.212620,0.284433
125686,-0.730862,-0.564435,-0.063942,-0.215243,-0.212620,-0.422991
125687,0.230796,0.669312,-0.082813,-0.597274,-0.212620,-0.422991
125688,0.915055,0.669312,-0.082813,-0.597274,-0.212620,-0.422991


In [143]:
test['accident'] =  test['accident'].map(lambda x: 1 if x=='At least 1 accident or damage reported' else 0)
test['clean_title'] =  test['clean_title'].map(lambda x: 1 if x=='Yes' else 0)

In [144]:
pre_enc_data = test[test.select_dtypes(['object']).columns]

pre_enc_data

Unnamed: 0,brand,model,fuel_type,transmission,ext_col,int_col,age_bins
0,Land Rover,Rover LR2 Base,Gasoline,Automatic,White,Beige,6-10 Years
1,Land Rover,Rover Defender SE,Hybrid,Automatic,Silver,Black,3-5 Years
2,Ford,Expedition Limited,Gasoline,Automatic,White,Ebony,Fairly New
3,Audi,A6 2.0T Sport,Gasoline,Automatic,Silician Yellow,Black,6-10 Years
4,Audi,A6 2.0T Premium Plus,Gasoline,Automatic,Gray,Black,6-10 Years
...,...,...,...,...,...,...,...
125685,Mercedes-Benz,GL-Class GL 450 4MATIC,Gasoline,Automatic,Black,Black,6-10 Years
125686,Audi,Q7 55 Prestige,Gasoline,Automatic,White,Black,3-5 Years
125687,Audi,A6 3.0T Premium Plus,Gasoline,Automatic,Black,Black,11-15 Years
125688,Audi,Q7 3.0T Premium,Gasoline,Automatic,Black,Black,11-15 Years


In [145]:
encoded_data = pd.get_dummies(pre_enc_data, dtype='int')

encoded_data.head()

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,int_col_Whisper Beige,int_col_White,int_col_White / Brown,int_col_Yellow,age_bins_11-15 Years,age_bins_16-20 Years,age_bins_3-5 Years,age_bins_6-10 Years,age_bins_Fairly New,age_bins_Old
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [146]:
test = pd.concat([encoded_data, scaled_data], axis=1)

test.head()

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,age_bins_3-5 Years,age_bins_6-10 Years,age_bins_Fairly New,age_bins_Old,milage,age,hp,engine_liter,cylinder,num_gear
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0.636302,0.140564,-0.96029,-1.361337,-1.636428,-0.422991
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,-1.132944,-0.740684,0.502172,-0.597274,-0.21262,0.991856
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,-0.755054,-1.093183,-0.129989,-0.215243,-0.21262,2.406703
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.095265,-0.035686,-0.847067,-0.215243,-0.21262,-0.422991
4,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.140224,-0.388185,-0.847067,-1.361337,-1.636428,-0.422991


In [147]:
features = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\slected_fetures.pickle", "rb"))

features

Index(['brand_Acura', 'brand_Alfa', 'brand_Aston', 'brand_Audi', 'brand_BMW',
       'brand_Bentley', 'brand_Bugatti', 'brand_Buick', 'brand_Cadillac',
       'brand_Chevrolet',
       ...
       'age_bins_3-5 Years', 'age_bins_6-10 Years', 'age_bins_Fairly New',
       'age_bins_Old', 'milage', 'age', 'hp', 'engine_liter', 'cylinder',
       'num_gear'],
      dtype='object', length=2452)

In [148]:
len(features)

2452

In [149]:
len(test.columns)

2444

In [150]:
missing_features = set(features) - set(test.columns)

for ft in missing_features:
    if len(ft) == None:
        print("None missing!")
    elif len(ft) != None:
        test[ft] = 0

In [151]:
missing_features

{'ext_col_Gt Silver',
 'ext_col_Nautical Blue Pearl',
 'model_Forte LX',
 'model_Impreza Premium',
 'model_V60 T6 R-Design Platinum',
 'model_X5 3.0i',
 'model_X5 eDrive xDrive40e',
 'model_X5 xDrive 35i Sport Activity',
 'model_i3 Base'}

In [152]:
reordered_test = test[features]

reordered_test

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,age_bins_3-5 Years,age_bins_6-10 Years,age_bins_Fairly New,age_bins_Old,milage,age,hp,engine_liter,cylinder,num_gear
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0.636302,0.140564,-0.960290,-1.361337,-1.636428,-0.422991
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,-1.132944,-0.740684,0.502172,-0.597274,-0.212620,0.991856
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,-0.755054,-1.093183,-0.129989,-0.215243,-0.212620,2.406703
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.095265,-0.035686,-0.847067,-0.215243,-0.212620,-0.422991
4,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,-0.140224,-0.388185,-0.847067,-1.361337,-1.636428,-0.422991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125685,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0.343910,0.316813,0.190809,-0.597274,-0.212620,0.284433
125686,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,-0.730862,-0.564435,-0.063942,-0.215243,-0.212620,-0.422991
125687,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.230796,0.669312,-0.082813,-0.597274,-0.212620,-0.422991
125688,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0.915055,0.669312,-0.082813,-0.597274,-0.212620,-0.422991


In [153]:
model = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\catboost.pickle", "rb"))

In [154]:
prediction = np.exp(model.predict(reordered_test))

In [155]:
prediction

array([15518.61797287, 58409.14832099, 49117.01746612, ...,
       18076.82869422, 13037.87047002, 34387.43967405])

In [156]:
submission = pd.concat([id, pd.Series(prediction)], axis=1)
submission.columns = ['id', 'price']

submission

Unnamed: 0,id,price
0,188533,15518.617973
1,188534,58409.148321
2,188535,49117.017466
3,188536,23658.727742
4,188537,27578.381067
...,...,...
125685,314218,22986.758680
125686,314219,43331.282920
125687,314220,18076.828694
125688,314221,13037.870470


In [105]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
folder_path = r'C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression'
file_path = os.path.join(folder_path, f'submission_{timestamp}.csv')
submission.to_csv(file_path, index=False)