## Import Dependency

In [152]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

import os
import pickle

## Load Test Data

In [153]:
test = pd.read_csv(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\playground-series-s4e9\test.csv")

test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


## Clean Data

Will be using the pickle for pipeline!

In [154]:
missing_values = test.isna().sum()

for column, values in missing_values.items():
    if values > 0:
        print(f"{column}: {values}")

fuel_type: 3383
accident: 1632
clean_title: 14239


In [155]:
id = test['id']

test.drop(columns='id', inplace=True)

In [156]:
def clean_fuel_type(data):
    electric_brands = ['Tesla', 'Lucid', 'Rivian']

    # Fill in the 'fuel_type' with 'Electric' if the brand is in the electric_brands list.
    data.loc[data['brand'].isin(electric_brands), 'fuel_type'] = 'Electric'

    # Fill in missing values or update to the correct value using the 'engine' column.
    data.loc[((data['engine'].str.contains('Electric|Battery'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Electric'
    data.loc[((data['engine'].str.contains('Hybrid'))&(data['fuel_type'].isna()|(data['fuel_type']=='–'))), 'fuel_type'] = 'Hybrid'
    data.loc[((data['engine'].str.contains('Gasoline'))&(data['fuel_type'].isna()|(data['fuel_type']=='not supported')|(data['fuel_type']=='–'))), 'fuel_type'] = 'Gasoline'
    data.loc[((data['engine'].str.contains('Diesel'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Diesel'
    data.loc[((data['engine'].str.contains('Hydrogen'))&(data['fuel_type']=='not supported')), 'fuel_type'] = 'Hydrogen' # Toyota has a Hydrogen Car!

    # Fill in missing values or update to the correct value using the 'model' column.
    data.loc[((data['model'].str.contains('EV|Electric'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Electric' # EV = Electric Vehicle

    # Rest are all filled with the value 'Gasoline'ArithmeticError
    data.loc[((data['fuel_type'].isna())|(data['fuel_type'].str.contains('not supported|–'))), 'fuel_type'] = 'Gasoline'

    return data

In [157]:
test = clean_fuel_type(test)

In [158]:
test['accident'].fillna('Unknown', inplace=True)

In [159]:
test['clean_title'].fillna('Unknown', inplace=True)

In [160]:
test.apply(lambda x: (x == '–').sum())

brand              0
model              0
model_year         0
milage             0
fuel_type          0
engine           617
transmission      40
ext_col          223
int_col         2953
accident           0
clean_title        0
dtype: int64

In [161]:
test.replace('–', np.nan, inplace=True)

In [162]:
test.loc[test['brand']=='Land', 'brand'] = 'Land Rover'

In [163]:
transmission_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\transmission_dict.pickle", "rb"))
engine_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\engine_dict.pickle", "rb"))
brand_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\brand_dict.pickle", "rb"))

In [164]:
test['transmission'] = test.apply(lambda row: transmission_dict[row['model']] if pd.isna(row['transmission']) else row['transmission'], axis=1)
test['engine'] = test.apply(lambda row: engine_dict[row['model']] if pd.isna(row['engine']) else row['engine'], axis=1)
test['brand'] = test.apply(lambda row: brand_dict[row['model']] if pd.isna(row['brand']) else row['brand'], axis=1)

In [165]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         125690 non-null  object
 1   model         125690 non-null  object
 2   model_year    125690 non-null  int64 
 3   milage        125690 non-null  int64 
 4   fuel_type     125690 non-null  object
 5   engine        125680 non-null  object
 6   transmission  125690 non-null  object
 7   ext_col       125467 non-null  object
 8   int_col       122737 non-null  object
 9   accident      125690 non-null  object
 10  clean_title   125690 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.5+ MB


In [166]:
test['age'] = datetime.now().year - test['model_year']
test['age_bins'] = pd.cut(test['age'], bins=[0, 2, 5, 10, 15, 20, 32], labels=['Fairly New', '3-5 Years', '6-10 Years', '11-15 Years', '16-20 Years', 'Old'])

In [167]:
test['mi_per_year'] = round(test['milage'] / test['age'], 2)
test['mi_per_year'] = np.where(test['mi_per_year']==np.inf, test['milage'], test['mi_per_year'])

In [168]:
def extract_horsepower(hp):
    try:
        pattern = r'\d+\.\d+HP'
        match = re.search(pattern, hp)
        hp_string = match.group()
        horsepower = hp_string.replace('.0HP', '')
        horsepower = int(horsepower)
        return horsepower
    except:
        return np.nan
    
def extract_engine_liter(liter):
    try:
        pattern = r'\d+\.\d+L'
        match = re.search(pattern, liter)
        liter_string = match.group()
        liter = liter_string.replace('L', '')
        liter = float(liter)
        return liter
    except:
        return np.nan
    
def extract_cylinder(cyn):
    try:
        pattern = r'\d+ Cylinder'
        match = re.search(pattern, cyn)
        cyn_string = match.group()
        cylinder = cyn_string.replace(' Cylinder', '')
        cylinder = int(cylinder)
        return cylinder
    except:
        return np.nan

In [169]:
test['hp'] = test['engine'].apply(extract_horsepower)
test['engine_liter'] = test['engine'].apply(extract_engine_liter)
test['cylinder'] = test['engine'].apply(extract_cylinder)

In [170]:
# Will be pickled later, right now just number from the train data.

hp_median = 328
engine_liter_median = 3.5
cylinder_median = 6

def fill_missing_values(row):
    row['hp'] = row['hp'] if not pd.isna(row['hp']) else hp_median
    row['engine_liter'] = row['engine_liter'] if not pd.isna(row['engine_liter']) else engine_liter_median
    row['cylinder'] = row['cylinder'] if not pd.isna(row['cylinder']) else cylinder_median
    return row

In [171]:
test = test.apply(fill_missing_values, axis=1)

In [172]:
def extract_gear(gear):
    try:
        pattern = r'^\d+'
        match = re.search(pattern, gear)
        num_gear = match.group()
        return int(num_gear)
    except:
        return np.nan

In [173]:
test['num_gear'] = test['transmission'].apply(extract_gear)

In [174]:
test.loc[((test['transmission']=='Single-Speed Fixed Gear')&(test['num_gear'].isna())), 'num_gear'] = 1

In [175]:
def remove_speed(trans):
    pattern = r'\d+-[Ss][Pp][Ee][Ee][Dd]\s*'
    return re.sub(pattern, '', trans).strip()

test['transmission'] = test['transmission'].apply(remove_speed)

In [176]:
def replace_transmission(trans):
    if 'A/T' in trans:
        return trans.replace('A/T', 'Automatic')
    elif 'M/T' in trans:
        return trans.replace('M/T', 'Manual')
    elif 'Variable' in trans:
        return trans.replace('Variable', 'CVT')
    else:
        return trans

# Apply the function to the 'transmission' column
test['transmission'] = test['transmission'].apply(replace_transmission)

In [177]:
correct_transmission = {
    'CVT Transmission': 'CVT',
    'Automatic CVT': 'CVT',
    'Electronically Controlled Automatic with O':'Automatic with Overdrive',
    'F': np.nan,
    'CVT-F':'CVT',
    '2': np.nan,
    '6 Speed At/Mt': 'Automated Manual',
    '': np.nan,
    'AT':'Automatic',
    'SCHEDULED FOR OR IN PRODUCTION': np.nan,
    '6 Speed Mt': 'Manual'}

test['transmission'] = test['transmission'].replace(correct_transmission)

In [178]:
test['transmission'].fillna('Automatic', inplace=True)

test['num_gear'].fillna(6, inplace=True)

In [179]:
hp_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\hp_dict.pickle", "rb"))

In [180]:
test['hp'] = test.apply(lambda row: hp_dict[row['model']] if row['model'] in hp_dict and row['hp'] != hp_dict[row['model']] else row['hp'], axis=1)

In [181]:
test['engine_efficiency'] = round(test['hp'] / test['engine_liter'], 2)

In [182]:
brand_counts = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\brand_counts.pickle", "rb"))
test['brand_count'] = test['brand'].map(brand_counts)
test['brand_rank'] = test['brand_count'].rank(method='dense', ascending=False)

In [183]:
test.drop(columns=['model_year', 'engine', 'brand_count'], inplace=True)

In [184]:
test.head()

Unnamed: 0,brand,model,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,age,age_bins,mi_per_year,hp,engine_liter,cylinder,num_gear,engine_efficiency,brand_rank
0,Land Rover,Rover LR2 Base,98000,Gasoline,Automatic,White,Beige,None reported,Yes,9,6-10 Years,10888.89,240.0,2.0,4.0,6.0,120.0,7.0
1,Land Rover,Rover Defender SE,9142,Hybrid,Automatic,Silver,Black,None reported,Yes,4,3-5 Years,2285.5,395.0,3.0,6.0,8.0,131.67,7.0
2,Ford,Expedition Limited,28121,Gasoline,Automatic,White,Ebony,None reported,Unknown,2,Fairly New,14060.5,328.0,3.5,6.0,10.0,93.71,1.0
3,Audi,A6 2.0T Sport,61258,Gasoline,Automatic,Silician Yellow,Black,None reported,Unknown,8,6-10 Years,7657.25,252.0,3.5,6.0,6.0,72.0,5.0
4,Audi,A6 2.0T Premium Plus,59000,Gasoline,Automatic,Gray,Black,None reported,Yes,6,6-10 Years,9833.33,252.0,2.0,4.0,6.0,126.0,5.0


In [185]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   brand              125690 non-null  object 
 1   model              125690 non-null  object 
 2   milage             125690 non-null  int64  
 3   fuel_type          125690 non-null  object 
 4   transmission       125690 non-null  object 
 5   ext_col            125467 non-null  object 
 6   int_col            122737 non-null  object 
 7   accident           125690 non-null  object 
 8   clean_title        125690 non-null  object 
 9   age                125690 non-null  int64  
 10  age_bins           125567 non-null  object 
 11  mi_per_year        125690 non-null  float64
 12  hp                 125690 non-null  float64
 13  engine_liter       125690 non-null  float64
 14  cylinder           125690 non-null  float64
 15  num_gear           125690 non-null  float64
 16  en

## Preprocessing

In [186]:
test.drop(columns=['num_gear'], inplace=True)

In [187]:
pre_scale_data = test[test.select_dtypes(['float', 'int']).columns]

pre_scale_data

Unnamed: 0,milage,age,mi_per_year,hp,engine_liter,cylinder,engine_efficiency,brand_rank
0,98000,9,10888.89,240.0,2.0,4.0,120.00,7.0
1,9142,4,2285.50,395.0,3.0,6.0,131.67,7.0
2,28121,2,14060.50,328.0,3.5,6.0,93.71,1.0
3,61258,8,7657.25,252.0,3.5,6.0,72.00,5.0
4,59000,6,9833.33,252.0,2.0,4.0,126.00,5.0
...,...,...,...,...,...,...,...,...
125685,83315,10,8331.50,362.0,3.0,6.0,120.67,2.0
125686,29336,5,5867.20,335.0,3.5,6.0,95.71,5.0
125687,77634,12,6469.50,333.0,3.0,6.0,111.00,5.0
125688,112000,12,9333.33,333.0,3.0,6.0,111.00,5.0


In [188]:
scaler = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\scaler.pickle", "rb"))

In [189]:
scaled_data = pd.DataFrame(scaler.fit_transform(pre_scale_data), columns=pre_scale_data.columns)

scaled_data

Unnamed: 0,milage,age,mi_per_year,hp,engine_liter,cylinder,engine_efficiency,brand_rank
0,0.636302,0.140564,0.345227,-0.960290,-1.361337,-1.636428,0.760436,-0.264159
1,-1.132944,-0.740684,-1.006444,0.502172,-0.597274,-0.212620,1.133783,-0.264159
2,-0.755054,-1.093183,0.843516,-0.129989,-0.215243,-0.212620,-0.080637,-0.937038
3,-0.095265,-0.035686,-0.162493,-0.847067,-0.215243,-0.212620,-0.775185,-0.488452
4,-0.140224,-0.388185,0.179389,-0.847067,-1.361337,-1.636428,0.952388,-0.488452
...,...,...,...,...,...,...,...,...
125685,0.343910,0.316813,-0.056562,0.190809,-0.597274,-0.212620,0.781870,-0.824892
125686,-0.730862,-0.564435,-0.443726,-0.063942,-0.215243,-0.212620,-0.016652,-0.488452
125687,0.230796,0.669312,-0.349099,-0.082813,-0.597274,-0.212620,0.472507,-0.488452
125688,0.915055,0.669312,0.100834,-0.082813,-0.597274,-0.212620,0.472507,-0.488452


In [190]:
test['accident'] =  test['accident'].map(lambda x: 1 if x=='At least 1 accident or damage reported' else 0)
test['clean_title'] =  test['clean_title'].map(lambda x: 1 if x=='Yes' else 0)

In [191]:
test.drop(columns=['ext_col', 'int_col'], inplace=True)

In [192]:
pre_enc_data = test[test.select_dtypes(['object']).columns]

pre_enc_data

Unnamed: 0,brand,model,fuel_type,transmission,age_bins
0,Land Rover,Rover LR2 Base,Gasoline,Automatic,6-10 Years
1,Land Rover,Rover Defender SE,Hybrid,Automatic,3-5 Years
2,Ford,Expedition Limited,Gasoline,Automatic,Fairly New
3,Audi,A6 2.0T Sport,Gasoline,Automatic,6-10 Years
4,Audi,A6 2.0T Premium Plus,Gasoline,Automatic,6-10 Years
...,...,...,...,...,...
125685,Mercedes-Benz,GL-Class GL 450 4MATIC,Gasoline,Automatic,6-10 Years
125686,Audi,Q7 55 Prestige,Gasoline,Automatic,3-5 Years
125687,Audi,A6 3.0T Premium Plus,Gasoline,Automatic,11-15 Years
125688,Audi,Q7 3.0T Premium,Gasoline,Automatic,11-15 Years


In [193]:
encoded_data = pd.get_dummies(pre_enc_data, dtype='int')

encoded_data.head()

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,transmission_Manual,transmission_Single-Speed Fixed Gear,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode,age_bins_11-15 Years,age_bins_16-20 Years,age_bins_3-5 Years,age_bins_6-10 Years,age_bins_Fairly New,age_bins_Old
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [194]:
test = pd.concat([encoded_data, scaled_data], axis=1)

test.head()

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,age_bins_Fairly New,age_bins_Old,milage,age,mi_per_year,hp,engine_liter,cylinder,engine_efficiency,brand_rank
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0.636302,0.140564,0.345227,-0.96029,-1.361337,-1.636428,0.760436,-0.264159
1,0,0,0,0,0,0,0,0,0,0,...,0,0,-1.132944,-0.740684,-1.006444,0.502172,-0.597274,-0.21262,1.133783,-0.264159
2,0,0,0,0,0,0,0,0,0,0,...,1,0,-0.755054,-1.093183,0.843516,-0.129989,-0.215243,-0.21262,-0.080637,-0.937038
3,0,0,0,1,0,0,0,0,0,0,...,0,0,-0.095265,-0.035686,-0.162493,-0.847067,-0.215243,-0.21262,-0.775185,-0.488452
4,0,0,0,1,0,0,0,0,0,0,...,0,0,-0.140224,-0.388185,0.179389,-0.847067,-1.361337,-1.636428,0.952388,-0.488452


In [203]:
features = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\slected_fetures.pickle", "rb"))

features

Index(['brand_Acura', 'brand_Alfa', 'brand_Aston', 'brand_Audi', 'brand_BMW',
       'brand_Bentley', 'brand_Bugatti', 'brand_Buick', 'brand_Cadillac',
       'brand_Chevrolet',
       ...
       'age_bins_Fairly New', 'age_bins_Old', 'milage', 'age', 'mi_per_year',
       'hp', 'engine_liter', 'cylinder', 'engine_efficiency', 'brand_rank'],
      dtype='object', length=1985)

In [204]:
len(features)

1985

In [205]:
len(test.columns)

1986

In [206]:
missing_features = set(features) - set(test.columns)

for ft in missing_features:
    if len(ft) == None:
        print("None missing!")
    elif len(ft) != None:
        test[ft] = 0

In [207]:
reordered_test = test[features]

reordered_test

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,age_bins_Fairly New,age_bins_Old,milage,age,mi_per_year,hp,engine_liter,cylinder,engine_efficiency,brand_rank
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0.636302,0.140564,0.345227,-0.960290,-1.361337,-1.636428,0.760436,-0.264159
1,0,0,0,0,0,0,0,0,0,0,...,0,0,-1.132944,-0.740684,-1.006444,0.502172,-0.597274,-0.212620,1.133783,-0.264159
2,0,0,0,0,0,0,0,0,0,0,...,1,0,-0.755054,-1.093183,0.843516,-0.129989,-0.215243,-0.212620,-0.080637,-0.937038
3,0,0,0,1,0,0,0,0,0,0,...,0,0,-0.095265,-0.035686,-0.162493,-0.847067,-0.215243,-0.212620,-0.775185,-0.488452
4,0,0,0,1,0,0,0,0,0,0,...,0,0,-0.140224,-0.388185,0.179389,-0.847067,-1.361337,-1.636428,0.952388,-0.488452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125685,0,0,0,0,0,0,0,0,0,0,...,0,0,0.343910,0.316813,-0.056562,0.190809,-0.597274,-0.212620,0.781870,-0.824892
125686,0,0,0,1,0,0,0,0,0,0,...,0,0,-0.730862,-0.564435,-0.443726,-0.063942,-0.215243,-0.212620,-0.016652,-0.488452
125687,0,0,0,1,0,0,0,0,0,0,...,0,0,0.230796,0.669312,-0.349099,-0.082813,-0.597274,-0.212620,0.472507,-0.488452
125688,0,0,0,1,0,0,0,0,0,0,...,0,0,0.915055,0.669312,0.100834,-0.082813,-0.597274,-0.212620,0.472507,-0.488452


In [208]:
model = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\catboost_model.pickle", "rb"))

In [209]:
prediction = np.exp(model.predict(reordered_test))

In [210]:
prediction

array([16457.68287063, 58739.17771286, 50393.27516523, ...,
       18744.61305274, 13556.51050008, 35813.11762444])

In [211]:
submission = pd.concat([id, pd.Series(prediction)], axis=1)
submission.columns = ['id', 'price']

submission

Unnamed: 0,id,price
0,188533,16457.682871
1,188534,58739.177713
2,188535,50393.275165
3,188536,23981.197051
4,188537,28374.843439
...,...,...
125685,314218,21464.930018
125686,314219,42277.593289
125687,314220,18744.613053
125688,314221,13556.510500


In [212]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
folder_path = r'C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression'
file_path = os.path.join(folder_path, f'submission_{timestamp}.csv')
submission.to_csv(file_path, index=False)