## Import Dependency

In [491]:
import pandas as pd
import numpy as np
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

import os
import pickle

## Load Test Data

In [492]:
test = pd.read_csv(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\playground-series-s4e9\test.csv")

test.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


## Clean Data

Will be using the pickle for pipeline!

In [493]:
missing_values = test.isna().sum()

for column, values in missing_values.items():
    if values > 0:
        print(f"{column}: {values}")

fuel_type: 3383
accident: 1632
clean_title: 14239


In [494]:
id = test['id']

test.drop(columns='id', inplace=True)

In [495]:
def clean_fuel_type(data):
    electric_brands = ['Tesla', 'Lucid', 'Rivian']

    # Fill in the 'fuel_type' with 'Electric' if the brand is in the electric_brands list.
    data.loc[data['brand'].isin(electric_brands), 'fuel_type'] = 'Electric'

    # Fill in missing values or update to the correct value using the 'engine' column.
    data.loc[((data['engine'].str.contains('Electric|Battery'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Electric'
    data.loc[((data['engine'].str.contains('Hybrid'))&(data['fuel_type'].isna()|(data['fuel_type']=='–'))), 'fuel_type'] = 'Hybrid'
    data.loc[((data['engine'].str.contains('Gasoline'))&(data['fuel_type'].isna()|(data['fuel_type']=='not supported')|(data['fuel_type']=='–'))), 'fuel_type'] = 'Gasoline'
    data.loc[((data['engine'].str.contains('Diesel'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Diesel'
    data.loc[((data['engine'].str.contains('Hydrogen'))&(data['fuel_type']=='not supported')), 'fuel_type'] = 'Hydrogen' # Toyota has a Hydrogen Car!

    # Fill in missing values or update to the correct value using the 'model' column.
    data.loc[((data['model'].str.contains('EV|Electric'))&(data['fuel_type'].isna())), 'fuel_type'] = 'Electric' # EV = Electric Vehicle

    # Rest are all filled with the value 'Gasoline'ArithmeticError
    data.loc[((data['fuel_type'].isna())|(data['fuel_type'].str.contains('not supported|–'))), 'fuel_type'] = 'Gasoline'

    return data

In [496]:
test = clean_fuel_type(test)

In [497]:
test['accident'].fillna('Unknown', inplace=True)

In [498]:
test['clean_title'].fillna('Unknown', inplace=True)

In [499]:
test.apply(lambda x: (x == '–').sum())

brand              0
model              0
model_year         0
milage             0
fuel_type          0
engine           617
transmission      40
ext_col          223
int_col         2953
accident           0
clean_title        0
dtype: int64

In [500]:
test.replace('–', np.nan, inplace=True)

In [501]:
test.loc[test['brand']=='Land', 'brand'] = 'Land Rover'

In [502]:
transmission_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\transmission_dict.pickle", "rb"))
engine_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\engine_dict.pickle", "rb"))
brand_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\brand_dict.pickle", "rb"))

In [503]:
test['transmission'] = test.apply(lambda row: transmission_dict[row['model']] if pd.isna(row['transmission']) else row['transmission'], axis=1)
test['engine'] = test.apply(lambda row: engine_dict[row['model']] if pd.isna(row['engine']) else row['engine'], axis=1)
test['brand'] = test.apply(lambda row: brand_dict[row['model']] if pd.isna(row['brand']) else row['brand'], axis=1)

In [504]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   brand         125690 non-null  object
 1   model         125690 non-null  object
 2   model_year    125690 non-null  int64 
 3   milage        125690 non-null  int64 
 4   fuel_type     125690 non-null  object
 5   engine        125680 non-null  object
 6   transmission  125690 non-null  object
 7   ext_col       125467 non-null  object
 8   int_col       122737 non-null  object
 9   accident      125690 non-null  object
 10  clean_title   125690 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.5+ MB


In [505]:
test['age'] = datetime.now().year - test['model_year']

In [506]:
def extract_horsepower(hp):
    try:
        pattern = r'\d+\.\d+HP'
        match = re.search(pattern, hp)
        hp_string = match.group()
        horsepower = hp_string.replace('.0HP', '')
        horsepower = int(horsepower)
        return horsepower
    except:
        return np.nan
    
def extract_engine_liter(liter):
    try:
        pattern = r'\d+\.\d+L'
        match = re.search(pattern, liter)
        liter_string = match.group()
        liter = liter_string.replace('L', '')
        liter = float(liter)
        return liter
    except:
        return np.nan
    
def extract_cylinder(cyn):
    try:
        pattern = r'\d+ Cylinder'
        match = re.search(pattern, cyn)
        cyn_string = match.group()
        cylinder = cyn_string.replace(' Cylinder', '')
        cylinder = int(cylinder)
        return cylinder
    except:
        return np.nan

In [507]:
test['hp'] = test['engine'].apply(extract_horsepower)
test['engine_liter'] = test['engine'].apply(extract_engine_liter)
test['cylinder'] = test['engine'].apply(extract_cylinder)

In [508]:
# Will be pickled later, right now just number from the train data.

hp_median = 328
engine_liter_median = 3.5
cylinder_median = 6

def fill_missing_values(row):
    row['hp'] = row['hp'] if not pd.isna(row['hp']) else hp_median
    row['engine_liter'] = row['engine_liter'] if not pd.isna(row['engine_liter']) else engine_liter_median
    row['cylinder'] = row['cylinder'] if not pd.isna(row['cylinder']) else cylinder_median
    return row

In [509]:
test = test.apply(fill_missing_values, axis=1)

In [510]:
def extract_gear(gear):
    try:
        pattern = r'^\d+'
        match = re.search(pattern, gear)
        num_gear = match.group()
        return int(num_gear)
    except:
        return np.nan

In [511]:
test['num_gear'] = test['transmission'].apply(extract_gear)

In [512]:
test.loc[((test['transmission']=='Single-Speed Fixed Gear')&(test['num_gear'].isna())), 'num_gear'] = 1

In [513]:
def remove_speed(trans):
    pattern = r'\d+-[Ss][Pp][Ee][Ee][Dd]\s*'
    return re.sub(pattern, '', trans).strip()

test['transmission'] = test['transmission'].apply(remove_speed)

In [514]:
def replace_transmission(trans):
    if 'A/T' in trans:
        return trans.replace('A/T', 'Automatic')
    elif 'M/T' in trans:
        return trans.replace('M/T', 'Manual')
    elif 'Variable' in trans:
        return trans.replace('Variable', 'CVT')
    else:
        return trans

# Apply the function to the 'transmission' column
test['transmission'] = test['transmission'].apply(replace_transmission)

In [515]:
correct_transmission = {
    'CVT Transmission': 'CVT',
    'Automatic CVT': 'CVT',
    'Electronically Controlled Automatic with O':'Automatic with Overdrive',
    'F': np.nan,
    'CVT-F':'CVT',
    '2': np.nan,
    '6 Speed At/Mt': 'Automated Manual',
    '': np.nan,
    'AT':'Automatic',
    'SCHEDULED FOR OR IN PRODUCTION': np.nan,
    '6 Speed Mt': 'Manual'}

test['transmission'] = test['transmission'].replace(correct_transmission)

In [516]:
test['transmission'].fillna('Automatic', inplace=True)

test['num_gear'].fillna(6, inplace=True)

In [517]:
test.drop(columns=['model_year', 'engine'], inplace=True)

In [518]:
hp_dict = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\hp_dict.pickle", "rb"))

In [519]:
test['hp'] = test.apply(lambda row: hp_dict[row['model']] if row['model'] in hp_dict and row['hp'] != hp_dict[row['model']] else row['hp'], axis=1)

In [520]:
test.head()

Unnamed: 0,brand,model,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,age,hp,engine_liter,cylinder,num_gear
0,Land Rover,Rover LR2 Base,98000,Gasoline,Automatic,White,Beige,None reported,Yes,9,240.0,2.0,4.0,6.0
1,Land Rover,Rover Defender SE,9142,Hybrid,Automatic,Silver,Black,None reported,Yes,4,395.0,3.0,6.0,8.0
2,Ford,Expedition Limited,28121,Gasoline,Automatic,White,Ebony,None reported,Unknown,2,328.0,3.5,6.0,10.0
3,Audi,A6 2.0T Sport,61258,Gasoline,Automatic,Silician Yellow,Black,None reported,Unknown,8,252.0,3.5,6.0,6.0
4,Audi,A6 2.0T Premium Plus,59000,Gasoline,Automatic,Gray,Black,None reported,Yes,6,252.0,2.0,4.0,6.0


In [521]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125690 entries, 0 to 125689
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   brand         125690 non-null  object 
 1   model         125690 non-null  object 
 2   milage        125690 non-null  int64  
 3   fuel_type     125690 non-null  object 
 4   transmission  125690 non-null  object 
 5   ext_col       125467 non-null  object 
 6   int_col       122737 non-null  object 
 7   accident      125690 non-null  object 
 8   clean_title   125690 non-null  object 
 9   age           125690 non-null  int64  
 10  hp            125690 non-null  float64
 11  engine_liter  125690 non-null  float64
 12  cylinder      125690 non-null  float64
 13  num_gear      125690 non-null  float64
dtypes: float64(4), int64(2), object(8)
memory usage: 13.4+ MB


## Preprocessing

In [522]:
test.drop(columns=['num_gear'], inplace=True)

In [523]:
pre_scale_data = test[test.select_dtypes(['float', 'int']).columns]

pre_scale_data

Unnamed: 0,milage,age,hp,engine_liter,cylinder
0,98000,9,240.0,2.0,4.0
1,9142,4,395.0,3.0,6.0
2,28121,2,328.0,3.5,6.0
3,61258,8,252.0,3.5,6.0
4,59000,6,252.0,2.0,4.0
...,...,...,...,...,...
125685,83315,10,362.0,3.0,6.0
125686,29336,5,335.0,3.5,6.0
125687,77634,12,333.0,3.0,6.0
125688,112000,12,333.0,3.0,6.0


In [524]:
scaler = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\scaler.pickle", "rb"))

In [525]:
scaled_data = pd.DataFrame(scaler.fit_transform(pre_scale_data), columns=pre_scale_data.columns)

scaled_data

Unnamed: 0,milage,age,hp,engine_liter,cylinder
0,0.636302,0.140564,-0.960290,-1.361337,-1.636428
1,-1.132944,-0.740684,0.502172,-0.597274,-0.212620
2,-0.755054,-1.093183,-0.129989,-0.215243,-0.212620
3,-0.095265,-0.035686,-0.847067,-0.215243,-0.212620
4,-0.140224,-0.388185,-0.847067,-1.361337,-1.636428
...,...,...,...,...,...
125685,0.343910,0.316813,0.190809,-0.597274,-0.212620
125686,-0.730862,-0.564435,-0.063942,-0.215243,-0.212620
125687,0.230796,0.669312,-0.082813,-0.597274,-0.212620
125688,0.915055,0.669312,-0.082813,-0.597274,-0.212620


In [526]:
test['accident'] =  test['accident'].map(lambda x: 1 if x=='At least 1 accident or damage reported' else 0)
test['clean_title'] =  test['clean_title'].map(lambda x: 1 if x=='Yes' else 0)

In [527]:
test.drop(columns=['ext_col', 'int_col', 'model'], inplace=True)

In [528]:
pre_enc_data = test[test.select_dtypes(['object']).columns]

pre_enc_data

Unnamed: 0,brand,fuel_type,transmission
0,Land Rover,Gasoline,Automatic
1,Land Rover,Hybrid,Automatic
2,Ford,Gasoline,Automatic
3,Audi,Gasoline,Automatic
4,Audi,Gasoline,Automatic
...,...,...,...
125685,Mercedes-Benz,Gasoline,Automatic
125686,Audi,Gasoline,Automatic
125687,Audi,Gasoline,Automatic
125688,Audi,Gasoline,Automatic


In [529]:
encoded_data = pd.get_dummies(pre_enc_data, dtype='int')

encoded_data.head()

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,transmission_Automated Manual,transmission_Automatic,transmission_Automatic with Auto-Shift,transmission_Automatic with Overdrive,transmission_CVT,transmission_DCT Automatic,transmission_Manual,transmission_Single-Speed Fixed Gear,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [530]:
test = pd.concat([encoded_data, scaled_data], axis=1)

test.head()

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,transmission_DCT Automatic,transmission_Manual,transmission_Single-Speed Fixed Gear,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode,milage,age,hp,engine_liter,cylinder
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.636302,0.140564,-0.96029,-1.361337,-1.636428
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.132944,-0.740684,0.502172,-0.597274,-0.21262
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-0.755054,-1.093183,-0.129989,-0.215243,-0.21262
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.095265,-0.035686,-0.847067,-0.215243,-0.21262
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.140224,-0.388185,-0.847067,-1.361337,-1.636428


In [531]:
features = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\slected_fetures.pickle", "rb"))

features

Index(['brand_Acura', 'brand_Alfa', 'brand_Aston', 'brand_Audi', 'brand_BMW',
       'brand_Bentley', 'brand_Bugatti', 'brand_Buick', 'brand_Cadillac',
       'brand_Chevrolet', 'brand_Chrysler', 'brand_Dodge', 'brand_FIAT',
       'brand_Ferrari', 'brand_Ford', 'brand_GMC', 'brand_Genesis',
       'brand_Honda', 'brand_Hummer', 'brand_Hyundai', 'brand_INFINITI',
       'brand_Jaguar', 'brand_Jeep', 'brand_Karma', 'brand_Kia',
       'brand_Lamborghini', 'brand_Land Rover', 'brand_Lexus', 'brand_Lincoln',
       'brand_Lotus', 'brand_Lucid', 'brand_MINI', 'brand_Maserati',
       'brand_Maybach', 'brand_Mazda', 'brand_McLaren', 'brand_Mercedes-Benz',
       'brand_Mercury', 'brand_Mitsubishi', 'brand_Nissan', 'brand_Plymouth',
       'brand_Polestar', 'brand_Pontiac', 'brand_Porsche', 'brand_RAM',
       'brand_Rivian', 'brand_Rolls-Royce', 'brand_Saab', 'brand_Saturn',
       'brand_Scion', 'brand_Subaru', 'brand_Suzuki', 'brand_Tesla',
       'brand_Toyota', 'brand_Volkswagen', 'bran

In [532]:
len(features)

79

In [533]:
len(test.columns)

77

In [534]:
missing_features = set(features) - set(test.columns)

for ft in missing_features:
    if len(ft) == None:
        print("None missing!")
    elif len(ft) != None:
        test[ft] = 0

In [535]:
reordered_test = test[features]

reordered_test

Unnamed: 0,brand_Acura,brand_Alfa,brand_Aston,brand_Audi,brand_BMW,brand_Bentley,brand_Bugatti,brand_Buick,brand_Cadillac,brand_Chevrolet,...,transmission_DCT Automatic,transmission_Manual,transmission_Single-Speed Fixed Gear,transmission_Transmission Overdrive Switch,transmission_Transmission w/Dual Shift Mode,milage,age,hp,engine_liter,cylinder
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.636302,0.140564,-0.960290,-1.361337,-1.636428
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-1.132944,-0.740684,0.502172,-0.597274,-0.212620
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,-0.755054,-1.093183,-0.129989,-0.215243,-0.212620
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.095265,-0.035686,-0.847067,-0.215243,-0.212620
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.140224,-0.388185,-0.847067,-1.361337,-1.636428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125685,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.343910,0.316813,0.190809,-0.597274,-0.212620
125686,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.730862,-0.564435,-0.063942,-0.215243,-0.212620
125687,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0.230796,0.669312,-0.082813,-0.597274,-0.212620
125688,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0.915055,0.669312,-0.082813,-0.597274,-0.212620


In [536]:
model = pickle.load(open(r"C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression\Regression_Used_Car_Prices\xgboost_model.pickle", "rb"))

In [537]:
prediction = np.exp(model.predict(reordered_test))

In [538]:
prediction

array([16304.911, 60385.188, 49957.73 , ..., 18680.129, 13208.69 ,
       34273.55 ], dtype=float32)

In [539]:
submission = pd.concat([id, pd.Series(prediction)], axis=1)
submission.columns = ['id', 'price']

submission

Unnamed: 0,id,price
0,188533,16304.911133
1,188534,60385.187500
2,188535,49957.730469
3,188536,24598.933594
4,188537,28798.095703
...,...,...
125685,314218,21256.976562
125686,314219,41316.406250
125687,314220,18680.128906
125688,314221,13208.690430


In [540]:
timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

In [541]:
folder_path = r'C:\Users\nene0\Desktop\Projects\kaggle_Used_Car_Regression'
file_path = os.path.join(folder_path, f'submission_{timestamp}.csv')
submission.to_csv(file_path, index=False)