In [1]:
!pip install ray==2.10.0
!pip install autogluon.tabular
!pip install -U ipywidgets

Collecting ray==2.10.0
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (13 kB)
Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl (65.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.1/65.1 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ray
  Attempting uninstall: ray
    Found existing installation: ray 2.24.0
    Uninstalling ray-2.24.0:
      Successfully uninstalled ray-2.24.0
Successfully installed ray-2.10.0
Collecting autogluon.tabular
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting scipy<1.13,>=1.5.4 (from autogluon.tabular)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m943.4 kB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn<1.4.1,>=1.3.0 (from autogluon.tabular)
  Downloading scikit_learn-1.4.0-

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from autogluon.tabular import TabularDataset, TabularPredictor

import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv').drop('id', axis=1)
main = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv').drop('id', axis=1)

In [4]:
main['price'] = main['price'].str.replace('$', '').str.replace(',', '').astype(float)
main['milage'] = main['milage'].str.replace('mi.', '').str.replace(',', '').astype(float)

total = pd.concat([train, main], axis=0)

In [5]:
total.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000.0,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200.0
1,Lincoln,LS V8,2002,143250.0,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999.0
2,Chevrolet,Silverado 2500 LT,2002,136731.0,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900.0
3,Genesis,G90 5.0 Ultimate,2017,19500.0,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000.0
4,Mercedes-Benz,Metris Base,2021,7388.0,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500.0


In [6]:
test.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [7]:
def feature_engineering(df):

    # Feature 1: Age of the vehicle
    df['age'] = 2024 - df['model_year']

    # Feature 2: Mileage per year
    df['milage_per_year'] = df['milage'] / df['age']
    inf_mask = df['milage_per_year'].replace([float('inf'), -float('inf')], np.nan).isna()
    df.loc[inf_mask, 'milage_per_year'] = df.loc[inf_mask, 'milage'] / (df.loc[inf_mask, 'age'] + 0.75)
    
    # Feature 3: Extracting horsepower (HP) from the engine column
    df['horsepower'] = df['engine'].str.extract(r'(\d+\.\d+)HP').astype(float)

    # Feature 4: Extracting engine displacement (L) from the engine column
    df['engine_displacement'] = df['engine'].str.extract(r'(\d+\.\d+)L').astype(float)

    # Feature 5: Extracting the number of cylinders from the engine column
    df['number_cylinders'] = df['engine'].str.extract(r'(\d+ Cylinder)').astype(str)
    df['number_cylinders'] = df['number_cylinders'].replace('nan', np.nan)
    df['number_cylinders'] = df['number_cylinders'].str.extract(r'(\d+)').astype(float)

    # Feature 6: Extracting the type of fuel from the engine column
    df['engine_fuel_type'] = df['engine'].str.extract(r'Engine (.+ Fuel)').astype(str)
    df['engine_fuel_type'] = df['engine_fuel_type'].replace('nan', np.nan)
    
    df = df.drop(['engine', 'model_year'], axis=1)
    return df

In [8]:
def null_handling(df):
        
    df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())
    df['engine_displacement'] = df['engine_displacement'].fillna(df['engine_displacement'].mean())
    df['number_cylinders'] = df['number_cylinders'].fillna(df['number_cylinders'].mode()[0])
    
    df['fuel_type'] = df['fuel_type'].fillna('missing')
    df['accident'] = df['accident'].fillna('missing')
    df['engine_fuel_type'] = df['engine_fuel_type'].fillna('missing')
    
#     df['fuel_type'] = df['fuel_type'].replace('–', df['fuel_type'].mode()[0])
#     df['transmission'] = df['transmission'].replace('–', df['fuel_type'].mode()[0])
    df['clean_title'] = df['clean_title'].fillna('No')
    
    return df

In [9]:
def encoding(df_train, df_test):
    # Encode interior color based on frequency
    int_color_freq = df_train['int_col'].value_counts(normalize=True)
    df_train['int_col'] = df_train['int_col'].map(int_color_freq)
    df_test['int_col'] = df_test['int_col'].map(int_color_freq)
    
    # Encode exterior color based on frequency
    ext_color_freq = df_train['ext_col'].value_counts(normalize=True)
    df_train['ext_col'] = df_train['ext_col'].map(ext_color_freq)
    df_test['ext_col'] = df_test['ext_col'].map(ext_color_freq)
    
    # Encode models based on frequency
    model_freq = df_train['model'].value_counts(normalize=True)
    df_train['model'] = df_train['model'].map(model_freq)
    df_test['model'] = df_test['model'].map(model_freq)
    
    # Encode brands based on frequency
    brand_freq = df_train['brand'].value_counts(normalize=True)
    df_train['brand'] = df_train['brand'].map(brand_freq)
    df_test['brand'] = df_test['brand'].map(brand_freq)
    
    # Extract transmission type from the transmission column
    def extract_transmission_type(transmission):
        if 'Automatic' in transmission:
            return 'Automatic'
        elif 'Manual' in transmission:
            return 'Manual'
        elif 'CVT' in transmission:
            return 'CVT'
        elif 'DCT' in transmission:
            return 'DCT'
        elif 'Fixed Gear' in transmission:
            return 'Fixed Gear'
        elif 'Variable' in transmission:
            return 'Variable'
        elif 'Single-Speed' in transmission:
            return 'Single-Speed'
        else:
            return 'None'

    # Apply transmission type extraction to both train and test sets
    df_train['transmission_type'] = df_train['transmission'].apply(extract_transmission_type)
    df_test['transmission_type'] = df_test['transmission'].apply(extract_transmission_type)

    # Extract speed count from the transmission column
    def extract_speed_count(transmission):
        for speed in ['1', '2', '4', '5', '6', '7', '8', '9', '10']:
            if f'{speed}-Speed' in transmission:
                return speed
        return 'None'

    # Apply speed count extraction to both train and test sets
    df_train['speed_count'] = df_train['transmission'].apply(extract_speed_count)
    df_test['speed_count'] = df_test['transmission'].apply(extract_speed_count)

    # Encode transmission type using LabelEncoder
    type_encoder = LabelEncoder()
    df_train['transmission_type'] = type_encoder.fit_transform(df_train['transmission_type'])
    df_test['transmission_type'] = type_encoder.transform(df_test['transmission_type'])
    
    # Encode speed count using LabelEncoder
    speed_encoder = LabelEncoder()
    df_train['speed_count'] = speed_encoder.fit_transform(df_train['speed_count'])
    df_test['speed_count'] = speed_encoder.transform(df_test['speed_count'])
    
    # Drop the original transmission column
    df_train = df_train.drop(['transmission'], axis=1)
    df_test = df_test.drop(['transmission'], axis=1)
    
    # One-hot encode remaining categorical features
    one_hot = OneHotEncoder(drop='first', sparse_output=False)
    cat_cols = [col for col in df_train.columns if df_train[col].dtype == 'object']

    # Fit and transform the train set, transform the test set
    encoded_train = one_hot.fit_transform(df_train[cat_cols])
    encoded_test = one_hot.transform(df_test[cat_cols])

    # Convert the encoded arrays to DataFrames with proper column names
    encoded_train_df = pd.DataFrame(encoded_train, 
                                    columns=one_hot.get_feature_names_out(cat_cols), 
                                    index=df_train.index)
    encoded_test_df = pd.DataFrame(encoded_test, 
                                   columns=one_hot.get_feature_names_out(cat_cols), 
                                   index=df_test.index)

    # Drop original categorical columns and concatenate the encoded columns
    df_train = df_train.drop(cat_cols, axis=1)
    df_train = pd.concat([df_train, encoded_train_df], axis=1)

    df_test = df_test.drop(cat_cols, axis=1)
    df_test = pd.concat([df_test, encoded_test_df], axis=1)
    
    return df_train, df_test

In [10]:
total = feature_engineering(total)
total = null_handling(total)

test = feature_engineering(test)
test = null_handling(test)

total, test = encoding(total, test)

In [11]:
predictor = TabularPredictor(
                            label='price', 
                            eval_metric ='rmse',
                            problem_type="regression").fit(total, 
                                                           presets='best_quality',
                                                           time_limit=3600*5,
                                                           verbosity=0,
                                                           excluded_model_types=['KNN', 'NN_TORCH'],
                                                           ag_args_fit={'num_gpus': 1})
results = predictor.fit_summary()

No path specified. Models will be saved in: "AutogluonModels/ag-20240905_200432"
2024-09-05 20:04:36,018	INFO worker.py:1752 -- Started a local Ray instance.
[36m(_ray_fit pid=471)[0m 	Training S1F2 with GPU, note that this may negatively impact model quality compared to CPU training.
[36m(_ray_fit pid=470)[0m 	Training S1F1 with GPU, note that this may negatively impact model quality compared to CPU training.
[36m(_ray_fit pid=541)[0m 	Training S1F4 with GPU, note that this may negatively impact model quality compared to CPU training.
[36m(_ray_fit pid=610)[0m 	Training S1F5 with GPU, note that this may negatively impact model quality compared to CPU training.[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=680)[0m 	Training S1F7 with GPU, note that this may negatively impact model quality compared to CPU training.[32m [repeated 2x across cluster][0m
[36m(_ray_fit pid=715)[0m 	Training S1F8 with GPU, note that this may negatively impact model quality compared to 

*** Summary of fit() ***
Estimated performance of each model:
                          model     score_val              eval_metric  pred_time_val      fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           WeightedEnsemble_L3 -72303.632284  root_mean_squared_error     278.316629  10167.588415                0.002995           0.667951            3       True         53
1        NeuralNetFastAI_BAG_L2 -72352.391728  root_mean_squared_error     258.165104   9724.602528                3.461097        1007.269429            2       True         46
2           WeightedEnsemble_L2 -72443.123849  root_mean_squared_error      71.933454    744.880340                0.003415           0.690237            2       True         40
3             LightGBMXT_BAG_L2 -72488.179841  root_mean_squared_error     256.148223   8758.280603                1.444216          40.947503            2       True         41
4               CatBoost_BAG_L2 -72511.062070  r

In [12]:
predictor.leaderboard()

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-72303.632284,root_mean_squared_error,278.316629,10167.588415,0.002995,0.667951,3,True,53
1,NeuralNetFastAI_BAG_L2,-72352.391728,root_mean_squared_error,258.165104,9724.602528,3.461097,1007.269429,2,True,46
2,WeightedEnsemble_L2,-72443.123849,root_mean_squared_error,71.933454,744.88034,0.003415,0.690237,2,True,40
3,LightGBMXT_BAG_L2,-72488.179841,root_mean_squared_error,256.148223,8758.280603,1.444216,40.947503,2,True,41
4,CatBoost_BAG_L2,-72511.06207,root_mean_squared_error,254.913498,8751.373427,0.209491,34.040328,2,True,44
5,CatBoost_r177_BAG_L2,-72517.248228,root_mean_squared_error,254.921645,8748.413659,0.217638,31.08056,2,True,49
6,LightGBM_r130_BAG_L1,-72606.281582,root_mean_squared_error,1.677533,35.496591,1.677533,35.496591,1,True,23
7,XGBoost_r98_BAG_L1,-72630.840232,root_mean_squared_error,0.346014,41.163084,0.346014,41.163084,1,True,38
8,LightGBM_r131_BAG_L2,-72654.138262,root_mean_squared_error,257.745216,8780.650135,3.041209,63.317035,2,True,50
9,LightGBM_r96_BAG_L1,-72655.608282,root_mean_squared_error,47.997775,222.654808,47.997775,222.654808,1,True,13


In [13]:
prediction = predictor.predict(test)
sub = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')

sub['price'] = prediction
sub.to_csv('submission.csv', index=False)