---
# ˚Used Car Pricing Model: Regression Analysis˚
---

---
# ˚ ೀ⋆｡ ˚About Data˚ ೀ⋆｡ ˚
---

### Column Description:
- **Brand & Model**: Identify the brand or company name along with the specific model of each vehicle.
- **Model Year**: Discover the manufacturing year of the vehicles, crucial for assessing depreciation and technology advancements.
- **Mileage**: Obtain the mileage of each vehicle, a key indicator of wear and tear and potential maintenance requirements.
- **Fuel Type**: Learn about the type of fuel the vehicles run on, whether it's gasoline, diesel, electric, or hybrid.
- **Engine Type**: Understand the engine specifications, shedding light on performance and efficiency.
- **Transmission**: Determine the transmission type, whether automatic, manual, or another variant.
- **Exterior & Interior Colors**: Explore the aesthetic aspects of the vehicles, including exterior and interior color options.
- **Accident History**: Discover whether a vehicle has a prior history of accidents or damage, crucial for informed decision-making.
- **Clean Title**: Evaluate the availability of a clean title, which can impact the vehicle's resale value and legal status.
- **Price**: Access the listed prices for each vehicle, aiding in price comparison and budgeting.


---
# ೀ⋆｡ ˚ Import Dependencies ˚ ೀ⋆｡ ˚
---

In [1]:
import pandas as pd 
import optuna
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns 
import re
import math
from io import StringIO
from colorama import Fore, Style, init;
from IPython.display import display, HTML
from scipy.stats import skew  
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.model_selection import KFold, RepeatedStratifiedKFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler , StandardScaler , QuantileTransformer, PowerTransformer
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import *
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
pd.set_option('display.max_columns', None)

---
# ˚ ೀ⋆｡ ˚ Load Data and Preprocessing˚ ೀ⋆｡ ˚
---

In [2]:
tr_ds = pd.read_csv('/kaggle/input/playground-series-s4e9/train.csv')
te_ds = pd.read_csv('/kaggle/input/playground-series-s4e9/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s4e9/sample_submission.csv')
orignal = pd.read_csv('/kaggle/input/used-car-price-prediction-dataset/used_cars.csv')

orignal[['milage', 'price']] = orignal[['milage', 'price']].map(
    lambda x: int(''.join(re.findall(r'\d+', x))))

if 'id' in tr_ds.columns:
    tr_ds.drop(columns=['id'], inplace=True)
if 'id' in te_ds.columns:
    te_ds.drop(columns=['id'], inplace=True)

tr_ds = pd.concat([tr_ds, orignal], ignore_index=True)

In [3]:
def clean_data(df, threshold=100):
    """Update categorical columns by replacing low-frequency values with 'noise' 
    and filling missing values."""
    
    cat_columns = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 
                   'ext_col', 'int_col', 'accident', 'clean_title']
    
    for col in cat_columns:
        value_counts = df[col].value_counts(dropna=False)
        low_freq_mask = df[col].map(value_counts) < threshold
        df.loc[low_freq_mask, col] = 'noise'
        
        df[col] = df[col].fillna('missing').astype('category')
    
    return df

tr_ds = clean_data(tr_ds)
te_ds = clean_data(te_ds)

In [4]:
tr_ds.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,Lincoln,noise,2002,143250,Gasoline,noise,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,Chevrolet,noise,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,Genesis,noise,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


---
# ˚ ೀ⋆｡ ˚Model Building˚ ೀ⋆｡ ˚
---


In [5]:
# =====================================
# | X | Y | of | Train | Data: |
# =====================================
X = tr_ds.drop('price', axis=1)
y = tr_ds['price']

# =====================================
# | Train | Test | Split |  | Data: |
# =====================================
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
SEED = 42
N_SPLITS = 10

def train_ml_model(model, X, y, n_splits=N_SPLITS, seed=SEED):
    """
    Trains a given model using K-Folds cross-validation and calculates the RMSE scores.

    Parameters:
    model: The machine learning model to train.
    X: The feature matrix.
    y: The target vector.
    n_splits: The number of splits for cross-validation.
    seed: The random seed for reproducibility.

    Returns:
    model: The trained model.
    train_scores: List of RMSE scores for the training sets.
    val_scores: List of RMSE scores for the validation sets.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    train_scores = []
    val_scores = []

    for fold, (train_index, val_index) in enumerate(kf.split(X), 1):
        # Splitting data into training and validation sets
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fitting the model
        model.fit(X_train, y_train)

        # Predicting on the training and validation sets
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        # Calculating RMSE scores
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

        # Storing the scores
        train_scores.append(train_rmse)
        val_scores.append(val_rmse)

        # Printing the scores for the current fold
        print(f"Fold {fold}: Train RMSE = {train_rmse:.4f}, Validation RMSE = {val_rmse:.4f}")

    # Calculating mean RMSE scores
    mean_train_rmse = np.mean(train_scores)
    mean_val_rmse = np.mean(val_scores)

    # Printing mean RMSE scores
    print(f"\nMean Train RMSE: {mean_train_rmse:.4f}")
    print(f"Mean Validation RMSE: {mean_val_rmse:.4f}")

    return model, train_scores, val_scores

# Define LightGBM parameters 
lgb_params = {
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'n_estimators': 500,
    'max_depth': 10,
    'min_child_samples': 20,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'random_state': SEED,
    'verbose': -1
}

model = LGBMRegressor(**lgb_params)

# Make sure X and y are defined as your feature matrix and target vector respectively
trained_model, train_scores, val_scores = train_ml_model(model, X, y)


Fold 1: Train RMSE = 68859.5327, Validation RMSE = 72031.1707
Fold 2: Train RMSE = 68011.7246, Validation RMSE = 78438.5764
Fold 3: Train RMSE = 68673.0918, Validation RMSE = 74178.3429
Fold 4: Train RMSE = 69694.8765, Validation RMSE = 61764.7443
Fold 5: Train RMSE = 68900.3612, Validation RMSE = 70587.4243
Fold 6: Train RMSE = 69144.3967, Validation RMSE = 68841.5830
Fold 7: Train RMSE = 68186.3724, Validation RMSE = 78724.2321
Fold 8: Train RMSE = 68706.9628, Validation RMSE = 71198.6221
Fold 9: Train RMSE = 68971.3541, Validation RMSE = 70444.4920
Fold 10: Train RMSE = 67872.0273, Validation RMSE = 79628.6822

Mean Train RMSE: 68702.0700
Mean Validation RMSE: 72583.7870


---
# ˚ ೀ⋆｡ ˚Submission˚ ೀ⋆｡ ˚
---

In [7]:
pred = model.predict(te_ds)

Sub = pd.DataFrame({
    'id': submission.id,
    'price': pred
})

Sub.to_csv('Submission_4.csv', index=False)