In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Initial seaborn style for the notebook; sns.set_theme(...) is called again before the numeric plots.
sns.set_style("whitegrid")


# Load Data

In [None]:
# Read the three CSV files from the working directory.
# df       -> training rows (contains the 'price' target used below)
# df_test  -> rows to predict for the submission
# df_final -> submission template; its 'price' column is overwritten at the end of the notebook
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_final = pd.read_csv('sample_submission.csv')


# EDA
1. Dataset overview

2. Univariate analysis - Numeric Features
    - Distribution plots (hist and box)
    - Describe
    - Outliers 

3. Univariate analysis - Categorical Features
    - Most common values with percentages
    - Countplot with labels
    - Unique values 

4. Bivariate/Multivariate analysis
    - Numeric-numeric : Correlation matrix, scatterplots (for high correlation), pairplots for smaller features
    - Categorical-categorical : Crosstabs, grouped countplots
    - Numerical-categorical: Boxplot/violin plots

5. Data quality checks
    - Missing values
    - Duplicated values
    - Inconsistencies
    - Feature engineering recommendations


# Dataset Overview
    - shape
    - info
    - missing values
    - duplicated values
    - head, tail, sample

In [None]:
# Report the dimensions of the training frame.
n_rows, n_columns = df.shape
print(f"Rows: {n_rows}")
print(f"Columns: {n_columns}")

In [None]:
# Column names, dtypes and non-null counts for the training frame.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Flag every row that has at least one exact duplicate (all copies are marked).
dup_mask = df.duplicated(keep=False)
print(f"Duplicated values : {dup_mask.sum()}")
if dup_mask.any():
    # reset_index() keeps the original row labels as an 'index' column.
    display(df[dup_mask].reset_index())

In [None]:
# Show three rows from the start, the end, and a random sample of the frame.
previews = (
    ("Head:", df.head(3)),
    ("Tail:", df.tail(3)),
    ("Sample:", df.sample(3)),
)
for label, rows in previews:
    print(label)
    display(rows)

In [None]:
# Split the columns by dtype.
# 'number' covers every numeric dtype (int32, int64, float64, ...); selecting only
# 'int64' would silently drop any numeric column stored as float from the analysis.
num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(include=['object']).columns
print(f"Numerical Datatypes: {num_cols}")
print(f"Categorical Datatypes: {cat_cols}")

# Observations
1. There are 188533 rows of data and 13 columns
2. There is one id column, and 2 numerical columns
3. The target column is numeric


# Univariate analysis - Numeric Features
    - Distribution plots (hist and box)
    - Describe
    - Outliers 

In [None]:
# set_theme() resets the palette to its own default, so the palette must be passed
# to it directly; calling set_palette() first has no lasting effect.
sns.set_theme(style="darkgrid", palette="pastel")

In [None]:
def num_analysis(df, col, whisker=1.5):
    """Print a univariate report for one numeric column.

    The report contains a boxplot and a histogram (with KDE), the output of
    ``describe()``, and the rows that fall outside the IQR fences.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to analyse.
    col : str
        Name of a numeric column in ``df``.
    whisker : float, default 1.5
        Multiple of the IQR used for the outlier fences:
        ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``.

    Returns
    -------
    None
        Everything is printed or displayed.
    """
    print(f"****************************** {col} analysis ******************************")

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))

    axs[0].set_title(f"{col} boxplot")
    axs[0].tick_params(axis='x', rotation=45)
    sns.boxplot(data=df, x=col, ax=axs[0])

    axs[1].set_title(f"{col} histplot")
    axs[1].tick_params(axis='x', rotation=45)
    sns.histplot(data=df, x=col, ax=axs[1], kde=True)

    plt.tight_layout()
    plt.show()

    print(f"********************  {col} values description  ********************")
    display(df[col].describe().to_frame().style.background_gradient(cmap='cool'))

    print(f"********************  {col} outliers  ********************")

    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    iqr = Q3 - Q1

    # Outliers lie beyond the fences, not merely outside the middle 50% of the data;
    # comparing against Q1/Q3 directly would flag roughly half of all rows.
    lower_fence = Q1 - whisker * iqr
    upper_fence = Q3 + whisker * iqr

    print(f"IQR : {iqr}")
    print(f"Lower fence : {lower_fence}")
    print(f"Upper fence : {upper_fence}")

    upper_outliers = df[df[col] > upper_fence]
    lower_outliers = df[df[col] < lower_fence]

    if len(upper_outliers) > 0:
        print("****** Upper Outliers ******")
        print(f"Upper outlier count: {len(upper_outliers)}")
        display(upper_outliers.head(3))

    if len(lower_outliers) > 0:
        print("****** Lower Outliers ******")
        print(f"Lower outlier count: {len(lower_outliers)}")
        display(lower_outliers.head(3))

    # Four blank lines separate consecutive column reports.
    print("\n" * 3)



In [None]:
# Exclude the 'id' column from the numeric analysis.
num_cols_for_analysis = num_cols.drop('id')

In [None]:
# Produce the numeric report for every selected column of the training frame.
for col in num_cols_for_analysis:
    num_analysis(df,col)

# Univariate analysis - Categorical Features
- Value counts
- Unique values
- Bar chart

In [None]:
def categorical_analysis(df, col):
    """Print a univariate report for one categorical column.

    Columns with fewer than 10 distinct values get a distribution plot;
    columns with more get a table of their five most frequent values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to analyse.
    col : str
        Name of a categorical column in ``df``.

    Returns
    -------
    None
        Everything is printed or displayed.
    """
    print(f"****************************** {col} analysis ******************************")

    n_unique = df[col].nunique()
    print(f"Number of Unique Values: {n_unique}")

    if n_unique < 10:
        # plt.figure (lowercase) registers the figure with pyplot so the figsize applies;
        # plt.Figure only constructs a detached object that the plot never uses.
        plt.figure(figsize=(10, 5))
        plt.xticks(rotation=45)
        plt.title("Value Distribution")
        sns.histplot(data=df, x=col)
        plt.tight_layout()
        plt.show()
    else:
        print(f"Top values for {col}")
        display(df[col].value_counts().reset_index().head(5))

    # Two blank lines separate consecutive column reports.
    print("\n")

In [None]:
# Produce the categorical report for every object-typed column of the training frame.
for col in cat_cols:
    categorical_analysis(df,col)

# Observations 
- Clean_title can be removed as there is only one value 
- There are many unique values for engine, transmission, interior/exterior colours, model and brand

In [None]:
# List the column names as a reference for the questions below.
df.columns

# Multivariate Analysis
1. Which brands have the highest average price?
2. Do interior and exterior colour affect the price?
3. How does fuel type correlate with price?
4. How does a vehicle in an accident affect the price?
5. Does the mileage affect the price?
6. Do specific brands tend to create cars of a particular fuel type?
7. How does the age of the car affect the price?


In [None]:
# Which brands have the highest average price?

# nlargest(10) picks the ten brands with the highest mean price; taking head(10)
# of the grouped result would return the first ten brands alphabetically instead.
df_brands = (
    df.groupby(by='brand')['price']
    .mean()
    .nlargest(10)
    .sort_values()
    .reset_index()
)

# plt.figure (lowercase) creates the active figure so the figsize is applied.
plt.figure(figsize=(10,10))
plt.title("Most Expensive Brands")
sns.barplot(data=df_brands,x='brand',y='price')
# Rotate after plotting so the rotation applies to the final brand labels.
plt.xticks(rotation = 45)
plt.tight_layout()
plt.show()

In [None]:
# Do interior or exterior colours affect the price?

# Restrict the plot to the ten most frequent interior colours.
top_int_colors = df['int_col'].value_counts().nlargest(10).index
int_col_filtered = df[df['int_col'].isin(top_int_colors)]

# plt.figure (lowercase) creates the active figure; plt.Figure would build a
# detached object and the requested size would be ignored.
plt.figure(figsize=(12,6))
sns.boxplot(data=int_col_filtered,y='int_col',x='price')
plt.title('Interior Colour vs Price')
plt.tight_layout()
plt.show()

In [None]:
# Restrict the plot to the ten most frequent exterior colours.
top_ext_colors = df['ext_col'].value_counts().nlargest(10).index
# Filter with the exterior list; the interior list would select the wrong rows.
ext_col_filtered = df[df['ext_col'].isin(top_ext_colors)]

# plt.figure (lowercase) creates the active figure so the figsize is applied.
plt.figure(figsize=(12,6))
sns.boxplot(data=ext_col_filtered,y='ext_col',x='price')
plt.title('Exterior Colour vs Price')
plt.tight_layout()
plt.show()

In [None]:
# Interior colours with the highest mean price (top 10); 'count' shows how many rows back each mean.
display(df.groupby('int_col')['price'].agg(['mean','count']).sort_values(by='mean',ascending=False).head(10).reset_index())

In [None]:
# Exterior colours with the highest mean price (top 10); 'count' shows how many rows back each mean.
display(df.groupby('ext_col')['price'].agg(['mean','count']).sort_values(by='mean',ascending=False).head(10).reset_index())

In [None]:
# How does fuel type correlate with price?

# Mean price per fuel type, ordered from most to least expensive.
mean_price_by_fuel = df.groupby('fuel_type')['price'].mean()
df_fuel = mean_price_by_fuel.sort_values(ascending=False).reset_index()

fig = plt.figure(figsize=(5, 5))
plt.title("Average Price of Car by fuel-type")
plt.xticks(rotation=45)
sns.barplot(
    data=df_fuel,
    x='fuel_type',
    y='price',
    hue='fuel_type',
)
plt.tight_layout()
plt.show()

In [None]:
# Price distribution per fuel type as horizontal boxplots.
fig = plt.figure(figsize=(15,5))
sns.boxplot(data=df,x='price',y='fuel_type')
plt.tight_layout()
plt.show()

In [None]:
# How does a vehicle in an accident affect the price?

fig = plt.figure(figsize=(15,3))
# The plot shows car prices split by accident history, so the title says "car", not "fuel".
plt.title("Price of car by accident history")
sns.boxplot(data=df,x='price',hue='accident')
plt.tight_layout()
plt.show()



In [None]:
# Mean price for each value of the 'accident' column.
df_accident = df.groupby('accident')['price'].mean().reset_index()
display(df_accident)

In [None]:
# Does the mileage affect the price?

# Correlation matrix of price and milage (pandas default method), shown as an annotated heatmap.
df_corr = df[['price','milage']].corr()

sns.heatmap(df_corr,annot=True)

In [None]:
# How does the age of the car affect the price?

# Adds an 'age' column to df in place: years between model_year and the hard-coded reference year 2025.
df['age'] = 2025 - df['model_year'] 

In [None]:
# Correlation matrix of age and price, shown as an annotated heatmap.
df_corr_n = df[['age','price']].corr()
sns.heatmap(df_corr_n,annot=True)

In [None]:
# Scatter of price against age for every training row.
sns.scatterplot(data=df,x='age',y='price')

In [None]:
# Column list again; it now includes the 'age' column added above.
df.columns

# Preprocessor

- remove clean_title
- create 'age' feature
- one hot encoding for fuel type, accident
- target encoding for ext_col, int_col, transmission, brand, model, engine
- drop id and price

In [None]:
import category_encoders as ce 
from sklearn.model_selection import train_test_split

In [None]:
# Categorical columns that are passed to the target encoder below.
cols_for_encoding = ['ext_col','int_col','transmission','brand','model','engine']

In [None]:
def preprocessor(df, reference_year=2025):
    """Build the model feature frame (and target, when present) from raw rows.

    Steps: separate the 'price' target, add an 'age' feature, one-hot encode
    'fuel_type' and 'accident', and drop the 'id' and 'clean_title' columns.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows containing at least 'model_year', 'fuel_type' and 'accident'.
        'price', 'id' and 'clean_title' are optional, so the same function can
        be applied to frames without a target.
    reference_year : int, default 2025
        Year from which 'model_year' is subtracted to obtain 'age'.

    Returns
    -------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.Series or None
        The 'price' column, or None when ``df`` has no 'price' column.
    """
    y = df['price'] if 'price' in df.columns else None

    # drop() returns a new frame, so adding 'age' below never touches the caller's df.
    X = df.drop(columns='price', errors='ignore')
    X['age'] = reference_year - X['model_year']
    X = pd.get_dummies(X, columns=['fuel_type', 'accident'])
    X = X.drop(columns=['id', 'clean_title'], errors='ignore')

    return X, y


In [None]:
# Build features and target from the training frame.
X,y = preprocessor(df)

# Hold out 30% of the rows for evaluation; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=43)

# Fit the target encoder on the training split only, then apply the fitted
# encoder to the held-out split.
encoder = ce.TargetEncoder(cols=cols_for_encoding)
X_train[cols_for_encoding] = encoder.fit_transform(X_train[cols_for_encoding], y_train)
X_test[cols_for_encoding] = encoder.transform(X_test[cols_for_encoding])

In [None]:
# Preview the feature frame returned by preprocessor (the encoder output was assigned to X_train / X_test, not to X).
X.head(3)

In [None]:
from catboost import CatBoostRegressor
import optuna as optuna
from sklearn.model_selection import cross_val_score


In [None]:
def objective(trial):
    """Optuna objective: mean 4-fold cross-validated RMSE of a CatBoost regressor.

    Samples ``iterations``, ``learning_rate``, ``l2_leaf_reg`` and ``depth``
    from the trial, fits the model with ``cross_val_score`` on the
    module-level ``X_train`` / ``y_train``, and returns the mean RMSE
    (lower is better).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Mean RMSE across the 4 folds.
    """
    params = {
        'loss_function': 'RMSE',
        'iterations': trial.suggest_int('iterations',300,600),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg',3,15),
        'depth': trial.suggest_int('depth',3,10),
        'verbose': 0,
        # NOTE(review): cross_val_score passes no eval_set to fit(), so early
        # stopping presumably never triggers here - confirm against CatBoost docs.
        'early_stopping_rounds':50,
        'eval_metric': 'RMSE',
        # Requires a GPU-enabled CatBoost build and a visible device.
        'task_type':'GPU'
    }

    model = CatBoostRegressor(**params)

    # Score with RMSE so the study values are on the same scale as the training
    # loss and the hold-out metric; 'neg_mean_squared_error' reports squared units.
    # NOTE(review): X_train was target-encoded using all of y_train before this
    # call, so each validation fold has already influenced its own encodings.
    fold_scores = cross_val_score(
        model, X_train, y_train, cv=4, scoring='neg_root_mean_squared_error'
    )

    # sklearn scorers are "higher is better", hence the sign flip.
    return -fold_scores.mean()



# Minimise the objective with a seeded random sampler.
study = optuna.create_study(direction='minimize',sampler = optuna.samplers.RandomSampler(seed=42))

# 200 trials; each trial runs a 4-fold cross-validation inside objective().
study.optimize(objective,n_trials=200)

In [None]:
# Objective value per trial over the course of the study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Estimated importance of each sampled hyperparameter.
optuna.visualization.plot_param_importances(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters against the objective.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Hyperparameters for the final model, hard-coded here.
# NOTE(review): presumably copied from an earlier study run - confirm against study.best_params.
params = {
    'iterations': 580,
    'learning_rate': 0.03673949509651876,
    'l2_leaf_reg': 11.359809886522708,
    'depth': 10,
}

In [None]:
# Show the hyperparameters used for the final fit.
print(params)

In [None]:
# Fit the final model on the encoded training split.
# Only the four values in params are passed; the fixed settings used inside
# objective() (verbose, task_type, ...) are not applied here.
model_cat = CatBoostRegressor(**params)
model_cat.fit(X_train,y_train)

In [None]:
# Predict prices for the held-out split.
y_preds = model_cat.predict(X_test)



In [None]:
from sklearn.metrics import root_mean_squared_error

In [None]:
def rmse(y_test, y_preds):
    """Return the root mean squared error between true and predicted values.

    Thin wrapper around sklearn's ``root_mean_squared_error``.
    """
    return root_mean_squared_error(y_test, y_preds)

In [None]:
# RMSE of the final model on the held-out split.
rmse(y_test,y_preds)

In [None]:
# Prepare the competition test set with the same steps used for the training features.
# Work on a copy so df_test stays untouched and this cell can be re-run safely.
X_submission = df_test.copy()

# Age must come from the test rows themselves; using the training frame's
# model_year would assign index-aligned training values to test rows.
X_submission['age'] = 2025 - X_submission['model_year']
X_submission = pd.get_dummies(X_submission,columns=['fuel_type','accident'])
X_submission = X_submission.drop(columns=['id','clean_title'])
X_submission[cols_for_encoding] = encoder.transform(X_submission[cols_for_encoding])

# Align with the columns the model was trained on: dummy columns absent from the
# test set are added as 0, and columns unseen in training are dropped.
X_submission = X_submission.reindex(columns=X_train.columns, fill_value=0)

y_final_preds = model_cat.predict(X_submission)


In [None]:
# Write the predictions into the submission template (assigned positionally as an array).
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv - confirm.
df_final['price'] = y_final_preds

In [None]:
# Save the submission file without the DataFrame index column.
df_final.to_csv('submission.csv',index=False)