## Train Data Loading and Overview

In [None]:
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dython.nominal import associations
from scipy import stats
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

# Load the raw training set and discard the 'id' column straight away
train_data = pd.read_csv('../data/raw/train.csv').drop(columns='id')

# Shape of the training data
print("Train data shape:", train_data.shape, sep="\n")
print(f"\n{'-' * 50}\n")

# Summary statistics of the numeric columns
print("Train data description:", train_data.describe(), sep="\n")
print(f"\n{'-' * 50}\n")

# Number of distinct values per column
print("Train data nunique:", train_data.nunique(), sep="\n")
print(f"\n{'-' * 50}\n")

# Column dtypes and non-null counts (info() writes to stdout itself)
print("Train data info:")
train_data.info()
print(f"\n{'-' * 50}\n")

# Share of missing values per column, in percent, rounded to two decimals
print("Missing percentage in train data:")
missing_percentage = train_data.isnull().sum().div(len(train_data)).mul(100).round(2)
print(missing_percentage)
print(f"\n{'-' * 50}\n")

def analyze_columns(train_data):
    """Print the ten most frequent values of every column.

    Missing values are counted as their own category (``dropna=False``).

    Args:
        train_data: DataFrame whose columns are summarised.

    Returns:
        None. The summaries are written to stdout.
    """
    for col in train_data.columns:
        # head() is always positional. Slicing the value_counts result with
        # [:10] can be interpreted as a label slice when the index holds
        # floats (depending on the pandas version), which fails or returns
        # the wrong rows.
        top_values = train_data[col].value_counts(dropna=False).head(10)
        print(f"{col} value_counts:\n{top_values}\n")

analyze_columns(train_data)

## Data Visualization

In [None]:
# Visualization 1: Distribution of Car Prices
# Histogram of 'price' with 50 bins and a KDE curve overlaid.
plt.figure(figsize=(10,6))
sns.histplot(train_data['price'], bins=50, kde=True, color='skyblue')
plt.title('Distribution of Car Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Visualization 2: Average Car Price by Model Year
# NOTE(review): this draws a box plot (price distribution per model year),
# not a mean; the title says "Average" - confirm which one is intended.
plt.figure(figsize=(10,6))
sns.boxplot(x='model_year', y='price', data=train_data, palette="Set2")
plt.title('Average Car Price by Model Year')
plt.xlabel('Model Year')
plt.ylabel('Price')
plt.xticks(rotation=45)
plt.show()

# Visualization 3: Count of Cars by Model Year
# One bar per model year showing the number of rows with that year.
plt.figure(figsize=(10,6))
sns.countplot(x='model_year', data=train_data, palette="viridis")
plt.title('Count of Cars by Model Year')
plt.xlabel('Model Year')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

# Visualization 4: Average Price by Car Brand
# Only the first 10,000 rows are plotted (train_data[:10000]); error bars
# are disabled via errorbar=None.
# NOTE(review): presumably the row limit is there for speed - confirm that
# the first 10,000 rows are representative of all brands.
plt.figure(figsize=(12, 6))
sns.barplot(x='brand', y='price', data=train_data[:10000], errorbar=None)
plt.title('Average Price by Car Brand')
plt.xlabel('Brand')
plt.ylabel('Average Price')
plt.xticks(rotation=90)  
plt.show()

# Visualization 5: Top 20 Most Common Engine Types
# Bar chart of the 20 most frequent 'engine' strings. The plotted series and
# the title both refer to the same top-20 selection.
top_20_engines = train_data['engine'].value_counts().head(20)
plt.figure(figsize=(12,8))
top_20_engines.plot(kind='bar', color='skyblue')
plt.title('Top 20 Most Common Engine Types')
plt.xlabel('Engine Type')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.show()

# Visualization 6: Distribution of Car Transmission Types (Pie Chart)
# Only the 10 most frequent transmission values are included, so the
# percentages are relative to those 10 categories, not to all rows.
plt.figure(figsize=(8,8))
transmission_counts = train_data['transmission'].value_counts().head(10)
transmission_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=sns.color_palette("Set3", len(transmission_counts)))
plt.title('Distribution of Car Transmission Types')
plt.ylabel('')
plt.show()

# Visualization 7: Relationship Between Mileage and Price
# Scatter plot of 'milage' against 'price'; alpha=0.6 makes overlapping
# points semi-transparent.
plt.figure(figsize=(10,6))
sns.scatterplot(x='milage', y='price', data=train_data, color='purple', alpha=0.6)
plt.title('Relationship Between Mileage and Price')
plt.xlabel('Mileage')
plt.ylabel('Price')
plt.show()

# Visualization 8: Distribution of Car Fuel Types (Pie Chart)
# value_counts() drops missing values by default, so NaN is not a slice.
plt.figure(figsize=(8,8))
fuel_counts = train_data['fuel_type'].value_counts()
fuel_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=sns.color_palette("Pastel1", len(fuel_counts)))
plt.title('Distribution of Car Fuel Types')
plt.ylabel('')
plt.show()

# Visualization 9: Distribution of Accident Status (Pie Chart)
# Missing accident values are excluded (value_counts default).
plt.figure(figsize=(8,8))
accident_counts = train_data['accident'].value_counts()
accident_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=sns.color_palette("Set2", len(accident_counts)))
plt.title('Distribution of Accident Status')
plt.ylabel('')
plt.show()

# Visualization 10: Distribution of Clean Title Status (Pie Chart)
# Missing clean_title values are excluded (value_counts default).
plt.figure(figsize=(8,8))
clean_title_counts = train_data['clean_title'].value_counts()
clean_title_counts.plot.pie(autopct='%1.1f%%', startangle=90, colors=sns.color_palette("Pastel2", len(clean_title_counts)))
plt.title('Distribution of Clean Title Status')
plt.ylabel('')
plt.show()

# Visualization 11: Missing Values Distribution
# Percentage of missing values per column; columns without missing values
# are filtered out and the rest is sorted from most to least missing.
missing_values = train_data.isnull().mean() * 100
missing_values = missing_values[missing_values > 0].sort_values(ascending=False)

# Horizontal bars: percentage on the x axis, feature name on the y axis.
plt.figure(figsize=(10, 6))
sns.barplot(x=missing_values.values, y=missing_values.index, palette='viridis')
plt.xlabel('Percentage of Missing Values')
plt.ylabel('Features')
plt.title('Missing Values Distribution in train_data')
plt.tight_layout()
plt.show()

In [None]:
# Pairwise association matrix with every column treated as nominal;
# dython's own plot is disabled and the matrix is drawn manually below.
associations_df = associations(train_data, nominal_columns='all', plot=False)
corr_matrix = associations_df['corr']

# Wide figure on a cream background, one annotated cell per column pair
plt.figure(figsize=(20, 8), facecolor='#FFFDD0')
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
plt.title('Correlation Matrix including Categorical Features')
plt.show()

## Handling NaN values

In [None]:
# Filling missing values in accident column.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form is chained assignment, which pandas
# deprecates and which does not modify train_data under Copy-on-Write.
train_data['accident'] = train_data['accident'].fillna('None reported')
# After the fill only the two report strings are expected; any other value
# maps to NaN.
train_data['accident_mapped'] = train_data['accident'].map({
    'None reported': 1,
    'At least 1 accident or damage reported': 2,
})

# Filling missing values in clean_title: 'Yes' -> 1, missing -> 0
train_data['clean_title'] = train_data['clean_title'].fillna(0).map({'Yes': 1, 0: 0})

# Filling missing values in int_col: the '–' placeholder is treated as
# missing, then every missing value becomes 'Black'
train_data['int_col'] = train_data['int_col'].replace('–', np.nan).fillna('Black')

# Checking the results
print(train_data.isnull().sum())

## Statistical Analysis in Numeric Features

In [None]:
# Descriptive statistics and shape measures for mileage and price
milage_stats = train_data['milage'].describe()
milage_skewness, milage_kurtosis = (
    train_data['milage'].skew(),
    train_data['milage'].kurtosis(),
)

# Mileage report
print("\n--- Mileage Statistics ---\n")
print(
    milage_stats,
    f"Milage Skewness: {milage_skewness:.2f}",
    f"Milage Kurtosis: {milage_kurtosis:.2f}",
    sep="\n",
)
print(f"\n{'-' * 50}\n")

# Box plot of mileage, outliers shown as individual points
plt.figure(figsize=(12, 6))
sns.boxplot(x=train_data['milage'], color='skyblue')
plt.title('Mileage Distribution with Outliers')
plt.xlabel('Mileage')
plt.show()

# Price report
print("\n--- Price Statistics ---\n")
price_stats = train_data['price'].describe()
print(price_stats)

price_skewness, price_kurtosis = (
    train_data['price'].skew(),
    train_data['price'].kurtosis(),
)
print(
    f"Price Skewness: {price_skewness:.2f}",
    f"Price Kurtosis: {price_kurtosis:.2f}",
    sep="\n",
)
print(f"\n{'-' * 50}\n")

# Box plot of price, outliers shown as individual points
plt.figure(figsize=(12, 6))
sns.boxplot(x=train_data['price'], color='orange')
plt.title('Price Distribution with Outliers')
plt.xlabel('Price')
plt.show()

In [None]:
# Mileage transformation experiments: standard scaling, log1p with IQR
# clipping, and Box-Cox, each stored as a new column of train_data.

# Scale Mileage
# The same StandardScaler instance is refit by every fit_transform call
# below, so afterwards it only holds the parameters of the last column.
scaler = StandardScaler()
train_data['scaled_milage'] = scaler.fit_transform(train_data[['milage']])

# Log Transformation of Mileage
train_data['log_milage'] = np.log1p(train_data['milage'])

# Log Milage Skewness and Kurtosis
log_skewness = train_data['log_milage'].skew()
log_kurtosis = train_data['log_milage'].kurtosis()

print("\n--- Log Mileage Skewness and Kurtosis ---")
print(f"Log Milage Skewness (Çarpıklık): {log_skewness}")
print(f"Log Milage Kurtosis (Basıklık): {log_kurtosis}")
print("\n" + "-" * 50 + "\n")

# IQR Method for Outlier Detection (Log Milage)
Q1 = train_data['log_milage'].quantile(0.25)
Q3 = train_data['log_milage'].quantile(0.75)
IQR = Q3 - Q1

# Outlier Thresholds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Apply Winsorizing for Outliers
# Values outside [lower_bound, upper_bound] are clipped to the bounds; the
# skewness/kurtosis printed above were computed before this clipping.
train_data['log_milage'] = np.clip(train_data['log_milage'], lower_bound, upper_bound)

# Box-Cox Transformation for Mileage (positive values only)
# NOTE(review): scipy's boxcox requires strictly positive input - confirm
# that 'milage' contains no zeros or missing values at this point.
train_data['boxcox_milage'], fitted_lambda = stats.boxcox(train_data['milage'])

# Box-Cox Milage Skewness and Kurtosis
boxcox_skewness = train_data['boxcox_milage'].skew()
boxcox_kurtosis = train_data['boxcox_milage'].kurtosis()

print("\n--- Box-Cox Mileage Skewness and Kurtosis ---")
print(f"Box-Cox Milage Skewness (Çarpıklık): {boxcox_skewness}")
print(f"Box-Cox Milage Kurtosis (Basıklık): {boxcox_kurtosis}")
print("\n" + "-" * 50 + "\n")

# StandardScaler for Log and Box-Cox Mileage
train_data['scaled_log_milage'] = scaler.fit_transform(train_data[['log_milage']])
train_data['scaled_boxcox_milage'] = scaler.fit_transform(train_data[['boxcox_milage']])

# Visualization of Mileage Distribution with Outliers
plt.figure(figsize=(12, 6))
sns.boxplot(x=train_data['scaled_boxcox_milage'], color='skyblue')
plt.title('scaled_boxcox_milage Distribution with Outliers')
plt.xlabel('Mileage')
plt.show()

# Only 'scaled_log_milage' is removed here; 'scaled_milage', 'log_milage',
# 'boxcox_milage' and 'scaled_boxcox_milage' stay in train_data.
train_data.drop(columns='scaled_log_milage', inplace=True)

#### Despite this analysis, I will not use the new scaled features created here, because the raw 'milage' feature performed better in the model!

## Feature Engineering

In [None]:
# Integer label encoding of the high-cardinality categorical columns.
# For each column: collect its distinct values in order of first appearance,
# number them starting from 1, and store the codes in a '<name>_mapped' column.

# Brand
unique_brands = train_data['brand'].unique()
brand_mapping = dict(zip(unique_brands, range(1, len(unique_brands) + 1)))
train_data['brand_mapped'] = train_data['brand'].map(brand_mapping)

# Model
unique_models = train_data['model'].unique()
model_mapping = dict(zip(unique_models, range(1, len(unique_models) + 1)))
train_data['model_mapped'] = train_data['model'].map(model_mapping)

# Interior colour
unique_int_cols = train_data['int_col'].unique()
int_col_mapping = dict(zip(unique_int_cols, range(1, len(unique_int_cols) + 1)))
train_data['int_col_mapped'] = train_data['int_col'].map(int_col_mapping)

# Exterior colour
unique_ext_cols = train_data['ext_col'].unique()
ext_col_mapping = dict(zip(unique_ext_cols, range(1, len(unique_ext_cols) + 1)))
train_data['ext_col_mapped'] = train_data['ext_col'].map(ext_col_mapping)

# Engine description
unique_engines = train_data['engine'].unique()
engine_mapping = dict(zip(unique_engines, range(1, len(unique_engines) + 1)))
train_data['engine_mapped'] = train_data['engine'].map(engine_mapping)

In [None]:
# Fuel type encoding: each listed fuel type gets its position in the list as
# its code (0-4); values not listed (including missing ones) map to NaN
fuel_type_mapping = {
    fuel: code
    for code, fuel in enumerate(
        ['Gasoline', 'Hybrid', 'E85 Flex Fuel', 'Diesel', 'Plug-In Hybrid']
    )
}
train_data['fuel_type_mapped'] = train_data['fuel_type'].map(fuel_type_mapping)

In [None]:
train_data.info()

In [None]:
def extract_engine_details(df, engine_column):
    """Parse horsepower, displacement and cylinder count out of engine text.

    Adds three numeric columns to ``df`` (modified in place and returned):

    * ``hp``: number directly before ``HP`` (e.g. ``172.0HP``); rows without
      a match are filled with the mean of the parsed values.
    * ``engine_size``: decimal number directly before ``L`` (e.g. ``1.6L``);
      rows without a match stay NaN.
    * ``cylinders``: the number in ``"<n> Cylinder"`` or, failing that, in a
      layout code such as ``V6``, ``V8``, ``I4``, ``H4`` or ``W12``; rows
      without a match stay NaN.

    Args:
        df: DataFrame holding the engine descriptions.
        engine_column: Name of the column with the engine description strings.

    Returns:
        The same DataFrame with the three columns added.
    """
    engine = df[engine_column]

    # Horsepower: parse, then fill the gaps with the mean of what was parsed
    hp = pd.to_numeric(engine.str.extract(r'(\d+\.\d+)HP')[0], errors='coerce')
    df['hp'] = hp.fillna(hp.mean())

    # Engine size in litres
    df['engine_size'] = pd.to_numeric(
        engine.str.extract(r'(\d+\.\d+)L')[0], errors='coerce'
    )

    # Cylinder count. Group 0 captures "<n> Cylinder"; group 1 captures the
    # digits of a layout code (V6, I4, ...). The word boundaries keep valve
    # counts such as "24V" from matching.
    cylinder_parts = engine.str.extract(r'(\d+)\s?Cylinder|\b[VIHW](\d{1,2})\b')
    df['cylinders'] = pd.to_numeric(
        cylinder_parts[0].fillna(cylinder_parts[1]), errors='coerce'
    )

    return df

def extract_transmission_features(df):
    """Derive numeric features from the free-text 'transmission' column.

    Columns added to ``df`` (modified in place and returned):

    * ``is_automatic``, ``is_manual``, ``has_dual_shift``, ``has_auto_shift``:
      0/1 flags from case-insensitive substring matches; missing text -> 0.
    * ``speed``: the number in front of ``-Speed``; NaN when absent.
    * ``transmission_type``: code looked up by exact match of the whole
      string, -1 when the string is not one of the known labels.
    """
    transmission = df['transmission']

    def _flag(pattern):
        # 1 where the description matches the pattern, 0 otherwise
        return transmission.str.contains(pattern, case=False, na=False).astype(int)

    # Automatic / manual indicators
    df['is_automatic'] = _flag('A/T|Automatic|CVT')
    df['is_manual'] = _flag('M/T|Manual')

    # Number of gears
    gears = transmission.str.extract(r'(\d+)-Speed', expand=False)
    df['speed'] = pd.to_numeric(gears, errors='coerce')

    # Dual-shift / auto-shift indicators
    df['has_dual_shift'] = _flag('Dual Shift')
    df['has_auto_shift'] = _flag('Auto-Shift')

    # Exact-label lookup; anything else (including missing) becomes -1
    type_codes = {
        'Automatic': 1,
        'Manual': 2,
        'CVT': 3,
        'Dual Shift': 4,
        'Auto-Shift': 5,
    }
    df['transmission_type'] = transmission.apply(
        lambda label: type_codes.get(label, -1)
    )

    return df

def create_other_features(df):
    """Add fuel-rarity, automatic-transmission and accident-history flags.

    Columns written to ``df`` (modified in place and returned):

    * ``rare_fuel_type``: 0 for gasoline/petrol and diesel, 1 for anything
      else (including missing values).
    * ``is_automatic``: 1 when the transmission text mentions ``A/T``,
      ``Automatic`` or ``CVT`` (case-insensitive), else 0.
    * ``has_accident_history``: 1 unless the accident value is
      ``'None reported'`` or missing.

    Args:
        df: DataFrame with 'fuel_type', 'transmission' and 'accident' columns.

    Returns:
        The same DataFrame with the three columns added or overwritten.
    """
    # The data labels petrol cars 'Gasoline' (see fuel_type_mapping); 'Petrol'
    # is kept so inputs using that label behave as before.
    common_fuels = ['Gasoline', 'Petrol', 'Diesel']
    df['rare_fuel_type'] = (~df['fuel_type'].isin(common_fuels)).astype(int)

    # Same substring rule as extract_transmission_features, so running this
    # function afterwards does not replace that flag with a stricter
    # exact-match version that misses values such as 'A/T'.
    df['is_automatic'] = (
        df['transmission']
        .str.contains('A/T|Automatic|CVT', case=False, na=False)
        .astype(int)
    )

    # A missing accident value is read as "None reported", not as an accident
    df['has_accident_history'] = (
        df['accident'].fillna('None reported').ne('None reported').astype(int)
    )

    return df

# Reference year used to turn a model year into a vehicle age
current_year = 2024

# Brand tiers used for the luxury classification
luxury_brands = {
    "Ultra Luxury": [
        "Rolls-Royce", "Lamborghini", "Ferrari", "Bentley", "Aston Martin",
        "McLaren",
    ],
    "Upper Luxury": [
        "Porsche", "Maserati", "Land Rover", "Mercedes-Benz", "BMW", "Audi",
    ],
    "Entry-Level Luxury": [
        "Jaguar", "Lexus", "Genesis", "Cadillac", "Volvo", "Alfa Romeo",
        "INFINITI", "Acura", "Lincoln", "Rivian", "Hummer", "Lucid", "Karma",
        "Lotus",
    ],
}

# Vehicle age in years relative to current_year
train_data["age"] = current_year - train_data["model_year"]

# Run the three feature builders in sequence: engine details, transmission
# features, then the remaining flags (fuel rarity, accident history)
train_data = (
    train_data
    .pipe(extract_engine_details, 'engine')
    .pipe(extract_transmission_features)
    .pipe(create_other_features)
)

# Classify brands into luxury categories
def classify_brand(brand):
    """Return the luxury tier of ``brand``, or 'Non-Luxury' if it has none.

    Tiers are checked in the order they appear in ``luxury_brands``; the
    first tier whose brand list contains ``brand`` wins.
    """
    matching_tiers = (
        tier for tier, members in luxury_brands.items() if brand in members
    )
    return next(matching_tiers, 'Non-Luxury')

# Numeric code for each luxury tier
# NOTE(review): 'Upper Luxury' (1) is coded below 'Entry-Level Luxury' (2) -
# confirm whether the codes are meant to be ordinal.
luxury_codes = {
    'Non-Luxury': 0,
    'Upper Luxury': 1,
    'Entry-Level Luxury': 2,
    'Ultra Luxury': 3,
}

# Classify every brand into its tier, then replace the tier by its code
train_data['luxury_category'] = (
    train_data['brand'].apply(classify_brand).map(luxury_codes)
)

In [None]:
# Column overview after feature engineering. info() prints its report itself
# and returns None, so it is called directly rather than wrapped in print(),
# which would add a stray "None" line.
train_data.info()
print('\n')
# Remaining missing values per column
print(train_data.isnull().sum())

#### Quick filling of missing values

In [None]:
def fill_missing_values(df):
    """Fill missing values: column mean for numbers, column mode for text.

    Only ``float64``/``int64`` columns are treated as numerical and only
    ``object`` columns as categorical; other dtypes are left untouched.
    An ``object`` column with no non-missing value has no mode and is left
    as it is instead of raising.

    Args:
        df: DataFrame to fill; it is modified in place.

    Returns:
        The same DataFrame with the missing values filled.
    """
    # Numerical columns: mean imputation
    numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
    for col in numerical_columns:
        df[col] = df[col].fillna(df[col].mean())

    # Categorical columns: most frequent value
    categorical_columns = df.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        modes = df[col].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to fill with
            continue
        df[col] = df[col].fillna(modes.iloc[0])

    return df

# Apply the missing value filling function to the data
train_data = fill_missing_values(train_data)

In [None]:
# Column overview after filling. info() prints its report itself and returns
# None, so it is called directly rather than wrapped in print(), which would
# add a stray "None" line.
train_data.info()
print('\n')
# Remaining missing values per column
print(train_data.isnull().sum())

In [None]:
# Remove the raw columns that have been replaced by encoded/derived features
raw_columns = [
    'brand', 'model', 'model_year', 'int_col', 'ext_col',
    'engine', 'transmission', 'fuel_type', 'accident',
]
train_data = train_data.drop(columns=raw_columns)

#### Critical analysis for choosing the most important features for the model to be created.

In [None]:
# Correlation of every remaining column with the target 'price'
train_data.corr().loc[:, 'price']

In [None]:
# Persist the processed training data. index=False keeps the positional row
# index out of the file; otherwise it is written as an extra unnamed column
# that shows up as a feature when the CSV is read back.
train_data.to_csv('../data/processed/train_data_processed.csv', index=False)