# Car prices dataset assignment

*TODO: state the Research Question and the Sub-Questions.*

## 1. Exploratory Data Analysis

### 1.1 Import packages

In [None]:
import numpy as np
import pandas as pd
from plotnine import * 
import os
import scipy
from scipy import stats
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 
from IPython.core.display import HTML 
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from tabulate import tabulate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

### 1.2 Read data from path

An extra 17th column is loaded because some rows contain a value in a position that has no column name.
This is caused by an error in the dataset and is corrected in the next section.

In [None]:
# Working directory of the notebook and the contents of its 'dataset' folder
cwd = os.getcwd()
dir_list = os.listdir(os.path.join(cwd, 'dataset'))

car_prices_data_path = os.path.join("dataset", "car_prices.csv")

# Read the file without a header, supplying 17 placeholder names so that the
# unnamed 17th field is kept
placeholder_names = ['col%d' % i for i in range(1, 18)]
df = pd.read_csv(car_prices_data_path, header=None, names=placeholder_names)

# The first row contains the real header; give its 17th cell a name
df.iat[0, 16] = 'extra'

# Promote the first row to column labels ...
df.columns = df.iloc[0]

# ... and remove it from the data
df = df.iloc[1:]

### 1.3 Examine and Clean the data

The misaligned data is aligned into the correct columns.

In [None]:
# Rows with a value in 'extra', restricted to the columns from position 5 on
df_extra = df[df['extra'].notnull()].iloc[:, 5:]

# Relabel: every column takes the name of its left neighbour, and the first
# column (which has no left neighbour) is called 'body'
df_extra.columns = df_extra.columns.to_series().shift(1).fillna('body')

# Copy the relabelled values back into df, cell by cell
for index, row in df_extra.iterrows():
    # Only rows that exist in df are written
    if index not in df.index:
        continue
    for column_name in df.columns:
        # Only columns present in both frames are written
        if column_name in df_extra.columns:
            df.at[index, column_name] = row[column_name]

# The helper column is no longer needed
df = df.drop(columns='extra')

Change data types

In [None]:
# Target dtype for each numeric column, converted in this order
numeric_dtypes = {
    'year': 'int',
    'odometer': 'float',
    'mmr': 'float',
    'sellingprice': 'float',
    'condition': 'float',
}
for column_name, target_dtype in numeric_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

# Parse the sale date strings
df['saledate'] = pd.to_datetime(df['saledate'])

Numeric statistics

In [None]:
# Report the size of the dataset
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns in the dataset")

# Select only numeric columns
numeric_columns = df.select_dtypes(include=['number'])

# Per-column statistics for the numeric columns
nan_counts_numeric = numeric_columns.isnull().sum()
unique_counts_numeric = numeric_columns.nunique()
# First modal value of every column (this is a value, not a frequency)
mode_counts_numeric = numeric_columns.mode().iloc[0]
max_values_numeric = numeric_columns.max()
min_values_numeric = numeric_columns.min()
percentage_nan_numeric = numeric_columns.isnull().mean() * 100
mean_values_numeric = numeric_columns.mean()
median_values_numeric = numeric_columns.median()
variance_values_numeric = numeric_columns.var()
std_dev_values_numeric = numeric_columns.std()
skewness_values_numeric = numeric_columns.skew()
kurtosis_values_numeric = numeric_columns.kurt()

# One row per numeric column, one column per statistic.
# The mode column is labelled 'Mode Value' because it holds the most frequent
# value itself rather than how often it occurs.
numeric_statistics = pd.DataFrame({'Nan Count': nan_counts_numeric,
                                    'Unique Count': unique_counts_numeric,
                                    'Mode Value': mode_counts_numeric,
                                    'Max Value': max_values_numeric,
                                    'Min Value': min_values_numeric,
                                    'Percentage Nan': percentage_nan_numeric,
                                    'Mean Value': mean_values_numeric,
                                    'Median Value': median_values_numeric,
                                    'Variance Value': variance_values_numeric,
                                    'Standard Deviation Value': std_dev_values_numeric,
                                    'Skewness Value': skewness_values_numeric,
                                    'Kurtosis Value': kurtosis_values_numeric})

# Print the numeric statistics dataframe
print(tabulate(numeric_statistics, headers='keys', tablefmt='fancy_grid'))


Non numeric statistics

In [None]:
# Select only non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=['number'])

# Per-column statistics for the non-numeric columns
unique_counts_non_numeric = non_numeric_columns.nunique()
# First modal value of every column (this is a value, not a frequency)
mode_counts_non_numeric = non_numeric_columns.mode().iloc[0]
nan_counts_non_numeric = non_numeric_columns.isnull().sum()
percentage_nan_non_numeric = non_numeric_columns.isnull().mean() * 100

# One row per non-numeric column, one column per statistic.
# The mode column is labelled 'Mode Value' because it holds the most frequent
# value itself rather than how often it occurs.
non_numeric_statistics = pd.DataFrame({'Nan Count': nan_counts_non_numeric,
                                       'Unique Count': unique_counts_non_numeric,
                                       'Mode Value': mode_counts_non_numeric,
                                       'Percentage Nan': percentage_nan_non_numeric})

# print the non-numeric statistics dataframe
print(tabulate(non_numeric_statistics, headers='keys', tablefmt='fancy_grid'))


Filling missing values

In [None]:
# Most frequent transmission per model. pandas' own mode is used because
# scipy.stats.mode no longer accepts string data and no longer returns an
# array that can be indexed with [0][0]. Models without any known
# transmission yield NaN here and are handled by the global fallback below.
grouped = df.groupby('model')
modes = grouped['transmission'].agg(
    lambda x: x.mode().iloc[0] if x.notna().any() else np.nan
)

# Fill each missing transmission with the mode of its model. The result is
# assigned back instead of using inplace=True on a column selection, which
# is not guaranteed to modify df.
df['transmission'] = df['transmission'].fillna(df['model'].map(modes))

# Calculate the mode of the whole 'transmission' column
mode = df['transmission'].mode()[0]

# Fill the remaining NaN values in the 'transmission' column with that mode
df['transmission'] = df['transmission'].fillna(mode)

# Drop every row that still has a NaN in any other column
df = df.dropna()

Check if NaN values are filtered out and shape of remaining dataframe

In [None]:
# Split the cleaned frame by dtype again
numeric_columns = df.select_dtypes(include=['number'])
non_numeric_columns = df.select_dtypes(exclude=['number'])

# Missing-value summary of the numeric columns
nan_counts_numeric = numeric_columns.isnull().sum()
unique_counts_numeric = numeric_columns.nunique()
percentage_nan_numeric = numeric_columns.isnull().mean() * 100

# Place the three series next to each other, one row per column of df
numeric_statistics = pd.concat(
    [nan_counts_numeric, unique_counts_numeric, percentage_nan_numeric],
    axis=1,
    keys=['Nan Count', 'Unique Count', 'Percentage Nan'],
)
print(tabulate(numeric_statistics, headers='keys', tablefmt='fancy_grid'))

# Missing-value summary of the non-numeric columns
unique_counts_non_numeric = non_numeric_columns.nunique()
nan_counts_non_numeric = non_numeric_columns.isnull().sum()
percentage_nan_non_numeric = non_numeric_columns.isnull().mean() * 100

non_numeric_statistics = pd.concat(
    [nan_counts_non_numeric, unique_counts_non_numeric, percentage_nan_non_numeric],
    axis=1,
    keys=['Nan Count', 'Unique Count', 'Percentage Nan'],
)
print(tabulate(non_numeric_statistics, headers='keys', tablefmt='fancy_grid'))

# Size of the frame that is left after cleaning
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns in the dataset")

Looking into the predicted variable

In [None]:
# describe() output for the target column
sellingprice_stats = df['sellingprice'].describe()

# Turn the series into a one-column frame so tabulate prints a header
sellingprice_stats_df = sellingprice_stats.to_frame(name='Selling Price Stats')

# Render the summary as a grid
print(tabulate(sellingprice_stats_df, headers='keys', tablefmt='fancy_grid'))

### 1.4 Correlation and descriptive analysis

Create a correlation heatmap

In [None]:
# Compute the correlation matrix on the numeric columns only.
# df still contains text columns at this point, and DataFrame.corr() raises
# on non-numeric data in pandas >= 2.0 instead of silently skipping it.
corr_matrix = df.select_dtypes(include=['number']).corr()

# Create a heatmap of the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap='Spectral')

# Show the plot
plt.show()

Including the categorical features to check the correlation between all features

In [None]:
# Work on a copy so the integer codes do not overwrite the original values
df2 = df.copy()

# Replace every non-numeric column by integer codes (one code per distinct
# value, in order of first appearance) so it can enter the correlation matrix
for column in df2.select_dtypes(exclude=[np.number]).columns:
    df2[column], labels = pd.factorize(df2[column])

# Compute the correlation matrix
corr_matrix = df2.corr()

# Draw the clustered heatmap. clustermap creates its own figure, so the size
# has to be passed to it directly; a preceding plt.figure(figsize=...) would
# only leave an empty figure behind.
sns.clustermap(corr_matrix, annot=True, cmap='Spectral', fmt=".2f",
               vmin=-0.5, vmax=0.5, figsize=(12, 10))

# Show the plot
plt.show()

Based on the correlation matrix above, the following explanatory features are selected:
year, condition, odometer, color and interior

Checking if there are outliers in the predicted column using IQR

In [None]:

# First and third quartile of the selling price
Q1, Q3 = (df['sellingprice'].quantile(q) for q in (0.25, 0.75))

# Interquartile range
IQR = Q3 - Q1

# Values more than 1.5 * IQR outside the quartiles count as outliers
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Show the intermediate values
print(f'- Q1: {Q1} | Q3: {Q3} | IQR: {IQR}')
print(f'- Lower Bound: {lower_bound} | Upper Bound: {upper_bound}')

# Rows whose selling price lies below the lower or above the upper bound
outliers = df[df['sellingprice'].lt(lower_bound) | df['sellingprice'].gt(upper_bound)]

# Report how many there are
num_outliers = outliers.shape[0]
print(f'- Number of outliers: {num_outliers}')

In [None]:
# Row count before filtering, used to report how many rows were dropped
num_rows_before = df.shape[0]

# Keep rows whose selling price is inside the bounds (inclusive) and is not 1
df = df[df['sellingprice'].between(lower_bound, upper_bound) & df['sellingprice'].ne(1)]

# Report the number of removed rows
num_outliers_removed = num_rows_before - df.shape[0]
print(f'Number of outliers removed: {num_outliers_removed}')

Checking if there are outliers in the odometer column using IQR

In [None]:
# First and third quartile of the odometer reading
Q1, Q3 = (df['odometer'].quantile(q) for q in (0.25, 0.75))

# Interquartile range
IQR = Q3 - Q1

# Values more than 1.5 * IQR outside the quartiles count as outliers
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Show the intermediate values
print(f'- Q1: {Q1} | Q3: {Q3} | IQR: {IQR}')
print(f'- Lower Bound: {lower_bound} | Upper Bound: {upper_bound}')

# Rows whose odometer reading lies below the lower or above the upper bound
outliers = df[df['odometer'].lt(lower_bound) | df['odometer'].gt(upper_bound)]

# Report how many there are
num_outliers = outliers.shape[0]
print(f'- Number of outliers: {num_outliers}')

In [None]:
# Row count before filtering, used to report how many rows were dropped
num_rows_before = df.shape[0]

# Keep rows whose odometer reading is inside the bounds (inclusive)
df = df[df['odometer'].between(lower_bound, upper_bound)]

# Report the number of removed rows
num_outliers_removed = num_rows_before - df.shape[0]
print(f'Number of outliers removed: {num_outliers_removed}')

Checking if there are outliers in the year column using IQR

In [None]:
# First and third quartile of the model year
Q1, Q3 = (df['year'].quantile(q) for q in (0.25, 0.75))

# Interquartile range
IQR = Q3 - Q1

# Values more than 1.5 * IQR outside the quartiles count as outliers
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Show the intermediate values
print(f'- Q1: {Q1} | Q3: {Q3} | IQR: {IQR}')
print(f'- Lower Bound: {lower_bound} | Upper Bound: {upper_bound}')

# Rows whose year lies below the lower or above the upper bound
outliers = df[df['year'].lt(lower_bound) | df['year'].gt(upper_bound)]

# Report how many there are
num_outliers = outliers.shape[0]
print(f'- Number of outliers: {num_outliers}')

In [None]:
# Row count before filtering, used to report how many rows were dropped
num_rows_before = df.shape[0]

# Keep rows whose year is inside the bounds (inclusive)
df = df[df['year'].between(lower_bound, upper_bound)]

# Report the number of removed rows
num_outliers_removed = num_rows_before - df.shape[0]
print(f'Number of outliers removed: {num_outliers_removed}')

Keeping only the selected columns and adding an 'ID' column

In [None]:
# Keep only the columns that are needed. The explicit copy makes df an
# independent frame, so adding a column below does not trigger pandas'
# SettingWithCopyWarning.
df = df[['year', 'condition', 'odometer', 'color', 'interior', 'sellingprice']].copy()

# Insert an 'ID' column with the values 1..len(df) directly at the front
df.insert(0, 'ID', np.arange(1, len(df) + 1))

# Display the first and last rows of the updated dataframe
print(tabulate(df.head(), headers='keys', tablefmt='fancy_grid'))
print(tabulate(df.tail(), headers='keys', tablefmt='fancy_grid'))

Analyzing the unique values of the explanatory features

In [None]:
# Distinct values of every explanatory feature, keyed by the capitalised
# feature name, printed side by side
unique_values = {
    feature.capitalize(): df[feature].unique()
    for feature in ('year', 'condition', 'odometer', 'color', 'interior')
}
print(tabulate(unique_values, headers='keys', tablefmt='fancy_grid'))

Checking the value counts to identify the categories for the column "color"

In [None]:
# Number of rows per exterior colour, most frequent first
color_counts = df['color'].value_counts().reset_index()
print(tabulate(color_counts, headers='keys', tablefmt='fancy_grid'))

Creating the categories for the column "color"

In [None]:
def categorize_color(color):
    """Reduce a colour name to 'black', 'white', 'silver', 'gray' or 'other'.

    The four named groups are tried in that order; the first one that occurs
    as a substring of ``color`` is returned. A name containing none of them
    is mapped to 'other'.
    """
    for group in ('black', 'white', 'silver', 'gray'):
        if group in color:
            return group
    return 'other'

# Derive the grouped colour for every row
df['color_cat'] = df['color'].map(categorize_color)

# Show how many rows fall into each colour group
color_cat_counts = df['color_cat'].value_counts().reset_index()
print(tabulate(color_cat_counts, headers='keys', tablefmt='fancy_grid'))

Checking the value counts to identify the categories for the column "interior"


In [None]:
# Number of rows per interior colour, most frequent first
interior_counts = df['interior'].value_counts().reset_index()
print(tabulate(interior_counts, headers='keys', tablefmt='fancy_grid'))

Creating categories for column 'interior'

In [None]:
def categorize_interior(interior):
    """Reduce an interior colour name to 'black', 'gray' or 'other'.

    'black' is tested before 'gray'; the first one that occurs as a substring
    of ``interior`` is returned. Every other name, including beige, is mapped
    to 'other'.
    """
    for group in ('black', 'gray'):
        if group in interior:
            return group
    return 'other'

# Derive the grouped interior colour for every row
df['interior_cat'] = df['interior'].map(categorize_interior)

# Show how many rows fall into each interior group
interior_cat_counts = df['interior_cat'].value_counts().reset_index()
print(tabulate(interior_cat_counts, headers='keys', tablefmt='fancy_grid'))

Dropping the color and interior column

In [None]:
# The grouped *_cat columns replace the raw colour columns. The result is
# rebound to df instead of dropping in place, which avoids in-place mutation
# of a frame that was derived from a selection of another frame.
df = df.drop(columns=['interior', 'color'])

### 1.5 Distribution visualization

- Looking at the current distribution of sellingprice after cleaning the data
- And normalizing the data in this column with a square-root transformation

In [None]:
# Store the square root of 'sellingprice' in a new column
df['sqrt_sellingprice'] = np.sqrt(df['sellingprice'])

# Open one 20 x 10 inch figure that holds both panels
plt.figure(figsize=(20, 10))

# Set the font size; rcParams is global, so this also affects later plots
plt.rcParams.update({'font.size': 7})
# Left panel: histogram (20 bins) with KDE curve of the raw 'sellingprice'
plt.subplot(1, 2, 1)
histplot = sns.histplot(df['sellingprice'], bins=20, kde=True, color='slateblue')

# Write the frequency count inside each bar
for rect in histplot.patches:
    height = rect.get_height()
    width = rect.get_width()
    x = rect.get_x()
    y = rect.get_y()

    # Centre the label horizontally and place it at two thirds of the bar height
    histplot.text(x + width / 2, y + height / 1.5 , f'{int(height)}', ha='center', va='center_baseline')

# Set plot titles and labels
plt.title('Distribution of Selling Prices')
plt.xlabel('Selling Price')
plt.ylabel('Frequency')

# Right panel: the same histogram for the square-root transformed column
plt.subplot(1, 2, 2)
histplot = sns.histplot(df['sqrt_sellingprice'], bins=20, kde=True, color='slateblue')

# Write the frequency count inside each bar
for rect in histplot.patches:
    height = rect.get_height()
    width = rect.get_width()
    x = rect.get_x()
    y = rect.get_y()

    # Centre the label horizontally and place it slightly above half the bar height
    histplot.text(x + width / 2, y + height / 1.9 , f'{int(height)}', ha='center', va='center_baseline')

# Set plot titles and labels
plt.title('Distribution of Normalized Selling Prices')
plt.xlabel('Selling Price')
plt.ylabel('Frequency')

# Show the plot
plt.show()

Defining the function generate_freq_table, which builds the frequency tables used for the categorical plots

In [None]:
def generate_freq_table(df, variable = ['sellingprice']):
    """Count the rows of ``df`` per distinct value of each listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains every column named in ``variable``.
    variable : list of str
        Columns to tabulate. The default list is never modified.

    Returns
    -------
    pandas.DataFrame
        Long table with the columns 'category' (the value, as string), 'N'
        (number of rows with that value) and 'var' (the column the value came
        from). The per-column tables are stacked in the order of ``variable``
        and keep their own 0-based index. An empty ``variable`` gives an empty
        table with the same three columns.
    """
    dfs = [
        df.groupby(name, observed=False)
        .size()
        .reset_index(name='N')
        .assign(var=name)
        .rename(columns={name: 'category'})
        for name in variable
    ]
    # Nothing to tabulate: return an empty table instead of failing
    if not dfs:
        return pd.DataFrame(columns=['category', 'N', 'var'])
    # Stack once after the loop rather than re-concatenating on every pass
    res = pd.concat(dfs)
    # Values of different columns share one column, so store them as text
    res['category'] = res['category'].astype(str)
    return res

# Display the frequency table for both derived categorical columns
generate_freq_table(df, ['interior_cat', 'color_cat'])

Defining a function to create the plots

In [None]:
def generate_freq_plot(freq_table):
    """Build a dodged bar chart from a frequency table.

    ``freq_table`` must provide the columns 'var' (x axis), 'N' (bar height)
    and 'category' (fill colour). The plot object is returned, not drawn.
    """
    mapping = aes(x='var', y='N', fill='category')
    bars = geom_col(stat='identity', position='dodge')
    return ggplot(freq_table, mapping) + bars

Generating the interior categories plot

In [None]:
# Bar chart of the row counts per interior category
generate_freq_plot(generate_freq_table(df, ['interior_cat']))

Generating the color categories plot

In [None]:
# Bar chart of the row counts per color category
generate_freq_plot(generate_freq_table(df, ['color_cat']))

Creating the plot for year

In [None]:
# Histogram of 'year' in 15 bins; 'year' is also used as hue, so every year
# is drawn in its own colour from the viridis palette
plt.figure(figsize=(10, 6))
histplot = sns.histplot(data=df, x='year', hue='year', bins=15, palette='viridis', kde=False)
plt.title('Frequency Plot of Years')
plt.xlabel('Year')
plt.ylabel('Frequency')
# Display plot
plt.show()

Creating the plot for condition

In [None]:
# Histogram of 'condition' in 41 bins; 'condition' is also used as hue, so
# every condition value is drawn in its own colour from the magma palette
plt.figure(figsize=(10, 6))
histplot = sns.histplot(data=df, x='condition', hue='condition', bins=41, palette='magma')
plt.title('Frequency Plot of Condition')
# The x axis shows the condition score
plt.xlabel('Condition')
plt.ylabel('Frequency')
# Reposition the legend that seaborn already built for the hue levels.
# Rebuilding it with labels taken from df['condition'].unique() would pair
# the colours with labels in order of first appearance in the data, which
# need not be the order in which the hue levels were drawn.
sns.move_legend(histplot, 'upper left', ncol=2, title='Condition')

# Display plot
plt.show()

- Creating the plot for odometer
- And normalizing the data with a square-root transformation

In [None]:
# Store the square root of 'odometer' in a new column
df['sqrt_odometer'] = np.sqrt(df['odometer'])

# One wide figure that holds both panels side by side
plt.figure(figsize=(16, 6))

# Left panel: histogram of the raw 'odometer' column
plt.subplot(1, 2, 1)
sns.histplot(df['odometer'], bins=20, color='lightseagreen')
plt.title('Histogram of Odometer')
plt.xlabel('Odometer')
plt.ylabel('Frequency')

# Right panel: histogram of the square-root transformed column
plt.subplot(1, 2, 2)
sns.histplot(df['sqrt_odometer'], bins=20, color='lightseagreen')
plt.title('Histogram of Normalized Odometer')
# The values are square roots, not logarithms, so label the axis accordingly
plt.xlabel('Square root of Odometer')
plt.ylabel('Frequency')

# Adjust layout
plt.tight_layout()

# Show the plots
plt.show()

## 2. Feature Engineering

Create a TrainTestSplitter class with 3 methods:
- Initialize instances
- Statistics calculator
- Train, test and validation splitter

In [None]:
# Distinct values present in the derived 'color_cat' column
df['color_cat'].unique()

In [None]:
class TrainTestSplitter(object):
    '''Split a DataFrame into train, validation and test sets.

    The frame must contain an 'ID' column with unique values and a numeric
    'sellingprice' column. ``train_frac`` of the rows is sampled for training,
    the remaining rows form the test set, and ``validation_frac`` of the
    sampled training rows is moved into the validation set. Both samples use
    ``seed``, so the same input always yields the same three sets.

    After ``split_train_test`` the instance exposes ``train_set``,
    ``validation_set``, ``test_set``, ``bin_edges``,
    ``total_n_sellingprice_bin`` and ``split_statistics``.
    '''
    def __init__(self, train_frac=0.8, validation_frac=0.2, seed=1234, num_bins=5):
        self.train_frac = train_frac
        self.validation_frac = validation_frac
        self.seed = seed
        self.num_bins = num_bins
        # Row count of the full dataset per price bin, keyed by str(interval)
        self.total_n_sellingprice_bin = {}
        # Price bin edges of the full dataset; filled in by split_train_test
        self.bin_edges = None

    def _bin_sellingprice(self, prices):
        '''Assign every price to a bin, using the shared edges when available.'''
        edges = getattr(self, 'bin_edges', None)
        if edges is None:
            # No split has been made yet: fall back to edges of this series
            edges = np.linspace(prices.min(), prices.max(), self.num_bins + 1)
        # include_lowest keeps the minimum price inside the first bin
        return pd.cut(prices, bins=edges, include_lowest=True)

    def calculate_statistics(self):
        '''Count the rows per price bin in each split.

        Stores a dict in ``split_statistics`` that maps each of 'train_set',
        'test_set' and 'validation_set' to a dict with, for every bin, the
        number of rows (``N_sellingprice_bin_<bin>``) and that number as a
        percentage of the rows the full dataset has in the same bin
        (``percentage_total_sellingprice_bin_<bin>``). Bins that are empty in
        the full dataset get a percentage of 0.0.
        '''
        statistics = {}
        for split_name in ['train_set', 'test_set', 'validation_set']:
            split = getattr(self, split_name)
            # Same edges for every split, so the labels match the stored totals
            split['sellingprice_bin'] = self._bin_sellingprice(split['sellingprice'])
            bin_counts = split.groupby('sellingprice_bin', observed=False).size()

            split_stats = {}
            for interval, bin_count in bin_counts.items():
                bin_label = str(interval)
                total = self.total_n_sellingprice_bin.get(bin_label, 0)
                split_stats[f'N_sellingprice_bin_{bin_label}'] = int(bin_count)
                split_stats[f'percentage_total_sellingprice_bin_{bin_label}'] = (
                    bin_count / total * 100 if total else 0.0
                )

            statistics[split_name] = split_stats

        self.split_statistics = statistics

    def split_train_test(self, df):
        '''Create the three splits from ``df`` and compute their statistics.

        Note that a 'sellingprice_bin' column is added to ``df`` itself, so
        the caller's frame is modified and all three splits carry the column.
        '''
        print("Generating the train/validation/test splits...")

        # Bin edges over the price range of the entire dataset
        self.bin_edges = np.linspace(df['sellingprice'].min(), df['sellingprice'].max(), self.num_bins + 1)
        df['sellingprice_bin'] = self._bin_sellingprice(df['sellingprice'])

        # Reference counts per bin for the percentages in calculate_statistics
        totals = df.groupby('sellingprice_bin', observed=False).size()
        self.total_n_sellingprice_bin = {
            str(interval): int(count) for interval, count in totals.items()
        }

        self.train_set = df.sample(frac=self.train_frac, random_state=self.seed)
        self.test_set = df.loc[lambda x: ~x.ID.isin(self.train_set.ID)].reset_index(drop=True)
        # Seeded as well, otherwise the validation rows differ on every run
        self.validation_set = (
            self.train_set
            .sample(frac=self.validation_frac, random_state=self.seed)
            .reset_index(drop=True)
        )
        self.train_set = self.train_set.loc[lambda x: ~x.ID.isin(self.validation_set.ID)].reset_index(drop=True)
        print("Calculating the statistics...")
        self.calculate_statistics()
        print("Split completed")

In [None]:
# create a fitting_splits object that will hold the train, validation, and test data
fitting_splits = TrainTestSplitter()
fitting_splits.split_train_test(df)

In [None]:
# Show the size of the test set and the per-bin statistics of all splits
fitting_splits.test_set.shape
fitting_splits.split_statistics

Create 3 def functions:
- Dummificator
- Scaler
- A function that uses the other 2 def functions

In [None]:
def dummify(df, one_hot_encoder):
    """One-hot encode the 'interior_cat' and 'color_cat' columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains both categorical columns. It is not modified.
    one_hot_encoder : fitted encoder or None
        When falsy, a new ``OneHotEncoder`` is created and fitted on ``df``;
        otherwise the given encoder is only used to transform.

    Returns
    -------
    tuple
        ``(frame, encoder)`` where ``frame`` has the two categorical columns
        replaced by the encoder's output columns (appended on the right) and
        ``encoder`` is the encoder that was used.
    """
    vars_to_encode = ['interior_cat', 'color_cat']
    df_to_encode = df[vars_to_encode]
    if not one_hot_encoder:
        one_hot_encoder = OneHotEncoder()
        encoded_values = one_hot_encoder.fit_transform(df_to_encode).toarray()
    else:
        encoded_values = one_hot_encoder.transform(df_to_encode).toarray()
    # Reuse df's index: with a default RangeIndex the concat below would align
    # on labels and produce NaN rows whenever df is not indexed 0..n-1
    df_encoded = pd.DataFrame(
        encoded_values,
        columns=one_hot_encoder.get_feature_names_out(),
        index=df.index,
    )
    # Drop the original columns and append the encoded ones
    df = pd.concat([df.drop(vars_to_encode, axis=1), df_encoded], axis=1)
    return df, one_hot_encoder

def scale(df, standard_scaler, cols_to_scale):
    """Scale ``cols_to_scale`` of ``df`` in place and return the frame.

    A falsy ``standard_scaler`` means a new ``StandardScaler`` is created and
    fitted on the columns; otherwise the given scaler is only applied.
    Returns ``(df, scaler)`` with the scaler that was used.
    """
    if standard_scaler:
        scaled_values = standard_scaler.transform(df[cols_to_scale])
    else:
        standard_scaler = StandardScaler()
        scaled_values = standard_scaler.fit_transform(df[cols_to_scale])
    df[cols_to_scale] = scaled_values
    return df, standard_scaler

def prepare_data(df, one_hot_encoder=None, standard_scaler=None, cols_to_scale=None):
    """One-hot encode the categorical columns and scale the numeric ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; a copy with a fresh 0-based index is processed.
    one_hot_encoder, standard_scaler : fitted transformer or None
        Pass None to fit new transformers on ``df``; pass the objects returned
        by an earlier call to apply the same transformation to another frame.
    cols_to_scale : list of str or None
        Columns to scale. None selects every numeric column. One-hot encoded
        columns are always excluded.

    Returns
    -------
    tuple
        ``(frame, one_hot_encoder, standard_scaler)``.
    """
    df = df.reset_index(drop=True)
    df, one_hot_encoder = dummify(df, one_hot_encoder)

    # Default: scale every numeric column
    if cols_to_scale is None:
        cols_to_scale = df.select_dtypes(include=[np.number]).columns.tolist()

    # Remove the one-hot columns and duplicates while keeping the column
    # order. Set arithmetic would return the names in arbitrary order, so the
    # scaler could be fitted and applied with differently ordered columns.
    one_hot_columns = set(one_hot_encoder.get_feature_names_out())
    cols_to_scale = [
        name for name in dict.fromkeys(cols_to_scale) if name not in one_hot_columns
    ]

    df, standard_scaler = scale(df, standard_scaler, cols_to_scale)
    return df, one_hot_encoder, standard_scaler

In [None]:
# Preview the first rows once; the cell previously displayed them twice
df.head()

In [None]:
# now we prepare all the data we use below
X_train, one_hot_encoder, standard_scaler = prepare_data(fitting_splits.train_set)
X_train = X_train.drop(["ID", 'odometer', 'sellingprice', 'sqrt_sellingprice', 'sellingprice_bin'],axis=1)  
y_train = fitting_splits.train_set["sqrt_sellingprice"]

X_validation = prepare_data(fitting_splits.validation_set, one_hot_encoder, standard_scaler)[0]
X_validation = X_validation.drop(["ID", 'odometer', 'sellingprice', 'sqrt_sellingprice', 'sellingprice_bin'],axis=1)
y_validation = fitting_splits.validation_set["sqrt_sellingprice"]

X_test = prepare_data(fitting_splits.test_set, one_hot_encoder, standard_scaler)[0].drop(['sellingprice_bin',"ID", 'odometer', 'sellingprice', 'sqrt_sellingprice'],axis=1)
y_test = fitting_splits.test_set["sqrt_sellingprice"]

X_train_validation = pd.concat([X_train, X_validation])
y_train_validation = pd.concat([y_train, y_validation])

In [None]:
# List the dtype of every feature column that is passed to the model
print(X_train_validation.dtypes)



In [None]:
# Write the stacked train + validation target values to 'train.csv'
# (relative path, so the file lands in the current working directory)
y_train_validation.to_csv('train.csv')

In [None]:
# Hyper-parameter grid: 10, 20, ..., 190 trees and a maximum depth of
# 5, 10, 15 or 20
param_grid = {
    "n_estimators": list(range(10, 200, 10)),
    "max_depth": list(range(5, 21, 5))
}

# Grid search scored by negative mean squared error. The forest is seeded
# with the seed of the data split so that repeated runs select the same model;
# an unseeded forest gives different scores on every run.
sklearn_grid_search_rf = GridSearchCV(
    RandomForestRegressor(random_state=fitting_splits.seed),
    param_grid=param_grid,
    n_jobs=4,
    scoring='neg_mean_squared_error',
)
_ = sklearn_grid_search_rf.fit(X_train_validation, y_train_validation)


One-hot encoding to create binary columns for each category and indicates the presence of a category with a 1 or 0. 

In [None]:
# Perform one-hot encoding with a prefix for color_cat
# df = pd.get_dummies(df, columns=['color_cat'], prefix='color')

In [None]:
# Perform one-hot encoding with a prefix for interior_Cat
# df = pd.get_dummies(df, columns=['interior_cat'], prefix='interior')

In [None]:
# #Checking how the dataframe looks after encoding
# print(tabulate(df.head(), headers='keys', tablefmt='fancy_grid'))
# print(f'The DataFrame has {df.shape[0]} rows and {df.shape[1]} columns.')