In [1746]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [1747]:
# Read the file with the correct encoding
car = pd.read_csv("./car_data.csv", encoding='ISO-8859-1')



In [1748]:
# Create a DataFrame from the results
car.head()

Unnamed: 0,brand,model,year,kilometer,engine,transmission,fuel,nb_of_doors,price
0,FIAT,Turbo 2013,2013,48390,135 hp 1.4L I4,5-Speed Manual,Gasoline,2 doors,"$7,787"
1,FIAT,Trekking 2014,2014,98119,160 hp 1.4L I4,6-Speed Automatic,Gasoline,4 doors,"$5,800"
2,FIAT,Salon Cabriolet 2013,2013,73482,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2 doors,"$7,495"
3,FIAT,Sport 2012,2012,73873,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2 doors,"$4,999"
4,FIAT,Sport 2013,2013,90056,101 hp 1.4L I4,Automatic,Gasoline,2 doors,"$5,880"


In [1749]:
car=car.drop(columns=['model'])

In [1750]:
# get the unique value of the brand
car['brand'].unique()

array(['FIAT', 'Dodge', 'Mercedes-Benz', 'Mitsubishi', 'Tesla',
       'Cadillac', 'Nissan', 'Peugeot', 'Renault', 'Volkswagen', 'Kia',
       'Honda', 'Suzuki', 'Citroen', 'Jeep', 'Chevrolet', 'Subaru',
       'Audi', 'Hyundai', 'BMW', 'Toyota'], dtype=object)

In [1751]:
# info of the car
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25617 entries, 0 to 25616
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         25617 non-null  object
 1   year          25617 non-null  int64 
 2   kilometer     25617 non-null  object
 3   engine        25616 non-null  object
 4   transmission  25617 non-null  object
 5   fuel          25615 non-null  object
 6   nb_of_doors   25617 non-null  object
 7   price         25617 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.6+ MB


In [1752]:
# unique value of the year
car['year'].unique()

array([2013, 2014, 2012, 2016, 2015, 2018, 2017, 2019, 2020, 2021, 2022,
       1981, 2004, 2005, 2007, 2010, 2002, 1997, 2011, 2006, 2008, 2009,
       2023, 2003, 2001, 1995, 2024, 1987, 1998, 1990, 2000, 1999],
      dtype=int64)

In [1753]:
# remove the null values and make another variable car_cleaned
car_cleaned=car.dropna()

In [1754]:
# Create a DataFrame from the results
car_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25614 entries, 0 to 25616
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         25614 non-null  object
 1   year          25614 non-null  int64 
 2   kilometer     25614 non-null  object
 3   engine        25614 non-null  object
 4   transmission  25614 non-null  object
 5   fuel          25614 non-null  object
 6   nb_of_doors   25614 non-null  object
 7   price         25614 non-null  object
dtypes: int64(1), object(7)
memory usage: 1.8+ MB


In [1755]:
# removed (') and $ sign from the price column and convert into int
car_cleaned['price'] = car_cleaned['price'].replace('[\$,]', '', regex=True).astype(int)

  car_cleaned['price'] = car_cleaned['price'].replace('[\$,]', '', regex=True).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_cleaned['price'] = car_cleaned['price'].replace('[\$,]', '', regex=True).astype(int)


In [1756]:
# remove doors after the number of doors
car_cleaned['nb_of_doors'] = car_cleaned['nb_of_doors'].str.replace(r'\s*doors?', '', regex=True).str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_cleaned['nb_of_doors'] = car_cleaned['nb_of_doors'].str.replace(r'\s*doors?', '', regex=True).str.strip()


In [1757]:
# Rename the 'miles' column to 'kilometer'
car_cleaned = car_cleaned.rename(columns={'miles': 'kilometer'})

In [1758]:
car_cleaned.head()

Unnamed: 0,brand,year,kilometer,engine,transmission,fuel,nb_of_doors,price
0,FIAT,2013,48390,135 hp 1.4L I4,5-Speed Manual,Gasoline,2,7787
1,FIAT,2014,98119,160 hp 1.4L I4,6-Speed Automatic,Gasoline,4,5800
2,FIAT,2013,73482,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2,7495
3,FIAT,2012,73873,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2,4999
4,FIAT,2013,90056,101 hp 1.4L I4,Automatic,Gasoline,2,5880


In [1759]:
# Convert the 'kilometer' column to string type first, then remove commas and convert to int
car_cleaned['kilometer'] = car_cleaned['kilometer'].astype(str).str.replace(',', '').astype(int)

# Convert kilometers to miles
car_cleaned['miles'] = car_cleaned['kilometer'] * 0.621371

# Optionally round miles to the nearest integer
car_cleaned['miles'] = car_cleaned['miles'].round().astype(int)

car_cleaned.head()


Unnamed: 0,brand,year,kilometer,engine,transmission,fuel,nb_of_doors,price,miles
0,FIAT,2013,48390,135 hp 1.4L I4,5-Speed Manual,Gasoline,2,7787,30068
1,FIAT,2014,98119,160 hp 1.4L I4,6-Speed Automatic,Gasoline,4,5800,60968
2,FIAT,2013,73482,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2,7495,45660
3,FIAT,2012,73873,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2,4999,45903
4,FIAT,2013,90056,101 hp 1.4L I4,Automatic,Gasoline,2,5880,55958


In [1760]:
#  to get all the unique values of the columns
# Loop through each column and print unique values
for col in car_cleaned.columns:
    print('unique values of '+col)
    print(car_cleaned[col].unique())
    print('----------------------------------')

unique values of brand
['FIAT' 'Dodge' 'Mercedes-Benz' 'Mitsubishi' 'Tesla' 'Cadillac' 'Nissan'
 'Peugeot' 'Renault' 'Volkswagen' 'Kia' 'Honda' 'Suzuki' 'Citroen' 'Jeep'
 'Chevrolet' 'Subaru' 'Audi' 'Hyundai' 'BMW' 'Toyota']
----------------------------------
unique values of year
[2013 2014 2012 2016 2015 2018 2017 2019 2020 2021 2022 1981 2004 2005
 2007 2010 2002 1997 2011 2006 2008 2009 2023 2003 2001 1995 2024 1987
 1998 1990 2000 1999]
----------------------------------
unique values of kilometer
[ 48390  98119  73482 ... 136528  43098 133495]
----------------------------------
unique values of engine
['135 hp 1.4L I4' '160 hp 1.4L I4' '101 hp 1.4L I4' '180 hp 2.4L I4'
 '164 hp 1.4L I4' '177 hp 1.3L I4' '102 hp 2L I4' '173 hp 2.4L I4'
 '172 hp 2.4L I4' '305 hp 5.9L I6 Diesel' '500 hp 8.3L V10'
 '325 hp 5.9L I6 Diesel' '250 hp 3.5L V6' '245 hp 5.9L V8'
 '210 hp 3.7L V6' '175 hp 3.9L V6' '345 hp 5.7L V8' '370 hp 5.7L V8'
 '292 hp 3.6L V6' '390 hp 5.7L V8' '305 hp 3.6L V6' '360 hp 5

In [1761]:
# Remove rows where 'engine' column has NaN values
car_cleaned = car_cleaned.dropna(subset=['engine'])


In [1762]:
# getting the columns name
car_cleaned.columns


Index(['brand', 'year', 'kilometer', 'engine', 'transmission', 'fuel',
       'nb_of_doors', 'price', 'miles'],
      dtype='object')

In [1763]:
# Extract horsepower number before 'hp'
car_cleaned['engine(hp)'] = car_cleaned['engine'].str.extract(r'(\d+)\s*hp')[0]

# Drop rows where horsepower couldn't be extracted
car_cleaned = car_cleaned[car_cleaned['engine(hp)'].notna()]

# Convert to integer safely
car_cleaned['engine(hp)'] = car_cleaned['engine(hp)'].astype(int)


In [1764]:
# checking the dataframe
car_cleaned.head()

Unnamed: 0,brand,year,kilometer,engine,transmission,fuel,nb_of_doors,price,miles,engine(hp)
0,FIAT,2013,48390,135 hp 1.4L I4,5-Speed Manual,Gasoline,2,7787,30068,135
1,FIAT,2014,98119,160 hp 1.4L I4,6-Speed Automatic,Gasoline,4,5800,60968,160
2,FIAT,2013,73482,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2,7495,45660,101
3,FIAT,2012,73873,101 hp 1.4L I4,6-Speed Automatic,Gasoline,2,4999,45903,101
4,FIAT,2013,90056,101 hp 1.4L I4,Automatic,Gasoline,2,5880,55958,101


In [1765]:
# drop the column engine
car_cleaned.drop(columns=['engine'], inplace=True)
car_cleaned.head()

Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,price,miles,engine(hp)
0,FIAT,2013,48390,5-Speed Manual,Gasoline,2,7787,30068,135
1,FIAT,2014,98119,6-Speed Automatic,Gasoline,4,5800,60968,160
2,FIAT,2013,73482,6-Speed Automatic,Gasoline,2,7495,45660,101
3,FIAT,2012,73873,6-Speed Automatic,Gasoline,2,4999,45903,101
4,FIAT,2013,90056,Automatic,Gasoline,2,5880,55958,101


In [1766]:
# Convert transmission values to 'Manual' or 'Automatic'
car_cleaned['transmission'] = car_cleaned['transmission'].apply(
    lambda x: 'Manual' if 'Manual' in str(x) else ('Automatic' if 'Automatic' in str(x) else 'Unknown')
)

# Print the updated DataFrame
car_cleaned


Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,price,miles,engine(hp)
0,FIAT,2013,48390,Manual,Gasoline,2,7787,30068,135
1,FIAT,2014,98119,Automatic,Gasoline,4,5800,60968,160
2,FIAT,2013,73482,Automatic,Gasoline,2,7495,45660,101
3,FIAT,2012,73873,Automatic,Gasoline,2,4999,45903,101
4,FIAT,2013,90056,Automatic,Gasoline,2,5880,55958,101
...,...,...,...,...,...,...,...,...,...
25611,Toyota,2016,112676,Automatic,Gasoline,4,23750,70014,278
25612,Toyota,2014,119863,Automatic,Gasoline,4,23998,74479,381
25614,Toyota,2011,136528,Automatic,Gasoline,4,12559,84835,187
25615,Toyota,2022,43098,Automatic,Gasoline,4,34000,26780,278


In [1767]:
# unique value of fuel type
car_cleaned['fuel'].unique()

array(['Gasoline', '9-Speed Automatic', 'Diesel', 'Flex Fuel Vehicle',
       'Hybrid', 'Continuously Variable Transmission', '5-Speed Manual',
       '8,343', 'Not Rated', 'Black', 'Automatic', '5-Speed Automatic',
       '5-Speed Automatic Overdrive', '200,415', '185,239', 'Manual',
       '95,098', '124,099', 'Biodiesel', 'Leather', '8-Speed Automatic',
       'Gray', 'Brown', '6-Speed Automatic', '44,656'], dtype=object)

In [1768]:
# Remove rows where 'fuel' contains 'Not Rated' or starts with a digit
car_cleaned = car_cleaned[~car_cleaned['fuel'].str.contains(r'Not Rated|^\d', na=False)]

# Print the cleaned DataFrame
car_cleaned


Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,price,miles,engine(hp)
0,FIAT,2013,48390,Manual,Gasoline,2,7787,30068,135
1,FIAT,2014,98119,Automatic,Gasoline,4,5800,60968,160
2,FIAT,2013,73482,Automatic,Gasoline,2,7495,45660,101
3,FIAT,2012,73873,Automatic,Gasoline,2,4999,45903,101
4,FIAT,2013,90056,Automatic,Gasoline,2,5880,55958,101
...,...,...,...,...,...,...,...,...,...
25611,Toyota,2016,112676,Automatic,Gasoline,4,23750,70014,278
25612,Toyota,2014,119863,Automatic,Gasoline,4,23998,74479,381
25614,Toyota,2011,136528,Automatic,Gasoline,4,12559,84835,187
25615,Toyota,2022,43098,Automatic,Gasoline,4,34000,26780,278


In [1769]:
car_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17424 entries, 0 to 25616
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         17424 non-null  object
 1   year          17424 non-null  int64 
 2   kilometer     17424 non-null  int32 
 3   transmission  17424 non-null  object
 4   fuel          17424 non-null  object
 5   nb_of_doors   17424 non-null  object
 6   price         17424 non-null  int32 
 7   miles         17424 non-null  int32 
 8   engine(hp)    17424 non-null  int32 
dtypes: int32(4), int64(1), object(4)
memory usage: 1.1+ MB


In [1770]:
car_cleaned.describe()

Unnamed: 0,year,kilometer,price,miles,engine(hp)
count,17424.0,17424.0,17424.0,17424.0,17424.0
mean,2017.61421,69165.656623,19123.504017,42977.535067,219.926997
std,4.122978,43989.489666,10406.977703,27333.794764,73.107435
min,1981.0,120.0,30.0,75.0,74.0
25%,2015.0,32300.5,11495.0,20070.75,170.0
50%,2018.0,64306.0,17598.0,39958.0,192.0
75%,2021.0,100110.75,24782.25,62205.75,272.0
max,2024.0,287433.0,218981.0,178603.0,707.0


In [1771]:
# Define a function to calculate outliers using IQR
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]

# Check for outliers in each column
outliers_year = detect_outliers(car_cleaned, 'year')
outliers_kilometer = detect_outliers(car_cleaned, 'kilometer')
outliers_price = detect_outliers(car_cleaned, 'price')
outliers_miles = detect_outliers(car_cleaned, 'miles')
outliers_engine_hp = detect_outliers(car_cleaned, 'engine(hp)')

# Print outliers count for each column
print("Outliers in 'year' column:", outliers_year.shape[0])
print("Outliers in 'kilometer' column:", outliers_kilometer.shape[0])
print("Outliers in 'price' column:", outliers_price.shape[0])
print("Outliers in 'miles' column:", outliers_miles.shape[0])
print("Outliers in 'engine(hp)' column:", outliers_engine_hp.shape[0])


Outliers in 'year' column: 127
Outliers in 'kilometer' column: 60
Outliers in 'price' column: 311
Outliers in 'miles' column: 60
Outliers in 'engine(hp)' column: 181


In [1772]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
# Remove outliers from each column
car_cleaned_no_outliers_year = remove_outliers(car_cleaned, 'year')
car_cleaned_no_outliers_kilometer = remove_outliers(car_cleaned_no_outliers_year, 'kilometer')
car_cleaned_no_outliers_price = remove_outliers(car_cleaned_no_outliers_kilometer, 'price')
car_cleaned_no_outliers_miles = remove_outliers(car_cleaned_no_outliers_price, 'miles')
car_cleaned1 = remove_outliers(car_cleaned_no_outliers_miles, 'engine(hp)')

In [1773]:
# Print the shape of the data after removing outliers
print("Data shape after removing outliers:", car_cleaned1.shape)

Data shape after removing outliers: (16803, 9)


In [1774]:
# cars that are gasoline
gasoline_cars = car_cleaned[car_cleaned['fuel'] == 'Gasoline']
gasoline_cars

Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,price,miles,engine(hp)
0,FIAT,2013,48390,Manual,Gasoline,2,7787,30068,135
1,FIAT,2014,98119,Automatic,Gasoline,4,5800,60968,160
2,FIAT,2013,73482,Automatic,Gasoline,2,7495,45660,101
3,FIAT,2012,73873,Automatic,Gasoline,2,4999,45903,101
4,FIAT,2013,90056,Automatic,Gasoline,2,5880,55958,101
...,...,...,...,...,...,...,...,...,...
25611,Toyota,2016,112676,Automatic,Gasoline,4,23750,70014,278
25612,Toyota,2014,119863,Automatic,Gasoline,4,23998,74479,381
25614,Toyota,2011,136528,Automatic,Gasoline,4,12559,84835,187
25615,Toyota,2022,43098,Automatic,Gasoline,4,34000,26780,278


In [1775]:
# replace flex fuel to diesel
car_cleaned1['fuel'] = car_cleaned1['fuel'].replace('Flex Fuel Vehicle', 'Diesel')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_cleaned1['fuel'] = car_cleaned1['fuel'].replace('Flex Fuel Vehicle', 'Diesel')


In [1776]:
# replace the fuel type with diesel;
car_cleaned1['fuel'] = car_cleaned1['fuel'].replace(['Black', 'Brown', 'Gray','Leather','Automatic','Continuously Variable Transmission'], 'Diesel')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_cleaned1['fuel'] = car_cleaned1['fuel'].replace(['Black', 'Brown', 'Gray','Leather','Automatic','Continuously Variable Transmission'], 'Diesel')


In [1777]:
# car runs on diesel
Diesel_cars = car_cleaned1[car_cleaned1['fuel'] == 'Diesel']
Diesel_cars

Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,price,miles,engine(hp)
816,Dodge,2007,175133,Automatic,Diesel,4,18898,108823,325
865,Dodge,2016,132300,Automatic,Diesel,4,5700,82207,283
866,Dodge,2016,39050,Automatic,Diesel,4,15995,24265,283
867,Dodge,2017,133354,Automatic,Diesel,4,6888,82862,283
868,Dodge,2019,109827,Automatic,Diesel,4,11995,68243,283
...,...,...,...,...,...,...,...,...,...
25273,Toyota,2014,191510,Automatic,Diesel,4,17407,118999,381
25298,Toyota,2022,9386,Manual,Diesel,4,39999,5832,278
25341,Toyota,2018,164422,Automatic,Diesel,4,23674,102167,381
25390,Toyota,2014,112255,Automatic,Diesel,4,24900,69752,381


In [1778]:
car_cleaned1['fuel'].unique()

array(['Gasoline', 'Diesel', 'Hybrid'], dtype=object)

In [1779]:
car_cleaned1['transmission'].unique()

array(['Manual', 'Automatic', 'Unknown'], dtype=object)

In [1780]:
# replace the unknown transmission with manual
car_cleaned1['transmission'] = car_cleaned1['transmission'].replace('Unknown', 'Manual')
car_cleaned1['transmission'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  car_cleaned1['transmission'] = car_cleaned1['transmission'].replace('Unknown', 'Manual')


array(['Manual', 'Automatic'], dtype=object)

In [1781]:
# Save the cleaned DataFrame as a CSV file
car_cleaned1.to_csv('old_car_data.csv', index=False)

print("CSV file saved as 'old_car_data.csv'")


CSV file saved as 'old_car_data.csv'


Model


In [1782]:
X=car_cleaned1.drop(columns=['price'])
y=car_cleaned1['price']

In [1783]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [1784]:
X_train

Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,miles,engine(hp)
12351,Volkswagen,2021,47429,Automatic,Gasoline,4,29471,147
595,FIAT,2014,50164,Automatic,Gasoline,4,31170,160
24930,Toyota,2011,171644,Automatic,Gasoline,4,106655,266
11119,Volkswagen,2019,68086,Automatic,Gasoline,4,42307,147
2075,Mitsubishi,2021,60401,Manual,Gasoline,4,37531,148
...,...,...,...,...,...,...,...,...
19102,Chevrolet,2021,43605,Automatic,Gasoline,4,27095,170
19953,Subaru,2019,38816,Manual,Gasoline,4,24119,152
10429,Volkswagen,2014,95000,Automatic,Gasoline,4,59030,200
877,Dodge,2014,95130,Automatic,Diesel,4,59111,283


In [1785]:
X_test

Unnamed: 0,brand,year,kilometer,transmission,fuel,nb_of_doors,miles,engine(hp)
21754,Audi,2012,95730,Automatic,Gasoline,4,59484,310
4096,Cadillac,2019,21669,Automatic,Gasoline,4,13464,237
18632,Jeep,2022,16542,Automatic,Gasoline,4,10279,293
22077,Audi,2020,60758,Automatic,Gasoline,4,37753,228
24027,BMW,2011,95626,Automatic,Gasoline,4,59419,315
...,...,...,...,...,...,...,...,...
19090,Chevrolet,2020,3973,Automatic,Gasoline,4,2469,138
20549,Subaru,2017,96092,Manual,Gasoline,4,59709,268
23077,Hyundai,2017,90805,Automatic,Gasoline,4,56424,185
13855,Kia,2019,57869,Automatic,Gasoline,4,35958,181


In [1786]:
y_test

21754    12500
4096     25800
18632    35900
22077    22981
24027     9999
         ...  
19090    18998
20549    17895
23077    10495
13855    17869
15628    24999
Name: price, Length: 3361, dtype: int32

In [1787]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

In [1788]:
#  creating and fitting a OneHotEncoder from scikit-learn to transform categorical columns (brand, transmission, and fuel) into a numeric format for machine learning.
ohe=OneHotEncoder()
ohe.fit(X[['brand','transmission','fuel']])

In [1789]:
ohe.categories_

[array(['Audi', 'BMW', 'Cadillac', 'Chevrolet', 'Dodge', 'FIAT', 'Honda',
        'Hyundai', 'Jeep', 'Kia', 'Mercedes-Benz', 'Mitsubishi', 'Nissan',
        'Subaru', 'Toyota', 'Volkswagen'], dtype=object),
 array(['Automatic', 'Manual'], dtype=object),
 array(['Diesel', 'Gasoline', 'Hybrid'], dtype=object)]

In [1790]:
# creates a ColumnTransformer named column_trans, which applies specific preprocessing steps to specific columns.
column_trans=make_column_transformer((OneHotEncoder(categories=ohe.categories_),['brand','transmission','fuel']), remainder='passthrough')

In [1791]:
#  creates an instance of the Linear Regression model from sklearn.linear_model.
lr=LinearRegression()

In [1792]:
# creates a pipeline that combines the column transformer and the linear regression model.
pipe=make_pipeline(column_trans,lr)

In [1793]:
# fits the pipeline to the training data (X_train and y_train).
pipe.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [1794]:
# predicts the target variable (price) for the test data (X_test) using the fitted pipeline.
y_pred=pipe.predict(X_test)

In [1795]:
# r2_score function from sklearn.metrics calculates the R-squared score, which indicates how well the model explains the variance in the target variable.
r2_score(y_pred,y_test)

0.7645570415957168

In [1796]:
# 10 R² scores using different random splits of your data
scores=[]
for i in range(10):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=i)
    lr=LinearRegression()
    pipe=make_pipeline(column_trans,lr)
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    print(r2_score(y_test,y_pred))
    scores.append(r2_score(y_pred,y_test))

0.8213979969835405
0.8119754855505629
0.8113063448136921
0.816778914913814
0.8115497654880345
0.8240107251021829
0.8137362801184622
0.8166836586247859
0.8106889645106232
0.8162187743048893


In [1797]:
# which train-test split (from the 10 iterations) gave the best model performance.
np.argmax(scores)

5

In [1798]:
scores[np.argmax(scores)]

0.7899335046626657

In [None]:
# give you the best R² score from the model trained on the most optimal data split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=np.argmax(scores))
lr=LinearRegression()
pipe=make_pipeline(column_trans,lr)
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

0.8240107251021829

In [1800]:
import pickle

In [1801]:
# save the trained model so that you can load it later without retraining, for example when deploying it or making predictions on new data.
pickle.dump(pipe,open('car_price_model.pkl','wb'))