# Project: Used Car Price Prediction using Linear Regression 
       
The goal of this project is to predict the used car price based on various features.
The dataset can be downloaded from the link below:
https://www.kaggle.com/datasets/ananaymital/us-used-cars-dataset

In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# read in the dataset
# Show every column when a frame is displayed. Use the public pd.set_option
# entry point rather than reaching it through the incidental pd.pandas attribute.
pd.set_option('display.max_columns', None)

# Start the clock immediately before the read so only the CSV load is timed.
start = time.time()
df = pd.read_csv('preprocessed_dataset.csv')
print("Time taken to read dataset: %f seconds" % (time.time()-start))
print("\nDataset: %d * %d" %(df.shape[0], df.shape[1]))

# Last expression of the cell: rich display of the first five rows.
df.head()

Time taken to read dataset: 10.880542 seconds

Dataset: 3000000 * 38


Unnamed: 0,price,back_legroom,body_type,city,city_fuel_economy,daysonmarket,engine_displacement,engine_type,fleet,frame_damaged,franchise_dealer,franchise_make,front_legroom,fuel_tank_volume,fuel_type,has_accidents,height,highway_fuel_economy,horsepower,isCab,is_new,length,listing_color,listing_id,make_name,maximum_seating,mileage,model_name,salvage,savings_amount,sp_id,theft_title,transmission,transmission_display,wheel_system,wheelbase,width,year
0,23141.0,0.511416,0.555556,0.05143,0.130777,0.14504,0.077922,0.153846,0.212869,0.009502,1.0,0.4375,0.425743,0.115385,0.625,0.154406,0.557203,0.166439,0.128964,0.182364,1.0,0.156699,1.0,0.828716,0.464646,0.583333,7.000001e-08,0.731092,0.006691,0.0,0.823837,0.003347,0.0,0.886364,0.6,0.024845,0.592982,0.923077
1,46500.0,0.648402,0.555556,0.781263,0.130777,0.057516,0.168831,0.153846,0.212869,0.009502,1.0,0.5,0.227723,0.362637,0.625,0.154406,0.588983,0.166439,0.201903,0.182364,1.0,0.324163,0.0,0.938659,0.535354,0.75,8.000001e-08,0.298319,0.006691,0.0,0.870482,0.003347,0.0,0.886364,0.4,0.15528,0.792982,0.961538
2,46995.0,0.525114,0.666667,0.354247,0.083333,0.342595,0.233766,0.0,0.0,0.0,1.0,0.229167,0.633663,0.263736,0.625,0.0,0.381356,0.111111,0.264271,0.0,0.0,0.322967,0.857143,0.585812,0.89899,0.583333,0.0003114691,0.951681,0.0,0.0,0.823507,0.0,0.75,0.568182,0.4,0.089027,0.568421,0.807692
3,67430.0,0.625571,0.555556,0.781263,0.130777,0.05446,0.298701,0.564103,0.212869,0.009502,1.0,0.5,0.217822,0.631868,0.625,0.154406,0.694915,0.166439,0.301268,0.182364,1.0,0.491627,0.285714,0.942341,0.535354,0.75,1.1e-07,0.296919,0.006691,0.0,0.870482,0.003347,0.0,0.772727,0.4,0.293996,0.849123,0.961538
4,48880.0,0.648402,0.555556,0.781263,0.130777,0.038066,0.168831,0.153846,0.212869,0.009502,1.0,0.5,0.227723,0.362637,0.625,0.154406,0.588983,0.166439,0.201903,0.182364,1.0,0.324163,0.0,0.95778,0.535354,0.75,7.000001e-08,0.298319,0.006691,0.0,0.870482,0.003347,0.0,0.886364,0.4,0.15528,0.792982,0.961538


## Build Model

### 1- Train and test data splitting

In [4]:
# Columns that must not be used as predictors:
#   'price'      - the regression target.
#   'Unnamed: 0' - the name pandas gives to a header-less CSV column; in this
#                  file it holds 0, 1, 2, ... (see df.head()), which looks like
#                  a row index saved by an earlier to_csv call rather than a
#                  property of the car. If the column is absent the filter is
#                  simply a no-op.
feature_train = [feature for feature in df.columns if feature not in ['price', 'Unnamed: 0']]
X = df[feature_train]
y = df['price']

# Divide the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Number of training rows
print(len(y_train))

2400000


### Model 1: LinearRegression

In [5]:
# Model 1: linear regression fitted on the full training split.
lregr = LinearRegression().fit(X_train, y_train)

# Mean squared error on the rows the model was fitted on.
pred_train_l = lregr.predict(X_train)
mse_train_l = mean_squared_error(y_train, pred_train_l)

# Mean squared error on the held-out rows.
pred_test_l = lregr.predict(X_test)
mse_test_l = mean_squared_error(y_test, pred_test_l)

print("Train MSE: %.2f" % (mse_train_l,))
print("Test  MSE: %.2f" % (mse_test_l,))


Train MSE: 173193306.91
Test  MSE: 168374771.19


### Model 2: RandomForestRegressor

In [6]:
# Sample a portion of the rows

# Draw 30% of the rows without replacement. The random forest cells below are
# fitted on this subsample, so pin random_state: without it every run draws a
# different subsample and none of the reported forest metrics can be reproduced.
df2 = df.sample(frac=0.30, replace=False, random_state=42)
print("Sample Dataset: %d * %d" %(df2.shape[0], df2.shape[1]))


Sample Dataset: 900000 * 38


In [7]:
# Target and predictors taken from the sampled frame.
y2 = df2['price']
X2 = df2[feature_train]

# Divide the sampled data into training and testing sets (80% / 20%, seed 42)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

# Number of training rows
print(y2_train.shape[0])

720000


In [10]:

# Model 2: random forest of 100 trees, fitted on the sampled training split.
rfregr = RandomForestRegressor(n_estimators=100, random_state=42).fit(X2_train, y2_train)

# Mean squared error on the rows the forest was fitted on.
train_pred_rf = rfregr.predict(X2_train)
mse_train_rf = mean_squared_error(y2_train, train_pred_rf)

# Mean squared error on the held-out rows.
test_pred_rf = rfregr.predict(X2_test)
mse_test_rf = mean_squared_error(y2_test, test_pred_rf)

print("Train MSE: %.2f" % (mse_train_rf,))
print("Test  MSE: %.2f" % (mse_test_rf,))


Train MSE: 6989776.33
Test  MSE: 80797076.54


In [11]:
# Feature importance analysis

# Rank the fitted forest's importances from largest to smallest. The Series
# keeps its default integer index, so each label is a position in the
# importance array, not a column name.
feature_imp = pd.Series(rfregr.feature_importances_).sort_values(ascending=False)

# Keep only the entries with strictly positive importance.
nonzero_imp = feature_imp[feature_imp > 0.0]
print(nonzero_imp)

# Positions of the retained features, in descending order of importance.
feature_idx_list = nonzero_imp.index
feature_idx = np.array(feature_idx_list)

print("Selected features:", len(feature_idx))
print("Selected feature indices:", feature_idx)

17    0.415274
36    0.136256
25    0.087941
3     0.035635
11    0.030100
26    0.026806
23    0.025968
20    0.024492
28    0.018000
12    0.017443
6     0.015904
13    0.015006
35    0.013714
15    0.013433
0     0.012966
24    0.011549
2     0.011000
33    0.010547
22    0.010191
29    0.009939
7     0.009227
16    0.007919
34    0.006796
4     0.005896
10    0.005603
1     0.005556
5     0.004789
21    0.003460
19    0.003157
32    0.002434
31    0.001405
9     0.000319
18    0.000283
30    0.000271
27    0.000270
14    0.000254
8     0.000197
dtype: float64
Selected features: 37
Selected feature indices: [17 36 25  3 11 26 23 20 28 12  6 13 35 15  0 24  2 33 22 29  7 16 34  4
 10  1  5 21 19 32 31  9 18 30 27 14  8]


In [12]:
# Translate the five highest-ranked positions back into column names.
print('Top 5 important features: ')
top_positions = feature_idx_list[:5]
for position in top_positions:
    print(feature_train[position])

Top 5 important features: 
horsepower
year
mileage
city_fuel_economy
front_legroom


In [13]:
# NOTE(review): this selects the same columns (feature_train) from the same
# frame (df2) as the X2/y2 split; only the split seed differs (1 instead of 42).
# Confirm whether a reduced feature set based on feature_idx was intended here.
y3 = df2['price']
X3 = df2[feature_train]

# Divide the sampled data into training and testing sets (80% / 20%, seed 1)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3,
    y3,
    test_size=0.2,
    random_state=1,
)

# Number of training rows
print(y3_train.shape[0])

720000


In [14]:
# Second random forest (100 trees, seed 1) fitted on the X3 training split.
# This rebinds rfregr, so the forest fitted on the X2 split is no longer
# reachable through that name once this cell has run.
rfregr = RandomForestRegressor(n_estimators=100, random_state=1).fit(X3_train, y3_train)

# Mean squared error on the rows the forest was fitted on.
train_pred_rf2 = rfregr.predict(X3_train)
mse_train_rf2 = mean_squared_error(y3_train, train_pred_rf2)

# Mean squared error on the held-out rows.
test_pred_rf2 = rfregr.predict(X3_test)
mse_test_rf2 = mean_squared_error(y3_test, test_pred_rf2)

print("Train MSE: %.2f" % (mse_train_rf2,))
print("Test  MSE: %.2f" % (mse_test_rf2,))


Train MSE: 7568950.06
Test  MSE: 61660340.23
