In [304]:
## making the linear regression to predict the future value of cars
## making imports
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import sqlite3

In [305]:
#loading file ~~ can be adjusted later but using columns from sql database:
#price, year, manufacturer, condition, cylinders, odometer, title_status, transmission, size, state, posting_date
file_path = Path("../data/vehicles_clean.csv")


In [306]:
#making dataframe
used_cars_df = pd.read_csv(file_path)

used_cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80170 entries, 0 to 80169
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         80170 non-null  int64  
 1   year          80170 non-null  int64  
 2   manufacturer  80170 non-null  object 
 3   condition     80170 non-null  object 
 4   cylinders     80170 non-null  object 
 5   fuel          80170 non-null  object 
 6   odometer      80170 non-null  float64
 7   title_status  80170 non-null  object 
 8   transmission  80170 non-null  object 
 9   drive         80170 non-null  object 
 10  size          80170 non-null  object 
 11  type          80170 non-null  object 
 12  paint_color   80170 non-null  object 
 13  state         80170 non-null  object 
 14  posting_date  80170 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 9.2+ MB


In [307]:
#dropping cars that are free
mask = used_cars_df['price'] != 0
used_cars_df = used_cars_df.loc[~mask]

In [308]:
#dropping un-drivable cars
# mask2 = used_cars_df['title_status'] == 'parts only','salvage'
# used_cars_df = used_cars_df.loc[~mask2]
used_cars_df.condition.unique()

array(['like new', 'excellent', 'new', 'good', 'fair', 'salvage'],
      dtype=object)

In [309]:
used_cars_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3478 entries, 8 to 80169
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         3478 non-null   int64  
 1   year          3478 non-null   int64  
 2   manufacturer  3478 non-null   object 
 3   condition     3478 non-null   object 
 4   cylinders     3478 non-null   object 
 5   fuel          3478 non-null   object 
 6   odometer      3478 non-null   float64
 7   title_status  3478 non-null   object 
 8   transmission  3478 non-null   object 
 9   drive         3478 non-null   object 
 10  size          3478 non-null   object 
 11  type          3478 non-null   object 
 12  paint_color   3478 non-null   object 
 13  state         3478 non-null   object 
 14  posting_date  3478 non-null   object 
dtypes: float64(1), int64(2), object(12)
memory usage: 434.8+ KB


In [310]:
# Cleaning the Data
used_cars_df = used_cars_df.drop('posting_date', axis=1)

In [311]:
# Label encoding manufacturer, cylinders, fuel, transmission, drive, size, type, paint_color, state
used_cars_df['manufacturer'] = used_cars_df['manufacturer'].astype('category')
used_cars_df['cylinders'] = used_cars_df['cylinders'].astype('category')
used_cars_df['fuel'] = used_cars_df['fuel'].astype('category')
used_cars_df['transmission'] = used_cars_df['transmission'].astype('category')
used_cars_df['drive'] = used_cars_df['drive'].astype('category')
used_cars_df['size'] = used_cars_df['size'].astype('category')
used_cars_df['type'] = used_cars_df['type'].astype('category')
used_cars_df['paint_color'] = used_cars_df['paint_color'].astype('category')
used_cars_df['state'] = used_cars_df['state'].astype('category')
used_cars_df['year'] = used_cars_df['year'].astype('category')

In [312]:
#label encoding continued
used_cars_df["manufacturer"] = used_cars_df["manufacturer"].cat.codes
used_cars_df["cylinders"] = used_cars_df["cylinders"].cat.codes
used_cars_df["fuel"] = used_cars_df["fuel"].cat.codes
used_cars_df["transmission"] = used_cars_df["transmission"].cat.codes
used_cars_df["drive"] = used_cars_df["drive"].cat.codes
used_cars_df["size"] = used_cars_df["size"].cat.codes
used_cars_df["type"] = used_cars_df["type"].cat.codes
used_cars_df["paint_color"] = used_cars_df["paint_color"].cat.codes
used_cars_df["state"] = used_cars_df["state"].cat.codes
used_cars_df["year"] = used_cars_df["year"].cat.codes

In [313]:
used_cars_df.head(5)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
8,0,49,5,like new,5,2,68472.0,clean,0,2,1,11,10,1
9,0,50,5,like new,5,2,69125.0,clean,0,2,1,11,10,1
10,0,49,5,like new,5,2,66555.0,clean,0,2,1,11,10,1
21,0,48,5,like new,6,0,102000.0,clean,0,0,1,10,9,1
39,0,48,5,like new,6,0,102000.0,clean,0,0,1,10,9,1


In [314]:
# Importing Encoder
or_enc = OrdinalEncoder()
ohe = OneHotEncoder()

In [315]:
# Ordinal Encoding
used_cars_df['title_status'] = or_enc.fit_transform(used_cars_df[['title_status']])
used_cars_df['condition'] = or_enc.fit_transform(used_cars_df[['condition']])


In [316]:
used_cars_df.head(20)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,state
8,0,49,5,3.0,5,2,68472.0,0.0,0,2,1,11,10,1
9,0,50,5,3.0,5,2,69125.0,0.0,0,2,1,11,10,1
10,0,49,5,3.0,5,2,66555.0,0.0,0,2,1,11,10,1
21,0,48,5,3.0,6,0,102000.0,0.0,0,0,1,10,9,1
39,0,48,5,3.0,6,0,102000.0,0.0,0,0,1,10,9,1
40,0,47,28,3.0,5,0,120000.0,0.0,0,0,1,10,0,1
79,0,48,7,3.0,6,2,39000.0,0.0,0,2,2,3,4,1
92,0,45,28,3.0,5,0,1111111.0,0.0,0,0,1,10,10,1
95,0,49,5,3.0,6,0,1111111.0,0.0,0,0,1,10,0,1
96,0,48,10,3.0,6,2,88000.0,0.0,0,0,1,10,3,1


In [317]:
X = used_cars_df.drop('price', axis =1)
y= used_cars_df['price']

In [318]:
#train & test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42)

print("training shape: ", X_train.shape, y_train.shape)
print("testing shape: ", X_test.shape, y_test.shape)

training shape:  (2782, 13) (2782,)
testing shape:  (696, 13) (696,)


In [319]:
# the model
lr = LinearRegression()

In [320]:
#fit the model
lr.fit(X_train,y_train)

LinearRegression()

In [321]:
#predict on testing set
y_pred = lr.predict(X_test)

In [322]:
# Evaluate model performance
r2 = r2_score(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
intercept = lr.intercept_

print("r2 score: ", r2)
print("mean squared error: ", mse)
print("mean absolute error: ", mae)
print("intercept: ", intercept)

r2 score:  1.0
mean squared error:  0.0
mean absolute error:  0.0
intercept:  0.0
