## Car Price Predicter

 This project is a basic Machine Learning program where it takes Car-Price Data from Keggle (https://www.kaggle.com/austinreese/craigslist-carstrucks-data) and make LinearRegression Model to predict what is the best price to sell a car with different features (i.e. location, condition, colors, etc)

### 1. Import needed classes

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
from sklearn.preprocessing import LabelEncoder, StandardScaler # data Processing 
from sklearn.model_selection import train_test_split # Data Splitting
from sklearn.linear_model import LinearRegression # LinearRegression Model
import matplotlib.pyplot as plt
import os
import warnings

print(os.listdir("../Car Price Predicter/input"))

['car_data_clean.csv', 'vehicles.csv']


In [2]:
df = pd.read_csv("../Car Price Predicter/input/vehicles.csv")

Before Making any changes to our DataFrame, let's display it so that which column / rows we need to fix

In [3]:
pd.options.display.max_columns = 30
pd.options.display.max_colwidth = 25
display(df)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,vin,drive,size,type,paint_color,image_url,description,county,state,lat,long
0,7119256118,https://mohave.craigs...,mohave county,https://mohave.craigs...,3495,2012.0,jeep,patriot,like new,4 cylinders,gas,,clean,automatic,,,,,silver,https://images.craigs...,THIS 2012 JEEP PATRIO...,,az,34.4554,-114.2690
1,7120880186,https://oregoncoast.c...,oregon coast,https://oregoncoast.c...,13750,2014.0,bmw,328i m-sport,good,,gas,76237.0,clean,automatic,,rwd,,sedan,grey,https://images.craigs...,Selling my 2014 BMW 3...,,or,46.1837,-123.8240
2,7115048251,https://greenville.cr...,greenville / upstate,https://greenville.cr...,2300,2001.0,dodge,caravan,excellent,6 cylinders,gas,199000.0,clean,automatic,,,,,,https://images.craigs...,"01 DODGE CARAVAN,3.3 ...",,sc,34.9352,-81.9654
3,7119250502,https://mohave.craigs...,mohave county,https://mohave.craigs...,9000,2004.0,chevrolet,colorado ls,excellent,5 cylinders,gas,54000.0,clean,automatic,1GCCS196448191644,rwd,mid-size,pickup,red,https://images.craigs...,2004 Chevy Colorado L...,,az,34.4783,-114.2710
4,7120433904,https://maine.craigsl...,maine,https://maine.craigsl...,0,2021.0,,Honda-Nissan-Kia-Ford...,,,other,,clean,other,,,,,,https://images.craigs...,CALL: 207.548.6500 TE...,,me,44.4699,-68.8963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435844,7119262300,https://mohave.craigs...,mohave county,https://mohave.craigs...,2500,2005.0,ford,f150,fair,,gas,282866.0,clean,automatic,,,full-size,truck,white,https://images.craigs...,2005 F150 XLT 5.4 TRI...,,az,35.2414,-113.9890
435845,7112219717,https://rapidcity.cra...,rapid city / west SD,https://rapidcity.cra...,2700,2002.0,toyota,camry,good,6 cylinders,gas,194000.0,clean,automatic,,fwd,,,blue,https://images.craigs...,2002 Toyota Camry. 4...,,sd,44.0036,-103.3590
435846,7120896708,https://oregoncoast.c...,oregon coast,https://oregoncoast.c...,2450,2001.0,ford,focus,good,4 cylinders,gas,130484.0,clean,automatic,,rwd,compact,other,black,https://images.craigs...,I am selling my ford ...,,or,45.5272,-123.0950
435847,7120885819,https://oregoncoast.c...,oregon coast,https://oregoncoast.c...,8995,2013.0,mazda,mazda3,,,gas,93339.0,clean,automatic,JM1BL1UPXD1758084,fwd,,sedan,,https://images.craigs...,Simple Easy Car Buyin...,,or,45.5231,-122.5790


### 2. Clean Data

In [4]:
# Features to drop:
drop_features = ['id','url', 'model','region','region_url','vin','drive','size','type','image_url','description','county','lat','long']
df.drop(columns=drop_features, inplace=True)
display(df)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
0,3495,2012.0,jeep,like new,4 cylinders,gas,,clean,automatic,silver,az
1,13750,2014.0,bmw,good,,gas,76237.0,clean,automatic,grey,or
2,2300,2001.0,dodge,excellent,6 cylinders,gas,199000.0,clean,automatic,,sc
3,9000,2004.0,chevrolet,excellent,5 cylinders,gas,54000.0,clean,automatic,red,az
4,0,2021.0,,,,other,,clean,other,,me
...,...,...,...,...,...,...,...,...,...,...,...
435844,2500,2005.0,ford,fair,,gas,282866.0,clean,automatic,white,az
435845,2700,2002.0,toyota,good,6 cylinders,gas,194000.0,clean,automatic,blue,sd
435846,2450,2001.0,ford,good,4 cylinders,gas,130484.0,clean,automatic,black,or
435847,8995,2013.0,mazda,,,gas,93339.0,clean,automatic,,or


In [5]:
#Replace / drop rows that contains NaN data
df.isna().any()
df.dropna(axis=0, how='any', inplace=True)

In [6]:
#display cleaned_data
display(df)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
3,9000,2004.0,chevrolet,excellent,5 cylinders,gas,54000.0,clean,automatic,red,az
7,8500,2005.0,ford,excellent,6 cylinders,gas,62800.0,clean,automatic,silver,me
12,2750,2006.0,chevrolet,good,8 cylinders,gas,165000.0,clean,automatic,white,me
19,24930,2017.0,subaru,excellent,4 cylinders,gas,32989.0,clean,automatic,grey,mt
26,3200,1998.0,volkswagen,good,4 cylinders,gas,98186.0,clean,manual,blue,mt
...,...,...,...,...,...,...,...,...,...,...,...
435840,24900,2017.0,audi,excellent,4 cylinders,gas,27100.0,clean,automatic,silver,mt
435841,9995,2008.0,buick,excellent,6 cylinders,gas,145975.0,clean,automatic,brown,va
435843,46995,2019.0,ford,like new,8 cylinders,diesel,55000.0,clean,automatic,white,sc
435845,2700,2002.0,toyota,good,6 cylinders,gas,194000.0,clean,automatic,blue,sd


In [7]:
# Idea from https://www.kaggle.com/vbmokin/used-cars-price-prediction-by-15-models
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
categorical_columns = []
features = df.columns.values.tolist()
for col in features:
    if df[col].dtype in numerics: continue
    categorical_columns.append(col)
# Encoding categorical features
for col in categorical_columns:
    if col in df.columns:
        le = LabelEncoder()
        le.fit(list(df[col].astype(str).values))
        df[col] = le.transform(list(df[col].astype(str).values))

In [8]:
df['year'] = (df['year']-1900).astype(int)
df['odometer'] = df['odometer'].astype(int)

df.head(5)

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
3,9000,104,7,0,4,2,54000,0,0,8,3
7,8500,105,13,0,5,2,62800,0,0,9,21
12,2750,106,7,2,6,2,165000,0,0,10,21
19,24930,117,37,0,3,2,32989,0,0,5,26
26,3200,98,40,2,3,2,98186,0,1,1,26


### 3. Clean Data pt.2

In [9]:
df.corr()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
price,1.0,0.017142,-0.005606,0.002344,0.010889,-0.008571,-0.011167,-0.002802,0.008494,0.003449,0.000271
year,0.017142,1.0,0.033773,0.008559,-0.161166,0.043297,-0.194442,0.072195,0.006851,0.024151,0.007892
manufacturer,-0.005606,0.033773,1.0,-0.012479,-0.233917,-0.011401,0.011169,0.031565,0.046769,-0.016907,-0.008864
condition,0.002344,0.008559,-0.012479,1.0,0.040493,0.028047,-0.016077,0.017264,0.110503,0.002943,0.007305
cylinders,0.010889,-0.161166,-0.233917,0.040493,1.0,-0.127531,0.051151,-0.101891,0.018389,0.03436,0.012261
fuel,-0.008571,0.043297,-0.011401,0.028047,-0.127531,1.0,-0.067986,0.016144,0.071203,-0.05497,-0.027154
odometer,-0.011167,-0.194442,0.011169,-0.016077,0.051151,-0.067986,1.0,-0.052935,-0.072415,0.00234,0.002563
title_status,-0.002802,0.072195,0.031565,0.017264,-0.101891,0.016144,-0.052935,1.0,-0.042996,-0.018624,0.00922
transmission,0.008494,0.006851,0.046769,0.110503,0.018389,0.071203,-0.072415,-0.042996,1.0,-0.019236,-0.01766
paint_color,0.003449,0.024151,-0.016907,0.002943,0.03436,-0.05497,0.00234,-0.018624,-0.019236,1.0,-0.02254


In [10]:
# Also from https://www.kaggle.com/vbmokin/used-cars-price-prediction-by-15-models (Thank you so much!)
df['price'].value_counts()

0        5031
4500     1642
5995     1543
3500     1523
5500     1511
         ... 
5050        1
3001        1
952         1
23475       1
12218       1
Name: price, Length: 6088, dtype: int64

In [11]:
# Make Price at least $1000 or more 
df = df[df['price'] > 1000]
# Make Price at most $50000 or less
df = df[df['price'] < 50000]
# Round odometer by 1000
df['odometer'] = df['odometer'] // 1000
# Take out cars that are above 100 
df = df[df['year'] > 100]

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 127355 entries, 3 to 435846
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   price         127355 non-null  int64
 1   year          127355 non-null  int32
 2   manufacturer  127355 non-null  int64
 3   condition     127355 non-null  int64
 4   cylinders     127355 non-null  int64
 5   fuel          127355 non-null  int64
 6   odometer      127355 non-null  int32
 7   title_status  127355 non-null  int64
 8   transmission  127355 non-null  int64
 9   paint_color   127355 non-null  int64
 10  state         127355 non-null  int64
dtypes: int32(2), int64(9)
memory usage: 10.7 MB


In [12]:
df.corr()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
price,1.0,0.596346,-0.050691,0.086308,0.319332,-0.178897,-0.305048,-0.053669,0.211737,0.041993,-0.002301
year,0.596346,1.0,-0.006579,0.050919,-0.143384,0.081185,-0.356705,0.1004,0.130102,0.014544,0.006029
manufacturer,-0.050691,-0.006579,1.0,-0.013478,-0.224236,-0.012507,0.006113,0.030668,0.047148,-0.021093,-0.006199
condition,0.086308,0.050919,-0.013478,1.0,0.040033,0.03026,-0.033063,0.013229,0.124631,0.004762,0.011285
cylinders,0.319332,-0.143384,-0.224236,0.040033,1.0,-0.127259,0.069253,-0.107818,0.047084,0.036917,0.005773
fuel,-0.178897,0.081185,-0.012507,0.03026,-0.127259,1.0,-0.09092,0.016048,0.089261,-0.053262,-0.020964
odometer,-0.305048,-0.356705,0.006113,-0.033063,0.069253,-0.09092,1.0,-0.079304,-0.110351,0.008472,0.005143
title_status,-0.053669,0.1004,0.030668,0.013229,-0.107818,0.016048,-0.079304,1.0,-0.046113,-0.020881,0.01149
transmission,0.211737,0.130102,0.047148,0.124631,0.047084,0.089261,-0.110351,-0.046113,1.0,-0.017191,-0.017861
paint_color,0.041993,0.014544,-0.021093,0.004762,0.036917,-0.053262,0.008472,-0.020881,-0.017191,1.0,-0.023135


In [13]:
df.describe()

Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
count,127355.0,127355.0,127355.0,127355.0,127355.0,127355.0,127355.0,127355.0,127355.0,127355.0,127355.0
mean,12918.919218,111.052146,19.089521,1.176357,4.589439,1.909968,106.279801,0.270292,0.152055,5.713031,23.934035
std,9058.298783,4.585259,11.661144,1.174355,1.265369,0.55796,102.567258,1.026154,0.474787,4.013322,14.957312
min,1020.0,101.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5995.0,108.0,10.0,0.0,3.0,2.0,59.0,0.0,0.0,1.0,9.0
50%,9999.0,112.0,14.0,2.0,5.0,2.0,101.0,0.0,0.0,8.0,23.0
75%,17790.0,115.0,31.0,2.0,6.0,2.0,142.0,0.0,0.0,10.0,37.0
max,49999.0,121.0,41.0,5.0,7.0,4.0,10000.0,5.0,2.0,11.0,50.0


### 4. Make and Train Model

In [14]:
y = df['price']
X = df.drop(['price'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [17]:
X_train.head(5)

Unnamed: 0,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,paint_color,state
0,-1.536162,-0.781239,0.703379,-1.253113,0.160057,0.567794,-0.26246,-0.320552,-0.178668,-1.200267
1,-0.009632,-0.523885,0.703379,1.11547,0.160057,1.195974,-0.26246,-0.320552,1.067287,-0.598584
2,-2.190389,-1.038594,-1.000714,1.11547,0.160057,0.117755,-0.26246,-0.320552,-0.178668,-0.799145
3,-0.445783,0.505533,0.703379,0.325942,0.160057,0.155258,-0.26246,-0.320552,0.568905,-0.197462
4,1.080746,-0.523885,0.703379,-1.253113,0.160057,-0.576056,-0.26246,-0.320552,-1.424623,-1.467681


In [19]:
linearRegressor = LinearRegression()

In [20]:
linearRegressor.fit(X_train, y_train)

LinearRegression()

### 5. Test Model

In [22]:
linearRegressor.predict(X_test)

array([511725.75724148, 541110.87281804, 590773.3021239 , ...,
       583316.73204146, 501144.77862364, 566750.02474918])