In [38]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
# Open the SQLite database, load the whole `cars` table into a DataFrame,
# and always release the connection afterwards (even if the query fails).
cnx = sqlite3.connect('car_price.db')
try:
    cars = pd.read_sql_query("SELECT * FROM cars", cnx)
finally:
    # The connection is only needed for the single read above.
    cnx.close()

In [39]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            410 non-null    int64  
 1   symboling         410 non-null    int64  
 2   fueltype          410 non-null    object 
 3   aspiration        410 non-null    object 
 4   doornumber        410 non-null    object 
 5   carbody           410 non-null    object 
 6   drivewheel        410 non-null    object 
 7   enginelocation    410 non-null    object 
 8   wheelbase         410 non-null    float64
 9   carlength         410 non-null    float64
 10  carwidth          410 non-null    float64
 11  carheight         410 non-null    float64
 12  curbweight        410 non-null    int64  
 13  enginetype        410 non-null    object 
 14  cylindernumber    410 non-null    object 
 15  enginesize        410 non-null    int64  
 16  fuelsystem        410 non-null    object 
 1

In [40]:
# Show every column name of the loaded table as a plain Python list.
cars.columns.tolist()

['car_ID',
 'symboling',
 'fueltype',
 'aspiration',
 'doornumber',
 'carbody',
 'drivewheel',
 'enginelocation',
 'wheelbase',
 'carlength',
 'carwidth',
 'carheight',
 'curbweight',
 'enginetype',
 'cylindernumber',
 'enginesize',
 'fuelsystem',
 'boreratio',
 'stroke',
 'compressionratio',
 'horsepower',
 'peakrpm',
 'citympg',
 'highwaympg',
 'price',
 'car_company']

In [41]:
# Cast `symboling` from int64 to object dtype so that the later
# `select_dtypes(include=['object'])` step picks it up as categorical,
# then re-check the dtypes.
cars = cars.astype({'symboling': 'object'})
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 410 entries, 0 to 409
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            410 non-null    int64  
 1   symboling         410 non-null    object 
 2   fueltype          410 non-null    object 
 3   aspiration        410 non-null    object 
 4   doornumber        410 non-null    object 
 5   carbody           410 non-null    object 
 6   drivewheel        410 non-null    object 
 7   enginelocation    410 non-null    object 
 8   wheelbase         410 non-null    float64
 9   carlength         410 non-null    float64
 10  carwidth          410 non-null    float64
 11  carheight         410 non-null    float64
 12  curbweight        410 non-null    int64  
 13  enginetype        410 non-null    object 
 14  cylindernumber    410 non-null    object 
 15  enginesize        410 non-null    int64  
 16  fuelsystem        410 non-null    object 
 1

In [42]:
# Predictors: an explicit column list (omits `car_ID` and the target `price`).
X = cars[[
    'symboling', 'fueltype', 'aspiration', 'doornumber', 'carbody',
    'drivewheel', 'enginelocation',
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginetype', 'cylindernumber', 'enginesize', 'fuelsystem',
    'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
    'car_company',
]]

# Target: the value the regression is trained to predict.
Y = cars.loc[:, 'price']

In [43]:
# Keep only the object-dtype (categorical) predictor columns and preview them.
cars_categorical = X.select_dtypes(include='object')
cars_categorical.head()

Unnamed: 0,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,enginetype,cylindernumber,fuelsystem,car_company
0,3,gas,std,two,convertible,rwd,front,dohc,four,mpfi,alfa-romero
1,3,gas,std,two,convertible,rwd,front,dohc,four,mpfi,alfa-romero
2,1,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,alfa-romero
3,2,gas,std,four,sedan,fwd,front,ohc,four,mpfi,audi
4,2,gas,std,four,sedan,4wd,front,ohc,five,mpfi,audi


In [44]:
# Convert the categorical columns to dummy (indicator) columns; the first
# level of each variable is dropped (drop_first=True).
cars_dummies = pd.get_dummies(data=cars_categorical, drop_first=True)
cars_dummies.head()

Unnamed: 0,symboling_-1,symboling_0,symboling_1,symboling_2,symboling_3,fueltype_gas,aspiration_turbo,doornumber_two,carbody_hardtop,carbody_hatchback,...,car_company_nissan,car_company_peugeot,car_company_plymouth,car_company_porsche,car_company_renault,car_company_saab,car_company_subaru,car_company_toyota,car_company_volkswagen,car_company_volvo
0,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# Remove the raw categorical columns from X, leaving the remaining predictors.
# NOTE(review): re-running this cell without re-running the X/Y split cell
# raises KeyError, because the columns are already gone.
X = X.drop(columns=cars_categorical.columns)

In [46]:
# Append the dummy-encoded columns to the right of the remaining predictors.
# NOTE(review): re-running this cell appends the dummy columns a second time.
X = pd.concat(objs=[X, cars_dummies], axis='columns')

In [47]:
# scaling the features
from sklearn.preprocessing import scale
import statsmodels.api as sm
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score

# `scale` returns a bare numpy array, so remember the column labels first and
# re-attach them when wrapping the scaled values back into a DataFrame.
# NOTE(review): scaling is applied to the full X here, before the later
# train/test split, so the test rows take part in the scaling computation.
cols = X.columns
X = pd.DataFrame(scale(X), columns=cols)
X.columns

Index(['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
       'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
       'peakrpm', 'citympg', 'highwaympg', 'symboling_-1', 'symboling_0',
       'symboling_1', 'symboling_2', 'symboling_3', 'fueltype_gas',
       'aspiration_turbo', 'doornumber_two', 'carbody_hardtop',
       'carbody_hatchback', 'carbody_sedan', 'carbody_wagon', 'drivewheel_fwd',
       'drivewheel_rwd', 'enginelocation_rear', 'enginetype_dohcv',
       'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',
       'cylindernumber_six', 'cylindernumber_three', 'cylindernumber_twelve',
       'cylindernumber_two', 'fuelsystem_2bbl', 'fuelsystem_4bbl',
       'fuelsystem_idi', 'fuelsystem_mfi', 'fuelsystem_mpfi',
       'fuelsystem_spdi', 'fuelsystem_spfi', 'car_company_audi',
       'car_company_bmw', 'car_company_buick', 'car_company_chevrolet',


In [48]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [49]:
# Recursive feature elimination around a plain linear regression, keeping
# 6 features. `n_features_to_select` is passed by keyword: recent
# scikit-learn releases reject it as a positional argument.
lm = LinearRegression()
rfe_6 = RFE(lm, n_features_to_select=6)

# Fit on the training split; `fit` returns the fitted RFE object itself,
# so `model` and `rfe_6` refer to the same estimator.
model = rfe_6.fit(X_train, y_train)

# Predict prices for the held-out split.
y_pred = rfe_6.predict(X_test)

# R-squared of the predictions on the held-out split.
print(r2_score(y_test, y_pred))

0.9042483947151553




In [51]:
import joblib
# Persist the fitted RFE estimator to disk.
# NOTE(review): only the estimator is saved; the dummy encoding and the
# scaling applied to X earlier are not part of this file, so callers must
# reproduce that preprocessing before calling predict.
joblib.dump(value=model, filename="finalModel.joblib")

['finalModel.joblib']