In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import stat as st

In [2]:
pred_data = pd.read_csv(r'data.csv')

In [3]:
pred_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        205 non-null    int64  
 1   car_ID            205 non-null    int64  
 2   symboling         205 non-null    int64  
 3   CarName           205 non-null    object 
 4   fueltype          205 non-null    object 
 5   aspiration        205 non-null    object 
 6   doornumber        205 non-null    object 
 7   carbody           205 non-null    object 
 8   drivewheel        205 non-null    object 
 9   enginelocation    205 non-null    object 
 10  wheelbase         205 non-null    float64
 11  carlength         205 non-null    float64
 12  carwidth          205 non-null    float64
 13  carheight         205 non-null    float64
 14  curbweight        205 non-null    int64  
 15  enginetype        205 non-null    object 
 16  cylindernumber    205 non-null    object 
 1

# PREPROCESSING & FEATURE ENGINEERING


In [4]:
#dropping columns that are statistically insignificant to the model
model_data = pred_data.drop(['Unnamed: 0', 'car_ID', 'CarName','fuelsystem', 'enginetype','doornumber'], axis=1)

In [5]:
# transforming 'symboling' to object since its a rating system on the risk of a vehicle purchase.
model_data['symboling'] = model_data['symboling'].astype('object')

In [6]:
#model_data.info()

In [7]:
# importing libraries that will transform data to be consumable by the predictive model
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


In [8]:
#categorical features for one hot encoding
cat_features = model_data.select_dtypes(include='object').columns.tolist()

In [9]:
cat_features

['symboling',
 'fueltype',
 'aspiration',
 'carbody',
 'drivewheel',
 'enginelocation',
 'cylindernumber',
 'carbrand']

In [10]:
#encoding categorical features with dummy method
encoded_data = pd.get_dummies(model_data, columns = cat_features)

In [11]:
encoded_data

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,...,carbrand_peugeot,carbrand_plymouth,carbrand_porsche,carbrand_renault,carbrand_saab,carbrand_subaru,carbrand_toyota,carbrand_vokswagen,carbrand_volkswagen,carbrand_volvo
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,0,0,0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,0,0,0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,...,0,0,0,0,0,0,0,0,0,0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,10.0,102,...,0,0,0,0,0,0,0,0,0,0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,8.0,115,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,9.5,114,...,0,0,0,0,0,0,0,0,0,1
201,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,8.7,160,...,0,0,0,0,0,0,0,0,0,1
202,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,8.8,134,...,0,0,0,0,0,0,0,0,0,1
203,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,23.0,106,...,0,0,0,0,0,0,0,0,0,1


In [15]:
#splitting data into training and testing set
X = encoded_data.drop('price', axis=1)
y = encoded_data[['price']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

In [18]:
print('Features:',X[:10], '\n Labels:', y[:10], sep='\n')

Features:
   wheelbase  carlength  carwidth  carheight  curbweight  enginesize  \
0       88.6      168.8      64.1       48.8        2548         130   
1       88.6      168.8      64.1       48.8        2548         130   
2       94.5      171.2      65.5       52.4        2823         152   
3       99.8      176.6      66.2       54.3        2337         109   
4       99.4      176.6      66.4       54.3        2824         136   
5       99.8      177.3      66.3       53.1        2507         136   
6      105.8      192.7      71.4       55.7        2844         136   
7      105.8      192.7      71.4       55.7        2954         136   
8      105.8      192.7      71.4       55.9        3086         131   
9       99.5      178.2      67.9       52.0        3053         131   

   boreratio  stroke  compressionratio  horsepower  ...  carbrand_peugeot  \
0       3.47    2.68               9.0         111  ...                 0   
1       3.47    2.68               9.0     

In [19]:
#standardizing numerical features of train dataset
scaler = StandardScaler()
train_model = scaler.fit(X_train, (y_train))


In [None]:
#creating an instance of the linear regression model
model = LinearRegression().fit(X_train, y_train)
print (model)