# **Importing Library**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #for visualization
import seaborn as sns #for visualization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score

# **Loading dataset**

In [2]:
# Read the raw car-price table from the CSV file next to this notebook.
dataset = pd.read_csv(filepath_or_buffer='CarPrice.csv')

# **Exploratory Data Analysis**

# **Data Preprocessing**

### **Preparing the data to implement model**

First, drop some feature variables to reduce the risk of overfitting the model.

In [3]:
# Remove the columns that are not used as model inputs. The result is rebound
# to `dataset` instead of mutating the loaded frame with inplace=True, so any
# other reference to the originally loaded object is left untouched.
dataset = dataset.drop(columns=["car_ID", "symboling", "CarName"])

In [4]:
# Show the first five rows to confirm the columns were removed.
dataset.head(n=5)

Unnamed: 0,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,carwidth,carheight,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


### **Dummification**

One-hot encoding represents a categorical variable as binary vectors, which makes the categorical data more expressive. It first requires that the categorical values be mapped to integer values (label encoding). Each integer value is then represented as a binary vector that is all zeros except at the index of that integer, which is marked with a 1.

The dummy variable trap is a scenario in which the independent variables are multicollinear: two or more variables are highly correlated, or, in simple terms, one variable can be predicted from the others.

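By using the pandas `get_dummies` function we can do all of the above steps in one line of code. We will use this function to create dummy variables for the categorical features `drivewheel`, `doornumber`, `cylindernumber`, `fueltype`, `aspiration`, `carbody`, `enginelocation`, `enginetype`, and `fuelsystem`. Setting `drop_first=True` avoids the dummy variable trap by dropping one dummy column per feature; the original categorical column is removed as well.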



In [5]:
# Categorical columns to one-hot encode. The order matches the order in which
# the columns were previously encoded one cell at a time, so the generated
# dummy columns are appended to the frame in the same sequence.
categorical_columns = [
    "drivewheel",
    "doornumber",
    "cylindernumber",
    "fueltype",
    "aspiration",
    "carbody",
    "enginelocation",
    "enginetype",
    "fuelsystem",
]

# Encode all of them in a single call. drop_first=True drops the first level
# of every feature to avoid the dummy variable trap.
dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

**Checking the cleaned and processed dataset**

In [14]:
# Show the first five rows of the encoded frame.
dataset.head(n=5)

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,...,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,...,0,1,0,0,0,0,0,1,0,0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,...,0,0,0,0,0,0,0,1,0,0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,...,0,0,0,0,0,0,0,1,0,0


### **Handling Missing Value**
We check whether there is any missing data in the dataset. If there is, we will try to eliminate it.

In [15]:
# One boolean per column: True when that column holds at least one null.
has_null = dataset.isnull().any()
# Names of the columns containing missing values (an empty Index if none do).
dataset.columns[has_null]

Index([], dtype='object')

In [16]:
# Count the missing values in each column.
dataset.isna().sum(axis=0)

wheelbase                0
carlength                0
carwidth                 0
carheight                0
curbweight               0
enginesize               0
boreratio                0
stroke                   0
compressionratio         0
horsepower               0
peakrpm                  0
citympg                  0
highwaympg               0
price                    0
drivewheel_fwd           0
drivewheel_rwd           0
doornumber_two           0
cylindernumber_five      0
cylindernumber_four      0
cylindernumber_six       0
cylindernumber_three     0
cylindernumber_twelve    0
cylindernumber_two       0
fueltype_gas             0
aspiration_turbo         0
carbody_hardtop          0
carbody_hatchback        0
carbody_sedan            0
carbody_wagon            0
enginelocation_rear      0
enginetype_dohcv         0
enginetype_l             0
enginetype_ohc           0
enginetype_ohcf          0
enginetype_ohcv          0
enginetype_rotor         0
fuelsystem_2bbl          0
f

No Null value is present in the data

### **Defining Independent and Dependent Variables**

In [17]:
# Independent variables: every column except the target.
X = dataset.drop(columns=["price"])
# Dependent variable: the price column on its own.
Y = dataset["price"]

In [18]:
# Print the feature matrix as plain text; pandas abbreviates the middle rows
# and columns of a large frame in this representation.
print(X)

     wheelbase  carlength  carwidth  carheight  curbweight  enginesize  \
0         88.6      168.8      64.1       48.8        2548         130   
1         88.6      168.8      64.1       48.8        2548         130   
2         94.5      171.2      65.5       52.4        2823         152   
3         99.8      176.6      66.2       54.3        2337         109   
4         99.4      176.6      66.4       54.3        2824         136   
..         ...        ...       ...        ...         ...         ...   
200      109.1      188.8      68.9       55.5        2952         141   
201      109.1      188.8      68.8       55.5        3049         141   
202      109.1      188.8      68.9       55.5        3012         173   
203      109.1      188.8      68.9       55.5        3217         145   
204      109.1      188.8      68.9       55.5        3062         141   

     boreratio  stroke  compressionratio  horsepower  ...  enginetype_ohcf  \
0         3.47    2.68           

In [19]:
# Print the target series (price) as plain text, including its name, length
# and dtype footer.
print(Y)

0      13495.0
1      16500.0
2      16500.0
3      13950.0
4      17450.0
        ...   
200    16845.0
201    19045.0
202    21485.0
203    22470.0
204    22625.0
Name: price, Length: 205, dtype: float64


### **Splitting data into Training Set and Test Set**

In [20]:
# Fraction of the rows held out for evaluation.
test_size = 0.25

# A fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=20,
)

### **Printing the shapes of train and test data**

In [21]:
# Report the shape and row count of each part of the split, one line per part.
for template, part in (
    ("X_train shape {}, len {}.", X_train),
    ("X_test shape {}, len {}.", X_test),
    ("Y_train shape {}, len {}.", Y_train),
    ("Y_test shape {}, len {}.", Y_test),
):
    print(template.format(part.shape, len(part)))

X_train shape (153, 42), len 153.
X_test shape (52, 42), len 52.
Y_train shape (153,), len 153.
Y_test shape (52,), len 52.


# **Decision Tree Regression Model Building**

### **Building a decision tree regression model**

In [22]:
from sklearn.tree import DecisionTreeRegressor

# A single regression tree with default hyper-parameters; the seed is fixed so
# that repeated runs build the same tree.
regressor = DecisionTreeRegressor(random_state=0)

### **Train the model on the train dataset**

In [23]:
# Fit the tree on the training features and training prices.
regressor.fit(X=X_train, y=Y_train)

DecisionTreeRegressor(random_state=0)

In [24]:
# Predict prices for the held-out test rows with the fitted tree.
predict = regressor.predict(X=X_test)

### **Printing the model summary**

In [33]:
# Put the true test prices next to the tree's predictions; the row labels come
# from Y_test's index.
df_DecisionTree = pd.DataFrame(
    data={
        'Actual': Y_test,
        'Predicted': predict,
    }
)
df_DecisionTree.head()

Unnamed: 0,Actual,Predicted
5,15250.0,13495.0
44,8916.5,8916.5
155,8778.0,15985.0
129,31400.5,41315.0
152,6488.0,5348.0


In [34]:
# Evaluate the decision tree on the test set. The mean squared error is
# computed once and reused for the root mean squared error.
score = r2_score(Y_test, predict)
mae = metrics.mean_absolute_error(Y_test, predict)
mse = metrics.mean_squared_error(Y_test, predict)
rmse = np.sqrt(mse)

print("r_square score --> ", score)
print('Mean Absolute Error -->', mae)
print('Mean Squared Error -->', mse)
print('Root Mean Squared Error -->', rmse)

r_square score -->  0.739868079727191
Mean Absolute Error --> 1944.6442307692307
Mean Squared Error --> 7737752.225961538
Root Mean Squared Error --> 2781.681546468168


In [36]:
# List every column of the processed frame. NOTE: this listing still contains
# the target 'price', whereas X (the frame the models are fitted on) has it
# dropped, so positions here do not line up one-to-one with X's feature
# indices from 'price' onward.
dataset.columns

Index(['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
       'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
       'peakrpm', 'citympg', 'highwaympg', 'price', 'drivewheel_fwd',
       'drivewheel_rwd', 'doornumber_two', 'cylindernumber_five',
       'cylindernumber_four', 'cylindernumber_six', 'cylindernumber_three',
       'cylindernumber_twelve', 'cylindernumber_two', 'fueltype_gas',
       'aspiration_turbo', 'carbody_hardtop', 'carbody_hatchback',
       'carbody_sedan', 'carbody_wagon', 'enginelocation_rear',
       'enginetype_dohcv', 'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf',
       'enginetype_ohcv', 'enginetype_rotor', 'fuelsystem_2bbl',
       'fuelsystem_4bbl', 'fuelsystem_idi', 'fuelsystem_mfi',
       'fuelsystem_mpfi', 'fuelsystem_spdi', 'fuelsystem_spfi'],
      dtype='object')

In [35]:
importance = regressor.feature_importances_
# Summarize feature importance, labelling each score with its column name.
# The names are taken from X_train (the frame the tree was fitted on) rather
# than from dataset.columns, which still contains the target 'price' and is
# therefore offset by one from that position onward.
for i, (name, v) in enumerate(zip(X_train.columns, importance)):
    print('Feature: %0d (%s), Score: %.5f' % (i, name, v))

Feature: 0, Score: 0.00080
Feature: 1, Score: 0.00300
Feature: 2, Score: 0.01101
Feature: 3, Score: 0.00047
Feature: 4, Score: 0.21049
Feature: 5, Score: 0.71386
Feature: 6, Score: 0.00004
Feature: 7, Score: 0.01322
Feature: 8, Score: 0.00004
Feature: 9, Score: 0.01707
Feature: 10, Score: 0.00617
Feature: 11, Score: 0.00040
Feature: 12, Score: 0.01784
Feature: 13, Score: 0.00004
Feature: 14, Score: 0.00002
Feature: 15, Score: 0.00013
Feature: 16, Score: 0.00000
Feature: 17, Score: 0.00015
Feature: 18, Score: 0.00000
Feature: 19, Score: 0.00000
Feature: 20, Score: 0.00000
Feature: 21, Score: 0.00000
Feature: 22, Score: 0.00000
Feature: 23, Score: 0.00000
Feature: 24, Score: 0.00090
Feature: 25, Score: 0.00157
Feature: 26, Score: 0.00018
Feature: 27, Score: 0.00000
Feature: 28, Score: 0.00094
Feature: 29, Score: 0.00000
Feature: 30, Score: 0.00000
Feature: 31, Score: 0.00000
Feature: 32, Score: 0.00001
Feature: 33, Score: 0.00000
Feature: 34, Score: 0.00000
Feature: 35, Score: 0.00162
Fe

# **Random Forest Regression Model Building** 

### **Building a random forest regression model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# n_estimators is the number of trees in the forest; the final prediction is
# the average of the individual trees' estimates. The seed is fixed so that
# repeated runs build the same forest.
regressor_F = RandomForestRegressor(
    n_estimators=10,
    random_state=0,
)


### **Train the model on the train dataset**

In [None]:
# Fit the forest on the training features and training prices.
regressor_F.fit(X=X_train, y=Y_train)

In [None]:
# Predict with the fitted random forest (regressor_F), not the decision tree
# `regressor`, so that predict_F and everything derived from it reflect the
# forest's output.
predict_F = regressor_F.predict(X_test)

### **Printing the model summary**

In [None]:
# Put the true test prices next to the values stored in predict_F; the row
# labels come from Y_test's index.
df_RandomForest = pd.DataFrame(
    data={
        'Actual': Y_test,
        'Predicted': predict_F,
    }
)
df_RandomForest.head()