In [1]:
# !pip install ydata-profiling

In [2]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split

In [3]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation / FutureWarning
# messages from the libraries used below — consider narrowing it to specific
# warning categories.
warnings.filterwarnings('ignore')

In [4]:
df_raw = pd.read_csv('Automobile_data.csv')

In [5]:
df_raw[:5]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,four,130,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,four,130,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,six,152,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,four,109,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,five,136,3.19,3.4,8.0,115,5500,18,22,17450


In [6]:
from ydata_profiling import ProfileReport

# Build the profiling report from the already-loaded `df_raw`; re-reading the
# CSV here would rebind `df_raw` and discard any cleaning done before this cell.
profile = ProfileReport(df_raw, title="Profiling Report")

In [7]:
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [8]:
df_raw.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0,205.0
mean,0.834146,98.756585,174.049268,65.907805,53.724878,2555.565854,126.907317,10.142537,25.219512,30.75122
std,1.245307,6.021776,12.337289,2.145204,2.443522,520.680204,41.642693,3.97204,6.542142,6.886443
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.1,52.0,2145.0,97.0,8.6,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,120.0,9.0,24.0,30.0
75%,2.0,102.4,183.1,66.9,55.5,2935.0,141.0,9.4,30.0,34.0
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [9]:
# df_raw[:5].T

In [10]:
# The raw data marks missing entries with the literal string '?'; turn them into
# NaN so pandas treats them as missing values.
df_raw = df_raw.replace('?', np.nan)

In [11]:
df_raw[:4]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,four,130,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,four,130,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,six,152,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,four,109,3.19,3.4,10.0,102,5500,24,30,13950


In [12]:
df_raw.shape

(205, 24)

In [13]:
df_raw.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
num-of-cylinders      object
engine-size            int64
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

In [14]:
# Collect the names of every column whose dtype is still `object`
# (iterating a DataFrame yields its column labels).
col_string = [name for name in df_raw if df_raw[name].dtypes == 'object']

print('Object type columns: ', col_string)
    

Object type columns:  ['normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'num-of-cylinders', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']


In [15]:
# Convert the continuous columns that were loaded as strings to numeric dtypes.
cols = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']

# `pd.to_numeric` is applied column by column; NaN entries stay NaN.
df_raw[cols] = df_raw[cols].apply(pd.to_numeric)

In [16]:
df_raw.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
num-of-cylinders      object
engine-size            int64
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [17]:
df_raw['num-of-cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [18]:
# Map the spelled-out cylinder counts to integers.
# The result is assigned back to the frame instead of calling an in-place method
# on the `df_raw['num-of-cylinders']` selection: under pandas Copy-on-Write an
# in-place change to a selected column does not reach `df_raw`.
cylinder_words = {'four': 4, 'six': 6, 'five': 5, 'three': 3, 'twelve': 12, 'two': 2, 'eight': 8}
# `.astype('int64')` fails loudly if a spelling missing from the mapping became NaN.
df_raw['num-of-cylinders'] = df_raw['num-of-cylinders'].map(cylinder_words).astype('int64')

In [19]:
df_raw.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
num-of-cylinders       int64
engine-size            int64
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [20]:
df_raw['make'].nunique()

22

In [21]:
le_cols = ['fuel-type', 'aspiration', 'num-of-doors', 'engine-location']

In [22]:
# A single encoder object is re-fitted for each column in turn, so only the
# mapping learned for the last column remains on `le` afterwards.
# NOTE(review): this runs before the median imputation, so any NaN left in these
# columns by the '?' replacement is presumably encoded as its own class rather
# than imputed — verify that this is intended.
le = preprocessing.LabelEncoder()

for column_name in le_cols:
    encoded_values = le.fit_transform(df_raw[column_name])
    df_raw[column_name] = encoded_values

In [23]:
df_raw.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type              int32
aspiration             int32
num-of-doors           int32
body-style            object
drive-wheels          object
engine-location        int32
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
num-of-cylinders       int64
engine-size            int64
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [24]:
df_raw[:5]

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,1,0,1,convertible,rwd,0,88.6,...,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,1,0,1,convertible,rwd,0,88.6,...,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,1,0,1,hatchback,rwd,0,94.5,...,6,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,1,0,0,sedan,fwd,0,99.8,...,4,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,1,0,0,sedan,4wd,0,99.4,...,5,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [25]:
df_raw = pd.get_dummies(df_raw, columns=['make', 'body-style', 'drive-wheels'])

In [26]:
df_raw[:4]

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,engine-location,wheel-base,length,width,height,...,make_volkswagen,make_volvo,body-style_convertible,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon,drive-wheels_4wd,drive-wheels_fwd,drive-wheels_rwd
0,3,,1,0,1,0,88.6,168.8,64.1,48.8,...,False,False,True,False,False,False,False,False,False,True
1,3,,1,0,1,0,88.6,168.8,64.1,48.8,...,False,False,True,False,False,False,False,False,False,True
2,1,,1,0,1,0,94.5,171.2,65.5,52.4,...,False,False,False,False,True,False,False,False,False,True
3,2,164.0,1,0,0,0,99.8,176.6,66.2,54.3,...,False,False,False,False,False,True,False,False,True,False


In [27]:
df_raw.dtypes

symboling                   int64
normalized-losses         float64
fuel-type                   int32
aspiration                  int32
num-of-doors                int32
engine-location             int32
wheel-base                float64
length                    float64
width                     float64
height                    float64
curb-weight                 int64
num-of-cylinders            int64
engine-size                 int64
bore                      float64
stroke                    float64
compression-ratio         float64
horsepower                float64
peak-rpm                  float64
city-mpg                    int64
highway-mpg                 int64
price                     float64
make_alfa-romero             bool
make_audi                    bool
make_bmw                     bool
make_chevrolet               bool
make_dodge                   bool
make_honda                   bool
make_isuzu                   bool
make_jaguar                  bool
make_mazda    

In [28]:
df_raw.isnull().sum()

symboling                  0
normalized-losses         41
fuel-type                  0
aspiration                 0
num-of-doors               0
engine-location            0
wheel-base                 0
length                     0
width                      0
height                     0
curb-weight                0
num-of-cylinders           0
engine-size                0
bore                       4
stroke                     4
compression-ratio          0
horsepower                 2
peak-rpm                   2
city-mpg                   0
highway-mpg                0
price                      4
make_alfa-romero           0
make_audi                  0
make_bmw                   0
make_chevrolet             0
make_dodge                 0
make_honda                 0
make_isuzu                 0
make_jaguar                0
make_mazda                 0
make_mercedes-benz         0
make_mercury               0
make_mitsubishi            0
make_nissan                0
make_peugot   

In [29]:
# Impute every remaining NaN with the median of its column.
# NOTE(review): this also fills the 4 missing `price` values that later serve as
# the regression target, and the medians are computed on the full frame before
# the train/validation split — confirm both are acceptable.
column_medians = df_raw.median()
df_raw = df_raw.fillna(column_medians)

In [30]:
df_raw.isna().sum()

symboling                 0
normalized-losses         0
fuel-type                 0
aspiration                0
num-of-doors              0
engine-location           0
wheel-base                0
length                    0
width                     0
height                    0
curb-weight               0
num-of-cylinders          0
engine-size               0
bore                      0
stroke                    0
compression-ratio         0
horsepower                0
peak-rpm                  0
city-mpg                  0
highway-mpg               0
price                     0
make_alfa-romero          0
make_audi                 0
make_bmw                  0
make_chevrolet            0
make_dodge                0
make_honda                0
make_isuzu                0
make_jaguar               0
make_mazda                0
make_mercedes-benz        0
make_mercury              0
make_mitsubishi           0
make_nissan               0
make_peugot               0
make_plymouth       

### Predict the continuous variable - price

In [32]:
# Target is `price`; the features are every other column.
y = df_raw['price']
X = df_raw.drop(columns='price')

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=99)
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((143, 50), (62, 50), (143,), (62,))

In [33]:
# Seed the forest so the fitted model and the scores computed from it are
# reproducible between runs (99 is the same seed used for the train/validation split).
m = RandomForestRegressor(random_state=99)
m.fit(X_train, y_train)

In [34]:
# Error -> actual - predicted
# Mean squared error (MSE) -> sum((actual - predicted)^2) / n
# RMSE -> np.sqrt(MSE)

In [35]:
def rmse(preds, actuals):
    """Return the root-mean-squared error between predictions and targets.

    Args:
        preds: Array-like of predicted values.
        actuals: Array-like of true values, compared with `preds` element by
            element in positional order.

    Returns:
        float: sqrt(mean((preds - actuals) ** 2)).
    """
    # NumPy is imported at the top of the notebook, whereas `math` is only
    # imported in a later cell; using NumPy removes that run-order dependency.
    # Converting both inputs to arrays also accepts plain lists and compares
    # positionally instead of aligning on a pandas index.
    errors = np.asarray(preds, dtype=float) - np.asarray(actuals, dtype=float)
    return float(np.sqrt(np.mean(errors ** 2)))

In [36]:
# y_train

In [37]:
# m.predict(X_train)

In [38]:
# m.predict(X_val)

In [39]:
import math

In [40]:
rmse(m.predict(X_train), y_train), rmse(m.predict(X_val), y_val)

(1132.567887275742, 2607.217107277285)

In [41]:
m.score(X_train, y_train), m.score(X_val, y_val)

(0.9785811058102774, 0.8972224224243596)

### Predict the categorical variable - symboling

In [43]:
from sklearn.metrics import accuracy_score

In [44]:
df_raw['symboling'].value_counts()

symboling
 0    67
 1    54
 2    32
 3    27
-1    22
-2     3
Name: count, dtype: int64

In [45]:
# Target is `symboling`; the features are every other column.
y = df_raw['symboling']
X = df_raw.drop(columns='symboling')

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=99)
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((143, 50), (62, 50), (143,), (62,))

In [46]:
# Seed the forest so the fitted model and the accuracies computed from it are
# reproducible between runs (99 is the same seed used for the train/validation split).
m = RandomForestClassifier(random_state=99)
m.fit(X_train, y_train)

In [47]:
train_pred = m.predict(X_train)

In [48]:
val_pred = m.predict(X_val)

In [49]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
accuracy_score(y_train, train_pred)

1.0

In [50]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): ground truth first.
accuracy_score(y_val, val_pred)

0.7903225806451613

In [51]:
m.score(X_train, y_train)

1.0

In [52]:
m.score(X_val, y_val)

0.7903225806451613

### RF Hyperparameter tuning

In [54]:
from sklearn.model_selection import GridSearchCV

In [55]:
# Exhaustive grid: 7 * 4 * 5 * 2 = 280 hyperparameter combinations.
param_grid = [
{'n_estimators': [1, 5, 10, 15, 20, 25, 30],
 'max_features': [0.3, 0.4, 0.6, 0.8],
 'min_samples_leaf': [1, 3, 5, 10, 25],
 'bootstrap': [True, False]}
]

# Seed the forest so every candidate's cross-validation score — and therefore
# the selected `best_estimator_` — is reproducible between runs.
rf = RandomForestClassifier(random_state=99)
# NOTE(review): candidates are ranked by negative MSE computed on the class
# labels, i.e. `symboling` is treated as a numeric scale — confirm this is
# preferred over 'accuracy', which is the metric reported for the plain model.
# NOTE(review): the value counts show class -2 has only 3 rows, fewer than the
# 10 folds requested — confirm cv=10 is appropriate for this data.
grid_search_forest = GridSearchCV(rf, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search_forest.fit(X_train, y_train)




In [56]:
# grid_search_forest.cv_results_

In [57]:

grid_search_forest.best_estimator_

In [58]:
205 * 0.67

137.35

### Model Metrics

In [60]:
from sklearn.metrics import mean_squared_error


In [61]:
y_train

30     2
174   -1
175   -1
57     3
119    1
      ..
68    -1
168    2
185    2
35     0
129    1
Name: symboling, Length: 143, dtype: int64

In [62]:
# Predict with the best estimator found by the grid search.
# NOTE(review): these are predictions on the training rows, so any metric
# computed from them measures fit rather than generalization — consider
# evaluating on X_val as well.
grid_best = grid_search_forest.best_estimator_.predict(X_train)

In [63]:
grid_mse = mean_squared_error(y_train, grid_best)

In [64]:
grid_mse

0.0

In [65]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
# Search space for the randomized search (280 combinations; 50 are sampled).
param_grid = [
{'n_estimators': [10, 15, 20, 25, 30, 35, 40],
 'max_features': [0.3, 0.4, 0.5, 0.6],
 'min_samples_leaf': [5, 10, 15, 20, 25],
 'bootstrap': [True, False]}
]

# Seed both the forest and the search: without seeds the sampled candidates, the
# chosen `best_estimator_` and its feature importances change on every run.
rf = RandomForestClassifier(random_state=99)
random_search_forest = RandomizedSearchCV(
    rf,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_iter=50,
    random_state=99,
)
random_search_forest.fit(X_train, y_train)



In [67]:
importance = random_search_forest.best_estimator_.feature_importances_

In [68]:
importance

array([0.13251433, 0.        , 0.00298695, 0.13952596, 0.        ,
       0.11218163, 0.0282732 , 0.03672236, 0.09185078, 0.05236326,
       0.0016481 , 0.04472663, 0.03206153, 0.03611038, 0.03319212,
       0.01772559, 0.02933031, 0.02616216, 0.05016967, 0.09002125,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.00094067, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.0077912 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.00205873,
       0.0031977 , 0.00997692, 0.        , 0.        , 0.00658693,
       0.00342983, 0.00094397, 0.        , 0.00684749, 0.00066035])

In [69]:
feature_list = list(X.columns)

In [70]:
feature_importance = sorted(zip(importance, feature_list), reverse=True)

In [71]:
df_features = pd.DataFrame(feature_importance, columns=['importance', 'Features'])

In [72]:
# Split the sorted table back into two parallel lists for plotting.
# `importance` is rebound here from the estimator's array to a sorted list.
features = df_features['Features'].tolist()
importance = df_features['importance'].tolist()

In [73]:
list(features)

['num-of-doors',
 'normalized-losses',
 'wheel-base',
 'height',
 'price',
 'curb-weight',
 'highway-mpg',
 'engine-size',
 'width',
 'stroke',
 'compression-ratio',
 'bore',
 'peak-rpm',
 'length',
 'city-mpg',
 'horsepower',
 'make_volvo',
 'make_peugot',
 'drive-wheels_fwd',
 'body-style_hatchback',
 'body-style_sedan',
 'make_volkswagen',
 'aspiration',
 'make_toyota',
 'num-of-cylinders',
 'body-style_wagon',
 'make_honda',
 'drive-wheels_rwd',
 'make_subaru',
 'make_saab',
 'make_renault',
 'make_porsche',
 'make_plymouth',
 'make_nissan',
 'make_mitsubishi',
 'make_mercury',
 'make_mercedes-benz',
 'make_mazda',
 'make_jaguar',
 'make_isuzu',
 'make_dodge',
 'make_chevrolet',
 'make_bmw',
 'make_audi',
 'make_alfa-romero',
 'fuel-type',
 'engine-location',
 'drive-wheels_4wd',
 'body-style_hardtop',
 'body-style_convertible']

In [74]:
print(df_features)

    importance                Features
0     0.139526            num-of-doors
1     0.132514       normalized-losses
2     0.112182              wheel-base
3     0.091851                  height
4     0.090021                   price
5     0.052363             curb-weight
6     0.050170             highway-mpg
7     0.044727             engine-size
8     0.036722                   width
9     0.036110                  stroke
10    0.033192       compression-ratio
11    0.032062                    bore
12    0.029330                peak-rpm
13    0.028273                  length
14    0.026162                city-mpg
15    0.017726              horsepower
16    0.009977              make_volvo
17    0.007791             make_peugot
18    0.006847        drive-wheels_fwd
19    0.006587    body-style_hatchback
20    0.003430        body-style_sedan
21    0.003198         make_volkswagen
22    0.002987              aspiration
23    0.002059             make_toyota
24    0.001648        num

In [75]:
import matplotlib.pyplot as plt

In [76]:


plt.style.use('bmh')

# One bar per feature, in the order of the sorted importance table.
x_values = list(range(len(feature_importance)))

fig, ax = plt.subplots(figsize=(15, 10))
ax.bar(x_values, importance)
# Label each bar with its feature name, rotated so long names stay readable.
ax.set_xticks(x_values)
ax.set_xticklabels(features, rotation='vertical')

ax.set_ylabel('Importance')
ax.set_xlabel('Variable')
ax.set_title('Variable importance')
# plt.show()

Text(0.5, 1.0, 'Variable importance')

In [77]:
# plt.bar()

TypeError: bar() missing 2 required positional arguments: 'x' and 'height'