In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
import psycopg2
import re
import sys
from itertools import islice

In [3]:
df = pd.read_csv('auto_clean.csv')

In [4]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,Medium,0,1
1,3,122,alfa-romero,std,two,convertible,rwd,front,88.6,0.811148,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,Medium,0,1
2,1,122,alfa-romero,std,two,hatchback,rwd,front,94.5,0.822681,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,Medium,0,1
3,2,164,audi,std,four,sedan,fwd,front,99.8,0.84863,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,Medium,0,1
4,2,164,audi,std,four,sedan,4wd,front,99.4,0.84863,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,Medium,0,1


In [5]:
df.shape

(201, 29)

- Length
- Width
- Curb-weight
- Engine-size
- Horsepower
- City-mpg
- Highway-mpg
- Wheel-base
- Bore

In [6]:
from sklearn import preprocessing as pr

In [7]:
class MultiColumnLabelEncoder:
    def __init__(self, columns = None):
        self.columns = columns
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = pr.LabelEncoder().fit_transform(output[col].astype(str))
        else:
            for colname, col in output.iteritems():
                output[colname] = pr.LabelEncoder().fit_transform(col.astype(str))
        return output
    
    def fit_transform(self, X, y = None):
        return self.fit(X, y).transform(X)              

In [8]:
df = MultiColumnLabelEncoder(columns = ['make', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 
                                        'engine-location', 'engine-type', 'num-of-cylinders', 'fuel-system', 
                                        'horsepower-binned']).fit_transform(df)
df

Unnamed: 0,symboling,normalized-losses,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,city-L/100km,horsepower-binned,diesel,gas
0,3,122,0,0,1,0,2,0,88.6,0.811148,...,9.0,111.0,5000.0,21,27,13495.0,11.190476,2,0,1
1,3,122,0,0,1,0,2,0,88.6,0.811148,...,9.0,111.0,5000.0,21,27,16500.0,11.190476,2,0,1
2,1,122,0,0,1,2,2,0,94.5,0.822681,...,9.0,154.0,5000.0,19,26,16500.0,12.368421,2,0,1
3,2,164,1,0,0,3,1,0,99.8,0.848630,...,10.0,102.0,5500.0,24,30,13950.0,9.791667,2,0,1
4,2,164,1,0,0,3,0,0,99.4,0.848630,...,8.0,115.0,5500.0,18,22,17450.0,13.055556,2,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95,21,0,0,3,2,0,109.1,0.907256,...,9.5,114.0,5400.0,23,28,16845.0,10.217391,2,0,1
197,-1,95,21,1,0,3,2,0,109.1,0.907256,...,8.7,160.0,5300.0,19,25,19045.0,12.368421,0,0,1
198,-1,95,21,0,0,3,2,0,109.1,0.907256,...,8.8,134.0,5500.0,18,23,21485.0,13.055556,2,0,1
199,-1,95,21,1,0,3,2,0,109.1,0.907256,...,23.0,106.0,4800.0,26,27,22470.0,9.038462,2,1,0


In [9]:
df = df.fillna(0)

In [10]:
y = df.price

In [11]:
X = df.drop('price', axis=1)

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
from sklearn.preprocessing import StandardScaler

In [15]:
sc = StandardScaler()

In [16]:
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
regr = RandomForestRegressor(max_depth=2,random_state=0, n_estimators=100)

In [19]:
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=2, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [20]:
print(regr.feature_importances_)

[0.00000000e+00 0.00000000e+00 4.19445982e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.59187667e-03 3.29384960e-03 4.92277560e-03 0.00000000e+00
 4.28988715e-01 1.22252081e-03 2.32488689e-03 3.83127806e-01
 0.00000000e+00 2.08281324e-03 4.45625958e-04 1.53848273e-03
 5.89497477e-02 3.74527089e-04 1.37245544e-02 5.84475101e-02
 3.85448619e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00]


In [26]:
importances = list(zip(regr.feature_importances_, df.columns[:-1]))
importances.sort(reverse=True)

In [27]:
importances

[(0.42898871516993176, 'curb-weight'),
 (0.3831278061693488, 'engine-size'),
 (0.05894974774754659, 'horsepower'),
 (0.058447510065482176, 'highway-mpg'),
 (0.03854486191196443, 'price'),
 (0.013724554362176653, 'city-mpg'),
 (0.004922775600059438, 'width'),
 (0.0032938496040047552, 'length'),
 (0.0023248868919002207, 'num-of-cylinders'),
 (0.0020828132435733717, 'bore'),
 (0.001591876668266976, 'wheel-base'),
 (0.001538482728906754, 'compression-ratio'),
 (0.0012225208084215952, 'engine-type'),
 (0.00044562595775643455, 'stroke'),
 (0.0004194459821507219, 'make'),
 (0.00037452708850942016, 'peak-rpm'),
 (0.0, 'symboling'),
 (0.0, 'num-of-doors'),
 (0.0, 'normalized-losses'),
 (0.0, 'horsepower-binned'),
 (0.0, 'height'),
 (0.0, 'fuel-system'),
 (0.0, 'engine-location'),
 (0.0, 'drive-wheels'),
 (0.0, 'diesel'),
 (0.0, 'city-L/100km'),
 (0.0, 'body-style'),
 (0.0, 'aspiration')]

In [28]:
[(0.42898871516993176, 'curb-weight'),
 (0.3831278061693488, 'engine-size'),
 (0.05894974774754659, 'horsepower'),
 (0.058447510065482176, 'highway-mpg'),
 (0.03854486191196443, 'price'),
 (0.013724554362176653, 'city-mpg')]

[(0.42898871516993176, 'curb-weight'),
 (0.3831278061693488, 'engine-size'),
 (0.05894974774754659, 'horsepower'),
 (0.058447510065482176, 'highway-mpg'),
 (0.03854486191196443, 'price'),
 (0.013724554362176653, 'city-mpg')]