In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Shared registry filled in by later cells: method name -> {feature name: scaled score}.
ranks = {}


def ranking(ranks, names, order=1):
    """Min-max scale a sequence of scores and key the result by feature name.

    Parameters
    ----------
    ranks : sequence of float
        Raw scores, one per feature (shadows the module-level ``ranks`` dict
        inside this function).
    names : iterable
        Feature names, paired positionally with ``ranks``.
    order : int or float, default 1
        Multiplier applied to the scores before scaling; a negative value
        reverses their ordering.

    Returns
    -------
    dict
        Maps each name to its scaled score rounded to two decimals.
    """
    # MinMaxScaler works column-wise, so present the scores as a single column.
    column = order * np.array([ranks]).T
    scaled = MinMaxScaler().fit_transform(column).T[0]
    return {name: round(score, 2) for name, score in zip(names, scaled)}

In [3]:
# Read the preprocessed new-car table and preview its first rows.
cars_csv_path = "../datasets/cars/New_cars_preprocessed.csv"
car = pd.read_csv(cars_csv_path)
car.head()

Unnamed: 0,EPA Fuel Economy Est - City (MPG),Engine,Drivetrain,Passenger Capacity,Passenger Doors,Wheelbase (in),SAE Net Torque @ RPM,Fuel System,SAE Net Horsepower @ RPM,Displacement,...,Turning Diameter - Curb to Curb (ft),Front Wheel Material,Roadside Assistance Miles/km,Manufacturer,Model year,Category,Front tire width,Front tire aspect ratio,Front tire rim size,MSRP
0,0.333333,0.769231,0.5,0.333333,1.0,0.608427,0.299465,0.0,0.336634,0.266667,...,0.416667,0.25,0.333333,0.0,1.0,0.5,0.27907,0.647059,0.863636,40600.0
1,0.333333,0.769231,0.5,0.333333,1.0,0.608427,0.299465,0.0,0.336634,0.266667,...,0.416667,0.25,0.333333,0.0,1.0,0.5,0.27907,0.647059,0.863636,45500.0
2,0.333333,0.769231,0.5,0.333333,1.0,0.608427,0.299465,0.0,0.336634,0.266667,...,0.416667,0.25,0.333333,0.0,1.0,0.5,0.348837,0.529412,0.909091,43600.0
3,0.333333,0.769231,0.5,0.333333,1.0,0.608427,0.299465,0.0,0.336634,0.266667,...,0.416667,0.25,0.333333,0.0,1.0,0.5,0.27907,0.647059,0.863636,37400.0
4,0.318182,0.769231,0.25,0.333333,1.0,0.608427,0.299465,0.0,0.336634,0.266667,...,0.416667,0.25,0.333333,0.0,1.0,0.5,0.27907,0.647059,0.863636,42600.0


In [4]:
# car = car.drop(['Unnamed: 0'], axis=1)
# car

In [5]:
# Separate the MSRP target from the predictors; `car` itself is left untouched.
y_car = car["MSRP"].copy()
X_car = car.drop(columns="MSRP")

# Both names refer to the same predictor column index.
colnames = colnames_car = X_car.columns

In [6]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32262 entries, 0 to 32261
Data columns (total 35 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   EPA Fuel Economy Est - City (MPG)     32262 non-null  float64
 1   Engine                                32262 non-null  float64
 2   Drivetrain                            32262 non-null  float64
 3   Passenger Capacity                    32262 non-null  float64
 4   Passenger Doors                       32262 non-null  float64
 5   Wheelbase (in)                        32262 non-null  float64
 6   SAE Net Torque @ RPM                  32262 non-null  float64
 7   Fuel System                           32262 non-null  float64
 8   SAE Net Horsepower @ RPM              32262 non-null  float64
 9   Displacement                          32262 non-null  float64
 10  Trans Description Cont.               32262 non-null  float64
 11  Trans Type     

In [7]:
# Fit a single regression tree on every predictor so its feature importances
# can be inspected in the next cell.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; when several splits tie, an unseeded tree (and therefore its
# importances) can differ from one run to the next.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_car, y_car)

DecisionTreeRegressor()

In [None]:
# Read the per-feature importances off the fitted tree `model`.
importance = model.feature_importances_

# Report one line per feature, identified by its column position.
for idx in range(len(importance)):
    print('Feature: %0d, Score: %.5f' % (idx, importance[idx]))

# Bar chart of importance against feature position.
positions = list(range(len(importance)))
plt.bar(positions, importance)
plt.show()

Feature: 0, Score: 0.02161
Feature: 1, Score: 0.00607
Feature: 2, Score: 0.00688
Feature: 3, Score: 0.00514
Feature: 4, Score: 0.00155
Feature: 5, Score: 0.08644
Feature: 6, Score: 0.01290
Feature: 7, Score: 0.02695
Feature: 8, Score: 0.14159
Feature: 9, Score: 0.05046
Feature: 10, Score: 0.00196
Feature: 11, Score: 0.00358
Feature: 12, Score: 0.00247
Feature: 13, Score: 0.00193
Feature: 14, Score: 0.00325
Feature: 15, Score: 0.00189
Feature: 16, Score: 0.00656
Feature: 17, Score: 0.00031
Feature: 18, Score: 0.00263
Feature: 19, Score: 0.25638
Feature: 20, Score: 0.00930
Feature: 21, Score: 0.00213
Feature: 22, Score: 0.00463
Feature: 23, Score: 0.01110
Feature: 24, Score: 0.06358
Feature: 25, Score: 0.02112
Feature: 26, Score: 0.00324
Feature: 27, Score: 0.00714
Feature: 28, Score: 0.05739
Feature: 29, Score: 0.10804
Feature: 30, Score: 0.00214
Feature: 31, Score: 0.01417
Feature: 32, Score: 0.01921
Feature: 33, Score: 0.03627


In [None]:
# Fit a CART regressor on all predictors and store its min-max scaled feature
# importances under the "CART" key of the shared `ranks` dict.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a seed, tied splits can change the tree and hence
# the exported ranks between runs.
cart = DecisionTreeRegressor(random_state=0)
cart.fit(X_car, y_car)
ranks["CART"] = ranking(cart.feature_importances_, colnames)
ranks

In [None]:
# Scaled scores: one column per ranking method, one row per feature.
# Built with a single constructor call instead of repeatedly concatenating
# onto an empty frame inside a loop.
decimalrank_df = pd.DataFrame(
    {method_name: pd.Series(scores) for method_name, scores in ranks.items()}
)

# Turn the scores into ranks column by column: the highest score gets rank 1,
# and tied scores share a rank (pandas' default tie handling).
rank_df = decimalrank_df.rank(ascending=False)

# Expose the feature names (the index) as a regular column for reshaping later.
rank_df['feature_name'] = rank_df.index

In [None]:
# Reshape to long format (one row per feature/method pair) and tag every row
# with the dataset name.
ranksout = (
    rank_df
    .melt(id_vars='feature_name', var_name='method', value_name='feature_rank')
    .assign(dataset='cars')
)
ranksout

In [None]:
# Persist the long-format ranks as a ';'-separated CSV without the index column.
# The target folder is created first so the export does not fail when the
# output directory is missing (e.g. on a fresh checkout).
ranks_csv_path = '../data/CART/cars_featureranks_CART.csv'
os.makedirs(os.path.dirname(ranks_csv_path), exist_ok=True)
ranksout.to_csv(ranks_csv_path, sep=';', index=False)