In [197]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn import metrics


In [198]:
car = pd.read_csv("C:\Alabs\Datasets\Car_sales - 1656244153716.csv", sep= ',')
car

Unnamed: 0,Manufacturer,Model,Sales_in_thousands,four_year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,Integra,16.919,16.360,Passenger,21.50,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,2/2/2012,58.280150
1,Acura,TL,39.384,19.875,Passenger,28.40,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,6/3/2011,91.370778
2,Acura,CL,14.114,18.225,Passenger,,3.2,225.0,106.9,70.6,192.0,3.470,17.2,26.0,1/4/2012,
3,Acura,RL,8.588,29.725,Passenger,42.00,3.5,210.0,114.6,71.4,196.6,3.850,18.0,22.0,3/10/2011,91.389779
4,Audi,A4,20.397,22.255,Passenger,23.99,1.8,150.0,102.6,68.2,178.0,2.998,16.4,27.0,10/8/2011,62.777639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,Volvo,V40,3.545,,Passenger,24.40,1.9,160.0,100.5,67.6,176.6,3.042,15.8,25.0,9/21/2011,66.498812
153,Volvo,S70,15.245,,Passenger,27.50,2.4,168.0,104.9,69.3,185.9,3.208,17.9,25.0,11/24/2012,70.654495
154,Volvo,V70,17.531,,Passenger,28.80,2.4,168.0,104.9,69.3,186.2,3.259,17.9,25.0,6/25/2011,71.155978
155,Volvo,C70,3.493,,Passenger,45.50,2.3,236.0,104.9,71.5,185.7,3.601,18.5,23.0,4/26/2011,101.623357


In [199]:

def fn_desc( x ):
    # missing values calculation
    ntot = x.shape[0]
    n = x.count()
    n_miss = ntot - n
    n_miss_perc = n_miss/ntot
    
    # get the lc and uc using IQR
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    IQR = q3 - q1
    lc_iqr = q1 - 1.5 * IQR
    uc_iqr = q3 + 1.5 * IQR
    
    # return the descriptives
    
    return pd.Series([x.dtype, x.nunique(),
                       ntot, n, n_miss, n_miss_perc,
                      IQR, lc_iqr, uc_iqr,
                      x.sum(), x.mean(), x.var(), x.std(),
                      x.min(), 
                      x.quantile(0.01), x.quantile(0.05), x.quantile(0.10), 
                      x.quantile(0.25), x.quantile(0.5), x.quantile(0.75), 
                      x.quantile(0.90), x.quantile(0.95), x.quantile(0.99), 
                      x.max() ],
                     index = ['dtype', 'cardinality',
                               'ntot', 'n', 'n_miss', 'n_miss_perc',
                              'IQR', 'lc_iqr', 'uc_iqr',
                              'sum', 'mean', 'var', 'std',
                              'min', 
                              'p1', 'p5', 'p10', 
                              'p25', 'p50', 'p75', 
                              'p90', 'p95', 'p99',
                              'max' ])

In [200]:
car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Manufacturer            157 non-null    object 
 1   Model                   157 non-null    object 
 2   Sales_in_thousands      157 non-null    float64
 3   four_year_resale_value  121 non-null    float64
 4   Vehicle_type            157 non-null    object 
 5   Price_in_thousands      155 non-null    float64
 6   Engine_size             156 non-null    float64
 7   Horsepower              156 non-null    float64
 8   Wheelbase               156 non-null    float64
 9   Width                   156 non-null    float64
 10  Length                  156 non-null    float64
 11  Curb_weight             155 non-null    float64
 12  Fuel_capacity           156 non-null    float64
 13  Fuel_efficiency         154 non-null    float64
 14  Latest_Launch           157 non-null    ob

In [201]:
cars_cat_var = car.select_dtypes('object').columns
cars_con_var = car.select_dtypes('float64').columns.difference(['Latest_Launch', 'Sales_in_thousands'])

In [202]:

cars_cat_var = cars_cat_var.drop(labels= 'Latest_Launch')

In [203]:


cars_cat_var

Index(['Manufacturer', 'Model', 'Vehicle_type'], dtype='object')

In [204]:

car.loc[:, cars_con_var].apply(fn_desc)

Unnamed: 0,Curb_weight,Engine_size,Fuel_capacity,Fuel_efficiency,Horsepower,Length,Power_perf_factor,Price_in_thousands,Wheelbase,Width,four_year_resale_value
dtype,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
cardinality,147,31,55,20,66,127,154,152,88,78,117
ntot,157,157,157,157,157,157,157,157,157,157,157
n,155,156,156,154,156,156,155,155,156,156,121
n_miss,2,1,1,3,1,1,2,2,1,1,36
n_miss_perc,0.012739,0.006369,0.006369,0.019108,0.006369,0.006369,0.012739,0.012739,0.006369,0.006369,0.229299
IQR,0.8285,1.275,3.775,5.0,65.5,18.55,29.007171,13.93,9.2,5.025,8.615
lc_iqr,1.72825,0.3875,10.1375,13.5,51.25,149.75,16.896951,-2.8775,89.2,60.8625,-1.6625
uc_iqr,5.04225,5.4875,25.2375,33.5,313.25,223.95,132.925634,52.8425,126.0,80.9625,32.7975
sum,523.594,477.5,2800.5,3672.0,29008.0,29225.6,11941.756636,4245.567,16768.0,11099.4,2186.83


In [205]:
# Outlier Treatment

def impute_outlier_iqr(df):
    cols = df.columns
    q1 = df[cols].quantile(0.25)
    q3 = df[cols].quantile(0.75)
    iqr = q3 - q1
    lc = q1 - (1.5 * iqr)
    uc = q3 + (1.5 * iqr)

    df = np.clip(df, lc, uc, axis = 1)
    return df

In [206]:
car.loc[:, cars_con_var] =  impute_outlier_iqr(car.loc[:, cars_con_var])
car.loc[:, cars_con_var]

Unnamed: 0,Curb_weight,Engine_size,Fuel_capacity,Fuel_efficiency,Horsepower,Length,Power_perf_factor,Price_in_thousands,Wheelbase,Width,four_year_resale_value
0,2.639,1.8,13.2,28.0,140.0,172.4,58.280150,21.50,101.2,67.3,16.360
1,3.517,3.2,17.2,25.0,225.0,192.9,91.370778,28.40,108.1,70.3,19.875
2,3.470,3.2,17.2,26.0,225.0,192.0,,,106.9,70.6,18.225
3,3.850,3.5,18.0,22.0,210.0,196.6,91.389779,42.00,114.6,71.4,29.725
4,2.998,1.8,16.4,27.0,150.0,178.0,62.777639,23.99,102.6,68.2,22.255
...,...,...,...,...,...,...,...,...,...,...,...
152,3.042,1.9,15.8,25.0,160.0,176.6,66.498812,24.40,100.5,67.6,
153,3.208,2.4,17.9,25.0,168.0,185.9,70.654495,27.50,104.9,69.3,
154,3.259,2.4,17.9,25.0,168.0,186.2,71.155978,28.80,104.9,69.3,
155,3.601,2.3,18.5,23.0,236.0,185.7,101.623357,45.50,104.9,71.5,


In [207]:
car.loc[:, cars_con_var].apply(fn_desc)

Unnamed: 0,Curb_weight,Engine_size,Fuel_capacity,Fuel_efficiency,Horsepower,Length,Power_perf_factor,Price_in_thousands,Wheelbase,Width,four_year_resale_value
dtype,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
cardinality,144,30,52,20,65,126,148,144,85,78,106
ntot,157,157,157,157,157,157,157,157,157,157,157
n,155,156,156,154,156,156,155,155,156,156,121
n_miss,2,1,1,3,1,1,2,2,1,1,36
n_miss_perc,0.012739,0.006369,0.006369,0.019108,0.006369,0.006369,0.012739,0.012739,0.006369,0.006369,0.229299
IQR,0.8285,1.275,3.775,5.0,65.5,18.55,29.007171,13.93,9.2,5.025,8.615
lc_iqr,1.72825,0.3875,10.1375,13.5,51.25,149.75,16.896951,-2.8775,89.2,60.8625,-1.6625
uc_iqr,5.04225,5.4875,25.2375,33.5,313.25,223.95,132.925634,52.8425,126.0,80.9625,32.7975
sum,522.282,474.5625,2774.8375,3660.5,28839.5,29225.15,11856.904777,4091.5245,16736.6,11099.4,2016.275


In [208]:
def categorical_var_summary(x):
    Mode = x.value_counts().sort_values(ascending = False)[0:1].reset_index()
    return pd.Series([x.count(), x.isnull().sum(), Mode.iloc[0, 0], Mode.iloc[0, 1], 
                          round(Mode.iloc[0, 1] * 100/x.count(), 2)], 
                  index = ['N', 'NMISS', 'MODE', 'FREQ', 'PERCENT'])

In [209]:
car.loc[:, cars_cat_var].apply(categorical_var_summary)

Unnamed: 0,Manufacturer,Model,Vehicle_type
N,157,157,157
NMISS,0,0,0
MODE,Dodge,Neon,Passenger
FREQ,11,2,116
PERCENT,7.01,1.27,73.89


In [210]:
Y = 'Sales_in_thousands'
X = car.columns.difference(['Sales_in_thousands','Latest_Launch'])

In [211]:
X_train, X_test, y_train, y_test = train_test_split(car.loc[:, X], car.loc[:, Y], test_size=0.33, random_state=12)

In [212]:
num_pipeline = Pipeline(steps=[
    ('impute', KNNImputer(n_neighbors= 5, weights= "distance")),
    ('scale',StandardScaler())
])

In [213]:
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot',OneHotEncoder(handle_unknown='ignore'))
])

In [214]:
col_trans = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, cars_con_var),
    ('cat_pipeline', cat_pipeline, cars_cat_var)
])

In [215]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [216]:
model = RandomForestRegressor(random_state= 14)
regr_pipeline = Pipeline(steps= [
    ('col_trans', col_trans)
])

In [217]:
param_grid = [{   
                  'n_estimators': list(range(7, 16)),
                  'max_depth': list(range(0,5)),
                  'max_features': list(range(0,10)),
                  'criterion': ['squared_error', 'Poisson'],
                }]

In [226]:
rf_cv = GridSearchCV(model, param_grid,
                                   scoring= 'neg_mean_absolute_percentage_error' )

In [227]:
from sklearn.metrics import r2_score

regr = make_pipeline(regr_pipeline, 
                     rf_cv)



In [228]:
regr.fit(X_train, y_train)

2880 fits failed out of a total of 4500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\kroop\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\kroop\anaconda3\lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "c:\Users\kroop\anaconda3\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\kroop\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidPara

In [233]:
rf_cv.best_estimator_.fit(X_train, y_train)

ValueError: could not convert string to float: 'Ford'

In [231]:
from sklearn.metrics import mean_absolute_percentage_error


train_pred = regr.predict(X_train)
mean_absolute_percentage_error(y_train, train_pred)

2.487980991250439

In [221]:
regr_pipeline.fit(X_train, y_train)

In [222]:
regr.best_estimator_

AttributeError: 'Pipeline' object has no attribute 'best_estimator_'

In [None]:
y_pred_ts = regr_pipeline.predict(X_test)

In [None]:
y_pred_tr = regr_pipeline.predict(X_train)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error

In [None]:
mean_absolute_percentage_error(y_train, y_pred_tr)

0.5082392766240555

In [None]:
mean_absolute_percentage_error(y_test, y_pred_ts)

9.10386060621312