### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Loading dataframe

In [2]:
df = pd.read_csv(r'../data/CO2 Emissions_Canada.csv')

In [3]:
# Store the cylinder count with object dtype: 'Cylinders' is listed in
# cat_cols further down and is one-hot encoded rather than scaled.
df['Cylinders'] = df['Cylinders'].astype(object)

In [4]:
# Drop the columns that are not used as model inputs.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
df = df.drop(columns=['Fuel Consumption Comb (L/100 km)', 'Model'], errors='ignore')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7385 entries, 0 to 7384
Data columns (total 10 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Make                              7385 non-null   object 
 1   Vehicle Class                     7385 non-null   object 
 2   Engine Size(L)                    7385 non-null   float64
 3   Cylinders                         7385 non-null   object 
 4   Transmission                      7385 non-null   object 
 5   Fuel Type                         7385 non-null   object 
 6   Fuel Consumption City (L/100 km)  7385 non-null   float64
 7   Fuel Consumption Hwy (L/100 km)   7385 non-null   float64
 8   Fuel Consumption Comb (mpg)       7385 non-null   int64  
 9   CO2 Emissions(g/km)               7385 non-null   int64  
dtypes: float64(3), int64(2), object(5)
memory usage: 577.1+ KB


In [6]:
# Column groups used throughout the notebook.

# Features that are one-hot encoded.
cat_cols = [
    'Make',
    'Vehicle Class',
    'Cylinders',
    'Transmission',
    'Fuel Type',
]

# Features that go through the numeric transformer.
quan_cols = [
    'Engine Size(L)',
    'Fuel Consumption City (L/100 km)',
    'Fuel Consumption Hwy (L/100 km)',
    'Fuel Consumption Comb (mpg)',
]

# Kept as a one-element list, so df[target] yields a single-column DataFrame.
target = ['CO2 Emissions(g/km)']

#### Getting all categories in categorical columns

In [7]:
df_cat = df[cat_cols]

In [8]:
pre_ohe = OneHotEncoder(handle_unknown='ignore')

In [9]:
pre_ohe.fit(df_cat)

OneHotEncoder(handle_unknown='ignore')

In [10]:
# Per-column category lists learned by pre_ohe, which was fitted on
# df[cat_cols], i.e. the whole dataset before the train/test split.
# They are passed as `categories=` to the OneHotEncoder in cat_pipeline;
# presumably so the encoded column layout does not depend on which
# categories happen to land in the training split.
cat_col_categories = list(pre_ohe.categories_)

#### Train test split

In [11]:
# Target first: `target` is a list, so y is a single-column DataFrame
# (later cells flatten it to 1-D before fitting/scoring).
y = df[target]

# Everything except the target column is a feature.
X = df.drop(columns=target)

In [12]:
X

Unnamed: 0,Make,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (mpg)
0,ACURA,COMPACT,2.0,4,AS5,Z,9.9,6.7,33
1,ACURA,COMPACT,2.4,4,M6,Z,11.2,7.7,29
2,ACURA,COMPACT,1.5,4,AV7,Z,6.0,5.8,48
3,ACURA,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,25
4,ACURA,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,27
...,...,...,...,...,...,...,...,...,...
7380,VOLVO,SUV - SMALL,2.0,4,AS8,Z,10.7,7.7,30
7381,VOLVO,SUV - SMALL,2.0,4,AS8,Z,11.2,8.3,29
7382,VOLVO,SUV - SMALL,2.0,4,AS8,Z,11.7,8.6,27
7383,VOLVO,SUV - STANDARD,2.0,4,AS8,Z,11.2,8.3,29


In [13]:
y

Unnamed: 0,CO2 Emissions(g/km)
0,196
1,221
2,136
3,255
4,244
...,...
7380,219
7381,232
7382,240
7383,232


In [14]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

In [15]:
X_train

Unnamed: 0,Make,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (mpg)
479,GMC,SUV - STANDARD,5.3,8,A6,X,16.0,11.1,20
3648,FORD,SUV - STANDARD,3.5,6,AS6,E,18.7,12.9,18
4751,FORD,PICKUP TRUCK - STANDARD,5.0,8,AS10,X,14.6,10.9,22
6224,MERCEDES-BENZ,TWO-SEATER,2.0,4,A9,Z,10.0,7.3,32
7327,TOYOTA,COMPACT,1.8,4,AV,X,7.9,6.1,40
...,...,...,...,...,...,...,...,...,...
4931,JAGUAR,SUV - SMALL,2.0,4,AS8,Z,10.7,8.8,29
3264,VOLKSWAGEN,COMPACT,1.8,4,AS6,X,9.3,6.5,35
1653,JAGUAR,FULL-SIZE,5.0,8,AS8,E,20.6,13.6,16
2607,FORD,SUV - STANDARD,3.5,6,AS6,X,14.7,10.7,22


In [16]:
X_test

Unnamed: 0,Make,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (mpg)
6307,PORSCHE,MINICOMPACT,3.0,6,M7,Z,12.0,8.2,27
5036,LAND ROVER,SUV - SMALL,2.0,4,AS8,D,9.2,7.8,33
1995,PORSCHE,MINICOMPACT,3.8,6,AM7,Z,12.5,9.0,26
4156,NISSAN,MID-SIZE,2.5,4,AV7,X,8.8,6.5,36
6328,PORSCHE,MINICOMPACT,3.0,6,AM7,Z,11.0,8.5,29
...,...,...,...,...,...,...,...,...,...
4340,VOLKSWAGEN,COMPACT,1.8,4,AM6,X,9.4,6.8,34
3891,JEEP,SUV - SMALL,2.0,4,M5,X,10.3,7.9,30
667,LEXUS,COMPACT,2.5,6,AS6,Z,11.8,8.7,27
1859,MERCEDES-BENZ,SUV - STANDARD,5.5,8,AS7,Z,18.2,14.0,17


#### Building pipeline

In [17]:
# Numeric branch: a single power-transform step.
# NOTE(review): the step is keyed 'std_scaler' although it wraps a
# PowerTransformer, not a StandardScaler. The key is left unchanged because
# it is part of the pipeline's parameter names.
quan_pipeline = Pipeline(steps=[('std_scaler', PowerTransformer())])

# Sanity check: fit on the numeric training columns and inspect the result.
quan_transformed = quan_pipeline.fit_transform(X_train[quan_cols])
quan_transformed

array([[ 1.42722056,  1.01454396,  1.00757343, -1.10512068],
       [ 0.47704091,  1.60834384,  1.57584459, -1.50869801],
       [ 1.2985109 ,  0.67179427,  0.93622792, -0.73839573],
       ...,
       [ 1.2985109 ,  1.98308863,  1.7667108 , -1.95735443],
       [ 0.47704091,  0.69719059,  0.86297981, -0.73839573],
       [-0.88492844, -0.76647857, -0.91204201,  0.83606729]])

In [18]:
# Categorical branch: one-hot encode with the category lists collected
# earlier from the full dataset (cat_col_categories), ignoring unknown values.
cat_pipeline = Pipeline(steps=[
    (
        'one_hot_encoder',
        OneHotEncoder(categories=cat_col_categories, handle_unknown='ignore'),
    ),
])

# Sanity check: encode the categorical training columns and view them densely.
cat_transformed = cat_pipeline.fit_transform(X_train[cat_cols])
cat_transformed.toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [19]:
# Full preprocessing step: route each column group to its own branch.
# The numeric block comes first in the output, followed by the one-hot block.
data_pipeline = ColumnTransformer(transformers=[
    ('numerical', quan_pipeline, quan_cols),
    ('categorical', cat_pipeline, cat_cols),
])

# Fit on the training features and show the first processed row as a dense vector.
train_data_processed = data_pipeline.fit_transform(X_train)
train_data_processed.toarray()[0]

array([ 1.42722056,  1.01454396,  1.00757343, -1.10512068,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.  

In [20]:
# Baseline model: preprocessing followed by a LightGBM regressor.
# The 'scaling' step holds the whole ColumnTransformer (numeric transform
# plus one-hot encoding), not just a scaler.
model_pipeline = Pipeline(steps=[
    ('scaling', data_pipeline),
    ('model', LGBMRegressor(random_state=0)),
])

In [21]:
Y_train.values.reshape(-1,)

array([317, 264, 304, ..., 280, 303, 202])

In [22]:
model_pipeline.fit(X_train, Y_train.values.reshape(-1, ))

Pipeline(steps=[('scaling',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('std_scaler',
                                                                   PowerTransformer())]),
                                                  ['Engine Size(L)',
                                                   'Fuel Consumption City '
                                                   '(L/100 km)',
                                                   'Fuel Consumption Hwy '
                                                   '(L/100 km)',
                                                   'Fuel Consumption Comb '
                                                   '(mpg)']),
                                                 ('categorical',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   OneHotEncoder(categories=[array(

In [23]:
pred = model_pipeline.predict(X_test)

In [24]:
pred

array([241.02303467, 222.53264636, 253.03544837, 183.40088277,
       229.02452974, 199.03443501, 298.66677647, 274.41133631,
       200.62780037, 277.67415548, 253.72451169, 252.66621125,
       194.33930405, 317.37928841, 256.34184789, 240.33381734,
       329.5342547 , 277.26473101, 242.70593067, 218.7728639 ,
       245.91756505, 199.37133038, 362.00894763, 274.19467224,
       235.2956961 , 276.27087669, 179.62879611, 397.46600574,
       243.78964137, 233.04088884, 183.11517232, 242.13001881,
       318.20434778, 242.49124241, 305.03993146, 236.28515767,
       263.78505433, 372.84000389, 366.93573216, 243.77203146,
       268.15556519, 259.48079989, 326.35411022, 277.10777875,
       275.19809126, 219.51456859, 262.94604035, 294.62547392,
       245.52752107, 265.0834101 , 251.83405577, 235.17800507,
       232.1484803 , 283.86203792, 342.48446501, 212.09203624,
       254.09980661, 326.46407345, 288.27217598, 211.63324354,
       313.05336799, 254.65487354, 259.98571958, 168.31

In [25]:
mean_absolute_error(Y_test.values.reshape(-1, ), pred)

2.3741065983061147

#### Using LDA (Experiment)

In [26]:
# Inputs for fitting LDA directly, outside a Pipeline.
# The target is flattened from a single-column DataFrame to a 1-D array.
Y_transformed = Y_train.to_numpy().ravel()

# Re-fits data_pipeline on the training features and densifies the result.
X_transformed = data_pipeline.fit_transform(X_train).toarray()

In [27]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [28]:
lda = LinearDiscriminantAnalysis()

In [29]:
lda.fit(X_transformed, Y_transformed)

LinearDiscriminantAnalysis()

In [30]:
lda_var_ratios = lda.explained_variance_ratio_

In [31]:
def select_n_components(var_ratio, goal_var: float) -> int:
    """Count the leading components needed to reach a cumulative variance goal.

    Walks ``var_ratio`` in order, keeping a running sum, and stops at the
    first position where that sum is at least ``goal_var``.

    Parameters
    ----------
    var_ratio : iterable of float
        Explained-variance ratio of each component, in the order in which
        they should be accumulated.
    goal_var : float
        Cumulative explained variance to reach.

    Returns
    -------
    int
        Number of components consumed when the goal was first met. If the
        goal is never met, this is the total number of entries in
        ``var_ratio`` (0 for an empty input).
    """
    cumulative = 0.0
    used = 0  # stays 0 when var_ratio is empty

    for used, ratio in enumerate(var_ratio, start=1):
        cumulative += ratio
        if cumulative >= goal_var:
            break

    return used

In [32]:
select_n_components(lda_var_ratios, 0.99)

16

In [33]:
def convert_sparse_to_array(x):
    """Return ``x`` as a dense NumPy array.

    Used through a FunctionTransformer between the ColumnTransformer and
    LinearDiscriminantAnalysis in the LDA pipeline.

    Parameters
    ----------
    x : sparse matrix or array-like
        Output of the preceding pipeline step. Objects exposing
        ``.toarray()`` (e.g. scipy sparse matrices) are densified; anything
        else is passed through ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        Dense representation of ``x``.
    """
    # A ColumnTransformer can emit a dense array instead of a sparse matrix
    # (see its sparse_threshold setting in the scikit-learn docs); calling
    # .toarray() unconditionally would then raise AttributeError.
    if hasattr(x, 'toarray'):
        return x.toarray()
    return np.asarray(x)

In [34]:
conv_spars_2_array_fn = FunctionTransformer(convert_sparse_to_array)

In [35]:
# Experimental variant: preprocessing -> densify -> LDA projection -> LightGBM.
# NOTE(review): n_components=30 is hard-coded, whereas
# select_n_components(lda_var_ratios, 0.99) above returned 16 - TODO confirm
# which value is intended.
# NOTE(review): LDA is fitted with the integer CO2 values as its labels.
lda_model_pipeline = Pipeline(steps=[
    ('scaling', data_pipeline),
    ('convert_sparse_to_array', conv_spars_2_array_fn),
    ('lda', LinearDiscriminantAnalysis(n_components=30)),
    ('model', LGBMRegressor(random_state=0)),
])

In [36]:
lda_model_pipeline.fit(X_train, Y_train.values.reshape(-1, ))

Pipeline(steps=[('scaling',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('std_scaler',
                                                                   PowerTransformer())]),
                                                  ['Engine Size(L)',
                                                   'Fuel Consumption City '
                                                   '(L/100 km)',
                                                   'Fuel Consumption Hwy '
                                                   '(L/100 km)',
                                                   'Fuel Consumption Comb '
                                                   '(mpg)']),
                                                 ('categorical',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   OneHotEncoder(categories=[array(

In [37]:
pred = lda_model_pipeline.predict(X_test)

In [38]:
mean_absolute_error(Y_test.values.reshape(-1, ), pred)

1.8801321096470345

In [39]:
import joblib

In [40]:
# Persist the fitted LDA + LightGBM pipeline to disk.
# NOTE(review): the pipeline contains a FunctionTransformer wrapping
# convert_sparse_to_array, which is defined in this notebook. Pickle stores
# functions by reference, so presumably the loading process must be able to
# resolve that function under the same module path - verify before deploying.
joblib.dump(lda_model_pipeline, r'../artefacts/lda_model_pipeline.pkl')

['../artefacts/lda_model_pipeline.pkl']

In [42]:
lda_model_pipeline.predict(X_test.iloc[0:1,:])

array([240.35853866])

In [45]:
type(X_test.iloc[0:1,:].values)

numpy.ndarray