<a href="https://kritikseth.github.io/ipynbtagredirect" target="_parent"><img src="https://raw.githack.com/kritikseth/kritikseth/master/assets/icons/kritik_ipynbtagredirect.svg" alt="Kritik Seth"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, r2_score

### Linear Regression on Fuel Consumption Dataset

**Import the Dataset**

File 1: https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/ML0101ENv3/labs/FuelConsumptionCo2.csv

OR

File 2: https://raw.githubusercontent.com/kritikseth/Datasets/master/Fuel_Consumpltion.csv

In [2]:
fuel = pd.read_csv('https://raw.githubusercontent.com/kritikseth/Datasets/master/Fuel_Consumpltion.csv')

In [3]:
fuel.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [4]:
fuel.shape

(1067, 13)

In [5]:
fuel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067 entries, 0 to 1066
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   MODELYEAR                 1067 non-null   int64  
 1   MAKE                      1067 non-null   object 
 2   MODEL                     1067 non-null   object 
 3   VEHICLECLASS              1067 non-null   object 
 4   ENGINESIZE                1067 non-null   float64
 5   CYLINDERS                 1067 non-null   int64  
 6   TRANSMISSION              1067 non-null   object 
 7   FUELTYPE                  1067 non-null   object 
 8   FUELCONSUMPTION_CITY      1067 non-null   float64
 9   FUELCONSUMPTION_HWY       1067 non-null   float64
 10  FUELCONSUMPTION_COMB      1067 non-null   float64
 11  FUELCONSUMPTION_COMB_MPG  1067 non-null   int64  
 12  CO2EMISSIONS              1067 non-null   int64  
dtypes: float64(4), int64(4), object(5)
memory usage: 108.5+ KB


In [6]:
fuel.describe()

Unnamed: 0,MODELYEAR,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
count,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0,1067.0
mean,2014.0,3.346298,5.794752,13.296532,9.474602,11.580881,26.441425,256.228679
std,0.0,1.415895,1.797447,4.101253,2.79451,3.485595,7.468702,63.372304
min,2014.0,1.0,3.0,4.6,4.9,4.7,11.0,108.0
25%,2014.0,2.0,4.0,10.25,7.5,9.0,21.0,207.0
50%,2014.0,3.4,6.0,12.6,8.8,10.9,26.0,251.0
75%,2014.0,4.3,8.0,15.55,10.85,13.35,31.0,294.0
max,2014.0,8.4,12.0,30.2,20.5,25.8,60.0,488.0


In [7]:
fuel['MODELYEAR'].value_counts()

2014    1067
Name: MODELYEAR, dtype: int64

In [8]:
fuel.drop(['MODELYEAR'], axis=1, inplace=True)

In [9]:
fuel['VEHICLECLASS'].value_counts()

MID-SIZE                    178
COMPACT                     172
SUV - SMALL                 154
SUV - STANDARD              110
FULL-SIZE                    86
TWO-SEATER                   71
SUBCOMPACT                   65
PICKUP TRUCK - STANDARD      62
MINICOMPACT                  47
STATION WAGON - SMALL        36
VAN - PASSENGER              25
VAN - CARGO                  22
MINIVAN                      14
PICKUP TRUCK - SMALL         12
SPECIAL PURPOSE VEHICLE       7
STATION WAGON - MID-SIZE      6
Name: VEHICLECLASS, dtype: int64

In [10]:
fuel['MODEL'].value_counts().count()

663

Since there are too many unique values in a categorical variable as compared to the size of the data, we will drop 'MODEL'

In [11]:
fuel.drop(['MODEL'], axis=1, inplace=True)

In [12]:
fuel['TRANSMISSION'].value_counts()

A6     222
AS6    189
M6     141
A8      87
AS8     80
AS7     76
M5      48
AV      46
A4      45
AM7     34
A5      30
A7      12
AV6     11
AS5     10
M7       9
A9       8
AM6      6
AV7      5
AV8      3
AS9      2
AM5      2
AS4      1
Name: TRANSMISSION, dtype: int64

In [13]:
fuel['MAKE'].value_counts().count()

39

In [14]:
fuel['FUELTYPE'].value_counts()

X    514
Z    434
E     92
D     27
Name: FUELTYPE, dtype: int64

In [15]:
le = LabelEncoder()
label_enc = ['TRANSMISSION', 'VEHICLECLASS', 'MAKE', 'FUELTYPE']
for i in label_enc:
  fuel[i] = le.fit_transform(fuel[i])

In [16]:
fuel.head()

Unnamed: 0,MAKE,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,0,0,2.0,4,10,3,9.9,6.7,8.5,33,196
1,0,0,2.4,4,20,3,11.2,7.7,9.6,29,221
2,0,0,1.5,4,17,3,6.0,5.8,5.9,48,136
3,0,11,3.5,6,11,3,12.7,9.1,11.1,25,255
4,0,11,3.5,6,11,3,12.1,8.7,10.6,27,244


Since we know 'ENGINESIZE' and 'CYLINDERS', we can find the volume of each cylinder using the following formula- 'ENGINESIZE'/'CYLINDERS'

In [17]:
fuel['CYLINDER_VOL'] = fuel['ENGINESIZE']/fuel['CYLINDERS']

In [18]:
stdsc = StandardScaler()

col = fuel.columns
fuel = stdsc.fit_transform(fuel)
fuel = pd.DataFrame(fuel, columns=col)

In [19]:
fuel

Unnamed: 0,MAKE,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS,CYLINDER_VOL
0,-1.575452,-1.244334,-0.951292,-0.998969,0.049460,1.011148,-0.828558,-0.993341,-0.884304,0.878553,-0.950840,-0.614432
1,-1.575452,-1.244334,-0.668653,-0.998969,1.548707,1.011148,-0.511433,-0.635329,-0.568571,0.342734,-0.556161,0.397412
2,-1.575452,-1.244334,-1.304591,-0.998969,1.098933,1.011148,-1.779932,-1.315552,-1.630581,2.887876,-1.898070,-1.879238
3,-1.575452,0.915493,0.108606,0.114242,0.199384,1.011148,-0.145519,-0.134112,-0.138027,-0.193086,-0.019397,0.228771
4,-1.575452,0.915493,0.108606,0.114242,0.199384,1.011148,-0.291885,-0.277317,-0.281542,0.074824,-0.193056,0.228771
...,...,...,...,...,...,...,...,...,...,...,...,...
1062,1.900538,0.915493,-0.244694,0.114242,0.199384,-0.373826,0.025240,0.116497,0.062894,-0.327041,0.233197,-0.614432
1063,1.900538,0.915493,-0.103374,0.114242,0.199384,-0.373826,-0.023548,0.009093,-0.023215,-0.193086,0.122687,-0.277151
1064,1.900538,0.915493,-0.244694,0.114242,0.199384,-0.373826,0.025240,0.116497,0.062894,-0.327041,0.233197,-0.614432
1065,1.900538,0.915493,-0.103374,0.114242,0.199384,-0.373826,-0.096731,-0.062510,-0.080621,-0.193086,0.059538,-0.277151


In [20]:
X = fuel.drop(['CO2EMISSIONS'], axis=1)
Y = fuel['CO2EMISSIONS']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)

In [22]:
linear = linear_model.LinearRegression()
ridge = linear_model.Ridge()
lasso= linear_model.Lasso()
elastic = linear_model.ElasticNet()
lasso_lars = linear_model.LassoLars()
bayes_ridge = linear_model.BayesianRidge()

models_churn = [linear, ridge, lasso, elastic, lasso_lars, bayes_ridge]

In [23]:
def cross_val(model):
  score = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
  print(f'Maximum R2: {max(score)}, StdDev in R2: {np.std(score)}')
  print('_'*100+'\n')

In [24]:
for i in models_churn:
  print(i, '\n')
  cross_val(i)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) 

Maximum R2: 0.9202107222465177, StdDev in R2: 0.0195858466912843
____________________________________________________________________________________________________

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001) 

Maximum R2: 0.9216386074170556, StdDev in R2: 0.01916131621750003
____________________________________________________________________________________________________

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False) 

Maximum R2: -0.0007737233638676067, StdDev in R2: 0.0018962736261288003
____________________________________________________________________________________________________

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      

In [25]:
alpha = [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
solver = ['auto', 'svd']
param_grid = dict(alpha = alpha, solver = solver)

In [26]:
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, scoring='r2', cv=20)
grid.fit(X_train, y_train)

GridSearchCV(cv=20, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100],
                         'solver': ['auto', 'svd']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [27]:
grid.best_params_

{'alpha': 1, 'solver': 'auto'}

In [28]:
ridge = linear_model.Ridge(alpha=10, solver='svd')
ridge.fit(X_train, y_train)
pred = ridge.predict(X_test)

In [29]:
print(f'R2 Score: {r2_score(y_test, pred)}')

R2 Score: 0.8907264605405326


### Classification on Iris Dataset

In [30]:
from sklearn.datasets import load_iris
iris = load_iris()
iris = pd.DataFrame(data= np.c_[iris['data'], iris['target']],columns= iris['feature_names'] + ['target'])

In [31]:
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [32]:
iris.shape

(150, 5)

In [33]:
iris.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [34]:
X = iris.drop(['target'], axis=1)
Y = iris['target']

In [35]:
X = stdsc.fit_transform(X)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4)

In [37]:
logistics = linear_model.LogisticRegression()
sgd = linear_model.SGDClassifier()
passagg = linear_model.PassiveAggressiveClassifier()
ridgecv = linear_model.RidgeClassifierCV()
ridgeclass = linear_model.RidgeClassifier()

models_log = [logistics, sgd, passagg, ridgecv, ridgeclass]

In [38]:
def cross_val(model):
  score = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
  print(f'Maximum Accuracy: {max(score)}, StdDev in R2: {np.std(score)}')
  print('_'*100+'\n')

In [39]:
for i in models_log:
  print(i, '\n')
  cross_val(i)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) 

Maximum Accuracy: 1.0, StdDev in R2: 0.03513641844631534
____________________________________________________________________________________________________

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False) 

Maximum Accuracy: 1.0, StdDev in R2: 0.09686442096757052
______________________________

In [40]:
solver = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
param_grid = dict(solver = solver)

In [41]:
grid = GridSearchCV(estimator = logistics, param_grid = param_grid, scoring = 'accuracy', cv = 10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'solver': ['liblinear', 'newton-cg', 'lbfgs', 'sag',
                                    'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [42]:
grid.best_params_

{'solver': 'newton-cg'}

In [43]:
logi = linear_model.LogisticRegression(solver='newton-cg')
logi.fit(X_train, y_train)
pred = logi.predict(X_test)

In [44]:
print(f'Accuracy Score: {accuracy_score(y_test, pred)}')

Accuracy Score: 0.9666666666666667
