# Predicting the CO2 emission for different vehicles

### Importing Needed packages

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
import numpy as np
%matplotlib inline

### Reading the data

In [4]:
# Load the fuel-consumption dataset; the CSV is expected in the working directory.
csv_path = "FuelConsumption.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the columns were parsed as expected.
df.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


### Feature Selection

In [6]:
# Predictor columns: engine characteristics plus the fuel-consumption measurements.
feature_columns = [
    'ENGINESIZE',
    'CYLINDERS',
    'FUELCONSUMPTION_CITY',
    'FUELCONSUMPTION_HWY',
    'FUELCONSUMPTION_COMB',
    'FUELCONSUMPTION_COMB_MPG',
]
X = df[feature_columns]

# Target: the double brackets keep y as a single-column DataFrame (2-D), not a Series.
y = df[['CO2EMISSIONS']]

X.head()

Unnamed: 0,ENGINESIZE,CYLINDERS,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG
0,2.0,4,9.9,6.7,8.5,33
1,2.4,4,11.2,7.7,9.6,29
2,1.5,4,6.0,5.8,5.9,48
3,3.5,6,12.7,9.1,11.1,25
4,3.5,6,12.1,8.7,10.6,27


### Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shapes of both partitions as a quick sanity check.
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

Train set: (853, 6) (853, 1)
Test set: (214, 6) (214, 1)


### Creating Model

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
# Baseline: a linear regression estimator created with scikit-learn's default settings.
model = LinearRegression()

In [10]:
# Fit on the training split only; the test split stays held out for evaluation.
model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [11]:
# Fitted coefficients: one value per feature column of X_train.
model.coef_

array([[  8.96115125,   7.24023686, -13.00156667,  -5.47215573,
         21.86697334,  -4.03446041]])

In [12]:
# Intercept term of the fitted linear model.
model.intercept_

array([262.57216087])

### Checking accuracy

In [13]:
from sklearn.metrics import r2_score

In [14]:
# Predict CO2 emissions for the held-out test features with the baseline model.
y_test_predict = model.predict(X_test)

In [15]:
# Coefficient of determination (R^2) of the baseline predictions on the test split.
r2_score(y_test, y_test_predict)

0.9034041224574011

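So, the model's R² score on the test set is 0.9034, i.e. it explains about 90.34% of the variance in CO2 emissions (R² is a goodness-of-fit measure, not a classification accuracy).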

### Improving Accuracy
Let's try to improve the accuracy of the model.

In [16]:
from sklearn.preprocessing import PolynomialFeatures

In [17]:
# Expand the six input features with interaction terms up to degree 6.
# interaction_only=True keeps only products of distinct features (no pure
# powers), which yields 64 columns including the constant bias column.
# NOTE(review): degree 6 was reported as giving the maximum score, presumably
# judged on the test split. If so, the test score below is optimistic; a
# validation split or cross-validation should be used to choose the degree.
poly = PolynomialFeatures(degree=6, interaction_only=True)

# Fit the transformer on the training data only ...
X_train_temp = poly.fit_transform(X_train)
# ... and merely apply it to the test data. Re-fitting on the test split
# (fit_transform) would learn the preprocessing step from held-out data.
X_test_temp = poly.transform(X_test)

X_train_temp.shape, X_test_temp.shape

((853, 64), (214, 64))

In [18]:
# Re-fit the same `model` object on the expanded interaction features.
# NOTE: this overwrites the baseline fit from earlier; after this cell the
# model expects the 64-column polynomial input, not the six raw features.
model.fit(X_train_temp, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# R^2 on the test split after re-fitting with the polynomial interaction features.
r2_score(y_test, model.predict(X_test_temp))

0.942656712168732

So, after adding the polynomial interaction features, the test-set R² score rises to 0.9427 (about 94.27%).

That is an improvement of around 4 percentage points over the plain linear model.

### Predicting CO2 emission on User Input

In [32]:
# Feature values for a single vehicle, given in the same column order as X:
# ENGINESIZE, CYLINDERS, FUELCONSUMPTION_CITY, FUELCONSUMPTION_HWY,
# FUELCONSUMPTION_COMB, FUELCONSUMPTION_COMB_MPG.
# Labelling the columns keeps the input consistent with the DataFrame the
# transformer was fitted on.
userInput = pd.DataFrame([[2.8, 6.0, 11.2, 7.5, 9.3, 32.0]], columns=X.columns)

# Apply the already-fitted transformer; calling fit_transform here would
# re-fit the preprocessing step on the single prediction row.
userInput = poly.transform(userInput)
y_pred = model.predict(userInput)

# y_pred is an array holding a single prediction; .item() extracts it as a
# Python float. Calling float() on an array with ndim > 0 is deprecated in
# recent NumPy releases.
print("Predicted CO2 emmision is", y_pred.item())

Predicted CO2 emmision is 214.2050005115234
