# Challenge 1 - Multiple Regression Analysis

# Before your start:

    Read the README.md file
    Comment as much as you can and use the resources (README.md file)
    Happy learning!

In [20]:
## Import the libraries for loading the data set 
## Load the dataset 
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

import statsmodels.api as sm
from statsmodels.formula.api import ols #Summary statistics

In [3]:
## You can use the warnings library to ignore warnings that might show when you run the code
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv('vehicles.csv')

In [17]:
data.columns

Index(['Make', 'Model', 'Year', 'Engine Displacement', 'Cylinders',
       'Transmission', 'Drivetrain', 'Vehicle Class', 'Fuel Type',
       'Fuel Barrels/Year', 'City MPG', 'Highway MPG', 'Combined MPG',
       'CO2 Emission Grams/Mile', 'Fuel Cost/Year'],
      dtype='object')

### Data Pre-processing (Handling Numerical variables)

In [5]:
## Store the specified numerical columns data as a separate dataframe. Give it the name "numerics"
numerics = data[['Engine Displacement', 'Fuel Barrels/Year', 'Combined MPG']].copy()

In [6]:
numerics.head(5)

Unnamed: 0,Engine Displacement,Fuel Barrels/Year,Combined MPG
0,2.5,19.388824,17
1,4.2,25.354615,13
2,2.5,20.600625,16
3,4.2,25.354615,13
4,3.8,20.600625,16


### MinMax scaler

Hint: Since we are using "numerics" to store the nummerical variables we can pass "numerics" directly
as MinMaxScaler().fit(numerics)

In [7]:
## Import the required library
## Perform the scaling and store the results inside "numerical"
from sklearn.preprocessing import MinMaxScaler
scaler  = MinMaxScaler().fit(numerics)

In [8]:
## Convert "numerical" into a dataframe so that it can be used later with the dataframe of categorical variables
numerical = scaler.transform(numerics)


In [9]:
numerical

array([[0.24358974, 0.4110142 , 0.20408163],
       [0.46153846, 0.53787268, 0.12244898],
       [0.24358974, 0.43678233, 0.18367347],
       ...,
       [0.05128205, 0.19341667, 0.59183673],
       [0.03846154, 0.19341667, 0.59183673],
       [0.03846154, 0.19897931, 0.57142857]])

In [10]:
numerical=pd.DataFrame(numerical)

In [23]:
numerical.dtypes

0    float64
1    float64
2    float64
dtype: object

### Data Pre-processing (Handling Categorical variables)

In [12]:
## Similar to numerical variables, store the specified categorical columns data as a dataframe. 
## Give it the name "cats"
cats= data[['Cylinders', 'Fuel Type', 'Drivetrain']].copy()

In [13]:
## Check if "cats" is actually a dataframe using cats.head(3)
cats.head(3)

Unnamed: 0,Cylinders,Fuel Type,Drivetrain
0,4.0,Regular,2-Wheel Drive
1,6.0,Regular,2-Wheel Drive
2,4.0,Regular,Rear-Wheel Drive


### Using One Hot Encoding 

In [14]:
# Perform One hot encoding and store the results (one hot encoded dataframe) into "categorical"
categorical=pd.get_dummies(cats, columns=['Cylinders', 'Fuel Type', 'Drivetrain'])

In [15]:
## Check how the new OHE data looks like using the head() function
categorical.head(3)

Unnamed: 0,Cylinders_2.0,Cylinders_3.0,Cylinders_4.0,Cylinders_5.0,Cylinders_6.0,Cylinders_8.0,Cylinders_10.0,Cylinders_12.0,Cylinders_16.0,Fuel Type_CNG,...,Fuel Type_Regular Gas and Electricity,Fuel Type_Regular Gas or Electricity,Drivetrain_2-Wheel Drive,"Drivetrain_2-Wheel Drive, Front",Drivetrain_4-Wheel Drive,Drivetrain_4-Wheel or All-Wheel Drive,Drivetrain_All-Wheel Drive,Drivetrain_Front-Wheel Drive,Drivetrain_Part-time 4-Wheel Drive,Drivetrain_Rear-Wheel Drive
0,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
X = pd.concat([numerical,categorical],axis=1)

In [19]:
## Store the target variable "CO2 emissions for vehicles" as a dataframe 'Y'
Y = pd.DataFrame(data['CO2 Emission Grams/Mile'])

In [21]:
## Import the libraries required for regression model 
## Fit the linear regression model on the data
lm = linear_model.LinearRegression()
model = lm.fit(X,Y)
lm.intercept_
lm.coef_
predictions  = lm.predict(X)


In [30]:
## Make predictions on the dataset, store the results in "predictions"

In [22]:
## Print the measures of accuracy of the model - MSE, RMSE, and R2 score
from sklearn.metrics import mean_squared_error, r2_score
print(mean_squared_error(Y, predictions))
r2_score(Y, predictions)

54.69433636217146


0.9961415130148252