In [2]:
#Extracción de Datos
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
%matplotlib inline

# Load the petrol-consumption CSV; the path is relative to the notebook's working directory.
dataset = pd.read_csv('petrol_consumption.csv')
# Preview the first rows (last expression of the cell, so it is rendered as the cell output).
dataset.head()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410


In [3]:
# Descriptive analysis: per-column summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
count,48.0,48.0,48.0,48.0,48.0
mean,7.668333,4241.833333,5565.416667,0.570333,576.770833
std,0.95077,573.623768,3491.507166,0.05547,111.885816
min,5.0,3063.0,431.0,0.451,344.0
25%,7.0,3739.0,3110.25,0.52975,509.5
50%,7.5,4298.0,4735.5,0.5645,568.5
75%,8.125,4578.75,7156.0,0.59525,632.75
max,10.0,5342.0,17782.0,0.724,968.0


In [4]:
# Preparation / preprocessing: separate the four predictor columns (X)
# from the target column (y).
X = dataset.loc[:, [
    'Petrol_tax',
    'Average_income',
    'Paved_Highways',
    'Population_Driver_licence(%)',
]]
y = dataset.loc[:, 'Petrol_Consumption']

In [5]:
# Split the data into a training set and a test set.
# 20% of the rows are held out for testing; random_state=0 fixes the shuffle
# so the same split is obtained on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [7]:
# Training: fit a linear regression on the training split.
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# The fit yields one coefficient per predictor; tabulate them labelled by
# the predictor's column name.
coeff_df = pd.DataFrame({'Coefficient': regressor.coef_}, index=X.columns)

# Interpretation: each coefficient is the change in predicted Petrol_Consumption
# for a one-unit increase in that predictor with the other predictors held fixed.
#   - Petrol_tax: about -40 per unit of tax (the original note reads the target
#     as millions of gallons; the unit is not verifiable from this notebook).
#   - Population_Driver_licence(%): about +1,342 per unit. The column is a
#     proportion (observed range 0.451-0.724 in describe()), so a full "unit"
#     is outside the data; roughly +13.4 per percentage point is the usable reading.
#   - Average_income and Paved_Highways have small per-unit coefficients.
#     NOTE(review): these columns are on much larger scales (std of about 574 and
#     3,492 versus about 0.95 for Petrol_tax), so raw coefficient size alone
#     does not show that their effect is small; compare on standardized features.
coeff_df

Unnamed: 0,Coefficient
Petrol_tax,-40.01666
Average_income,-0.065413
Paved_Highways,-0.004741
Population_Driver_licence(%),1341.862121


In [8]:
# Predictions: apply the fitted model to the held-out test features.
y_pred = regressor.predict(X_test)

In [9]:
# Manual comparison: actual and predicted values side by side, keeping the
# index labels carried by y_test.
df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
df

Unnamed: 0,Actual,Predicted
29,534,469.391989
4,410,545.645464
26,577,589.668394
30,571,569.730413
32,577,649.774809
37,704,646.631164
34,487,511.608148
40,587,672.475177
7,467,502.074782
10,580,501.270734


In [11]:
# Evaluation: error metrics of the predictions on the held-out test set.
from sklearn import metrics

mean_abs_error = metrics.mean_absolute_error(y_test, y_pred)
# Compute the MSE once and reuse it for the RMSE instead of calling the metric twice.
mean_sq_error = metrics.mean_squared_error(y_test, y_pred)
root_mean_sq_error = np.sqrt(mean_sq_error)

print('Mean Absolute Error:', mean_abs_error)
print('Mean Squared Error:', mean_sq_error)
print('Root Mean Squared Error:', root_mean_sq_error)
# The RMSE is about 68, slightly more than 10% of the mean petrol consumption
# over the whole dataset (mean = 576.770833, from describe()).
# So the model is not very accurate, but it can still make reasonably good predictions.

Mean Absolute Error: 56.822247478964684
Mean Squared Error: 4666.3447875883585
Root Mean Squared Error: 68.31064915215165
