# Importing libraries

In [42]:
import pickle
import pandas as pd
import numpy as np
from scipy import stats

# Loading pickled dataframe

In [43]:
# Restore the cleaned DataFrame from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
with open('clean_data.pkl', mode='rb') as pickle_file:
    dataset = pickle.load(pickle_file)

In [44]:
# Preview the first rows to check that the DataFrame loaded as expected
dataset.head()

Unnamed: 0,speed,temp_outside,gas_type,AC,rain,sun,temp_inside,distance,consume
0,26,12,E10,0,0,0,21.5,28.0,5.0
1,30,13,E10,0,0,0,21.5,12.0,4.2
2,38,15,E10,0,0,0,21.5,11.2,5.5
3,36,14,E10,0,0,0,21.5,12.9,3.9
4,46,15,E10,0,0,0,21.5,18.5,4.5


In [45]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun,temp_inside,distance,consume
count,776.0,776.0,776.0,776.0,776.0,776.0,776.0,776.0
mean,41.927835,11.358247,0.07732,0.123711,0.082474,21.929521,19.652835,4.912371
std,13.589747,6.98703,0.26727,0.329464,0.275263,0.994024,22.653208,1.032505
min,14.0,-5.0,0.0,0.0,0.0,19.0,1.3,3.3
25%,32.75,7.0,0.0,0.0,0.0,21.5,11.8,4.3
50%,40.5,10.0,0.0,0.0,0.0,22.0,14.6,4.7
75%,50.0,16.0,0.0,0.0,0.0,22.5,19.0,5.3
max,90.0,31.0,1.0,1.0,1.0,25.5,216.1,12.2


# Hypothesis testing

In [46]:
#Running an independent two-sample t-test to check for differences between the SP98 and E10 gas types in terms of consumption, distance, speed and temperature (inside and outside)

In [47]:
def t_test_features(s1, s2, features=('speed', 'temp_outside', 'temp_inside', 'distance', 'consume'), equal_var=True):
    """Run an independent two-sample t-test on each feature of two samples.

    Args:
        s1 (dataframe): sample 1
        s2 (dataframe): sample 2
        features (iterable of str): column names to test; every name must be
            a column of both ``s1`` and ``s2``
        equal_var (bool): forwarded to ``scipy.stats.ttest_ind``. ``True``
            (the default) runs the standard test that assumes equal
            population variances; ``False`` runs Welch's t-test, which does
            not make that assumption.

    Returns:
        dict: maps each feature name to the p-value of its t-test, in the
        order the features were given
    """
    # The p-value is read by attribute rather than by position so that it is
    # obvious which part of the test result ends up in the dictionary.
    return {
        feature: stats.ttest_ind(s1[feature], s2[feature], equal_var=equal_var).pvalue
        for feature in features
    }

In [48]:
# Split the rows into one DataFrame per gas type (SP98 first, then E10)
sp98, e10 = (dataset[dataset['gas_type'] == fuel] for fuel in ('SP98', 'E10'))

In [49]:
#Checking the mean and standard deviation of both groups

In [50]:
# Summary statistics of the numeric columns for the SP98 group
sp98.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun,temp_inside,distance,consume
count,456.0,456.0,456.0,456.0,456.0,456.0,456.0,456.0
mean,40.820175,12.22807,0.100877,0.140351,0.087719,21.938007,18.639912,4.899123
std,13.155641,7.263378,0.301497,0.347732,0.283197,1.176545,24.153013,1.117178
min,16.0,-3.0,0.0,0.0,0.0,19.0,1.3,3.3
25%,32.0,7.0,0.0,0.0,0.0,21.5,11.8,4.2
50%,39.5,11.0,0.0,0.0,0.0,22.0,14.15,4.7
75%,48.0,17.0,0.0,0.0,0.0,22.0,18.15,5.225
max,90.0,31.0,1.0,1.0,1.0,25.5,216.1,12.2


In [51]:
# Summary statistics of the numeric columns for the E10 group
e10.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun,temp_inside,distance,consume
count,320.0,320.0,320.0,320.0,320.0,320.0,320.0,320.0
mean,43.50625,10.11875,0.04375,0.1,0.075,21.917429,21.09625,4.93125
std,14.055866,6.382158,0.204859,0.30047,0.263804,0.652576,20.27538,0.899543
min,14.0,-5.0,0.0,0.0,0.0,21.0,1.7,3.7
25%,35.0,6.0,0.0,0.0,0.0,21.5,12.075,4.4
50%,42.0,9.0,0.0,0.0,0.0,21.5,15.4,4.8
75%,51.0,14.25,0.0,0.0,0.0,22.5,21.2,5.3
max,88.0,27.0,1.0,1.0,1.0,25.0,130.3,10.8


In [52]:
# p-value of the SP98 vs E10 t-test for each of the default features
t_test_features(sp98, e10)

{'speed': 0.006643089547999849,
 'temp_outside': 3.197717172316492e-05,
 'temp_inside': 0.776698809489623,
 'distance': 0.13713832082712782,
 'consume': 0.6698915506471721}

In [53]:
#The only differences that are statistically significant (p < 0.05) are in speed and in the temperature outside, and as we saw earlier,
#these two variables are negatively correlated with consumption. In other words: as speed decreases, consumption increases,
#and as temperature outside decreases, consumption increases.

#That might be why Cobify thinks that E10 cars consume more; although it seems like they do (slightly),
#the difference in consumption isn't statistically significant (p = 0.67) or meaningful

# Predicting consumption

In [54]:
#First we should transform all features to numerical

In [55]:
# One-hot encode gas_type into indicator columns whose names start with 'gas_type'
dummies = pd.get_dummies(dataset['gas_type'], prefix='gas_type')

In [56]:
# Place the indicator columns next to the original columns (axis=1 concatenates column-wise)
data = pd.concat([dataset, dummies], axis=1)

In [57]:
# Drop the original gas_type column now that its indicator columns are in place.
# Reassigning instead of using inplace=True keeps the cell free of hidden in-place mutation.
data = data.drop(columns='gas_type')

In [58]:
# Inspect the encoded frame: gas_type is gone and gas_type_E10 / gas_type_SP98 take its place
data

Unnamed: 0,speed,temp_outside,AC,rain,sun,temp_inside,distance,consume,gas_type_E10,gas_type_SP98
0,26,12,0,0,0,21.5,28.0,5.0,1,0
1,30,13,0,0,0,21.5,12.0,4.2,1,0
2,38,15,0,0,0,21.5,11.2,5.5,1,0
3,36,14,0,0,0,21.5,12.9,3.9,1,0
4,46,15,0,0,0,21.5,18.5,4.5,1,0
...,...,...,...,...,...,...,...,...,...,...
771,39,18,0,0,0,24.5,16.0,3.7,0,1
772,38,31,1,0,0,25.0,16.1,4.3,0,1
773,45,19,0,0,0,25.0,16.0,3.8,0,1
774,42,31,1,0,0,25.0,15.4,4.6,0,1


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# The target is the consumption column; every other column is used as a feature
y = data['consume']
X = data.drop(columns='consume')

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the linear regression on the training rows only
reg = LinearRegression()
reg.fit(X_train, y_train)

# Predict consumption for the held-out rows
y_pred = reg.predict(X_test)

# Report mean squared error and R squared on the test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared:', r2)

Mean Squared Error: 0.9231227990691891
R-squared: 0.13978040531591962


In [61]:
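#The MSE (~0.92) looks small in absolute terms, but the R squared (~0.14) is very poor: the linear model explains only about 14% of the variance in consumption on the test set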

# Gradient boosting regressor

In [63]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with 100 estimators of max depth 5; random_state fixes the seed
gb_reg = GradientBoostingRegressor(n_estimators=100, max_depth=5, random_state=42)
gb_reg.fit(X_train, y_train)

# Score on the held-out rows (shown as the cell output)
gb_reg.score(X_test, y_test)

0.8845448282487295

In [65]:
# Evaluate the gradient boosting model on the held-out rows
y_pred = gb_reg.predict(X_test)

mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print('Mean Squared Error:', mse)
print('R-squared:', r2)

Mean Squared Error: 0.12389778374345048
R-squared: 0.8845448282487295


In [66]:
#A lot better!

# Saving dataframe to create report

In [68]:
#Saving the original DataFrame (the one that still has the gas_type column, i.e. before one-hot encoding)

In [69]:
# Write the DataFrame to 'data.csv'; index=False leaves the row index out of the file
dataset.to_csv('data.csv', index=False)