In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import re

In [2]:
data = pd.read_csv('data_clean.csv')
data.dtypes

distance             float64
consume              float64
speed                  int64
temp_inside          float64
temp_outside           int64
gas_type              object
ac                     int64
rain                   int64
sun                    int64
consumtion_per_km    float64
price_per_km         float64
dtype: object

In [3]:
data['ac'] = data['ac'].astype(object)
data['rain'] = data['rain'].astype(object)
data['sun'] = data['sun'].astype(object)
data.dtypes

distance             float64
consume              float64
speed                  int64
temp_inside          float64
temp_outside           int64
gas_type              object
ac                    object
rain                  object
sun                   object
consumtion_per_km    float64
price_per_km         float64
dtype: object

### Hypothesis is that a car uses more fuel when driving on E10 (the 'greener' fuel)

In [4]:
# H0 = there is no difference which fuel is used
# H1 = e10 consumes more fuel
# alpha = 0.05

In [5]:
# separate the consume data for E10 and E98
consume_e10 = data[data['gas_type'] == 'E10']['consume']
consume_e98 = data[data['gas_type'] == 'SP98']['consume']

# perform a t-test for independent samples
t_statistic, p_value = stats.ttest_ind(consume_e10, consume_e98, equal_var=False)

# set the significance level (alpha)
alpha = 0.05

# compare p-value with alpha
if p_value < alpha:
    print("Reject null hypothesis: there is a significant difference in consumption between E10 and E98")
else:
    print("Cannot reject null hypothesis: there is no significant difference in consumption between E10 and E98")

Cannot reject null hypothesis: there is no significant difference in consumption between E10 and E98


In [6]:
# 1=E10, 2=SP98
data['gas_type'] = [re.sub('E10', '1', str(w)) for w in data['gas_type']]
data['gas_type'] = [re.sub('SP98', '2', str(w)) for w in data['gas_type']]

### Linear regression to predict consumption

In [7]:
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error


y = data['consume']
X = data.drop(['consume'], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_train_num = X_train.select_dtypes(np.number).reset_index()
X_test_num = X_test.select_dtypes(np.number).reset_index()
X_train_cat = X_train.select_dtypes(object)
X_test_cat = X_test.select_dtypes(object)


MinMaxtransformer = MinMaxScaler().fit(X_train_num)
X_train_num_normalized = pd.DataFrame(MinMaxtransformer.transform(X_train_num),columns=X_train_num.columns)
X_test_num_normalized = pd.DataFrame(MinMaxtransformer.transform(X_test_num),columns=X_train_num.columns)


encoder = OneHotEncoder(drop='first').fit(X_train_cat) 
cols = encoder.get_feature_names_out(input_features=X_train_cat.columns)
X_train_cat_encode = pd.DataFrame(encoder.transform(X_train_cat).toarray(),columns=cols)
X_test_cat_encode = pd.DataFrame(encoder.transform(X_test_cat).toarray(),columns=cols)


lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

predictions = lm.predict(X_train)
r2_score(y_train, predictions)
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)
mse=mean_squared_error(y_test,predictions_test)
rmse = np.sqrt(mean_squared_error(y_test,predictions_test))
print(rmse)
print(mse)

1.0824034034793824
1.1715971278637505


In [8]:
predictions_df = pd.DataFrame(predictions, index = y_train.index.copy())
predictions_test_df = pd.DataFrame(predictions_test, index = y_test.index.copy())
display(predictions_df)
display(predictions_test_df)

Unnamed: 0,0
299,3.830151
579,4.672019
619,5.032847
592,5.595583
416,5.770812
...,...
328,5.337780
496,5.055583
175,4.350245
473,5.837375


Unnamed: 0,0
239,4.488719
484,4.534221
304,4.190463
773,3.912099
172,4.278351
...,...
305,5.477011
646,5.169341
688,2.927577
521,4.831962


In [9]:
all_predictions = pd.concat([predictions_df, predictions_test_df], axis = 0)
all_predictions["consumption_predicted"] = pd.concat([predictions_df, predictions_test_df], axis = 0)
all_predictions = all_predictions.drop(all_predictions.columns[0],axis=1)
all_predictions['consumption_predicted'] = all_predictions['consumption_predicted'].astype(float)
all_predictions

Unnamed: 0,consumption_predicted
299,3.830151
579,4.672019
619,5.032847
592,5.595583
416,5.770812
...,...
305,5.477011
646,5.169341
688,2.927577
521,4.831962


In [10]:
data_with_pred = pd.merge(data, all_predictions, left_index=True, right_index=True)

display(data_with_pred)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,ac,rain,sun,consumtion_per_km,price_per_km,consumption_predicted
0,28.0,5.0,26,21.5,12,1,0,0,0,1.4000,1.932000,5.468501
1,12.0,4.2,30,21.5,13,1,0,0,0,0.5040,0.695520,4.737000
2,11.2,5.5,38,21.5,15,1,0,0,0,0.6160,0.850080,5.322694
3,12.9,3.9,36,21.5,14,1,0,0,0,0.5031,0.694278,4.452708
4,18.5,4.5,46,21.5,15,1,0,0,0,0.8325,1.148850,4.628230
...,...,...,...,...,...,...,...,...,...,...,...,...
771,16.0,3.7,39,24.5,18,2,0,0,0,0.5920,0.864320,3.958132
772,16.1,4.3,38,25.0,31,2,1,0,0,0.6923,1.010758,4.371067
773,16.0,3.8,45,25.0,19,2,0,0,0,0.6080,0.887680,3.912099
774,15.4,4.6,42,25.0,31,2,1,0,0,0.7084,1.034264,4.543431


# Conclusion

#### When it comes to the type of gas to use, it looks like using E10 leads to a higher consumption.
#### But as it is also quite a bit cheaper than SP98, it still is the better choice, because on average, you will get farther for the same amount of money.