In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn.metrics as metrics

from sklearn import datasets
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from matplotlib import pyplot
from sklearn.metrics import silhouette_score
from PIL import Image
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LogisticRegression

get_ipython().run_line_magic('matplotlib', 'inline')

In [None]:
# Load data
# Both source files are read from the notebook's working directory.
csv_source = "measurements.csv"
excel_source = "measurements2.xlsx"

df = pd.read_csv(csv_source)

# NOTE(review): df01 is not referenced by any other cell visible here.
df01 = pd.read_excel(excel_source)

In [None]:
# Cleaning & EDA
# Print the frame summary (columns, non-null counts, dtypes) before any
# cleaning so the effect of the conversion cells below can be compared.
df.info()

In [None]:
# Data type conversion
# Non-null count per column, captured before the conversion.
column_value_counts = df.count()
columns_to_convert = ["distance", "consume", "temp_inside"]

for column in columns_to_convert:
    values = df[column]
    if not pd.api.types.is_numeric_dtype(values):
        # Text columns: normalise a decimal comma ("21,5" -> "21.5") first.
        # Without this, errors='coerce' would silently turn every such value
        # into NaN, and the later mean imputation would replace real
        # measurements with the column mean.
        # NOTE(review): assumes commas in these columns are decimal
        # separators, not thousands separators - confirm against the CSV.
        values = values.astype(str).str.replace(",", ".", regex=False)
    # Anything still unparseable becomes NaN. No integer downcast is applied:
    # a downcast to a tiny integer type could overflow when columns are
    # multiplied later in the notebook.
    df[column] = pd.to_numeric(values, errors='coerce')

df.info()

In [None]:
# Handling null values
# These columns are removed outright instead of being imputed.
# NOTE(review): presumably because they are mostly empty - confirm with df.info().
columns_to_delete = ["specials", "refill liters", "refill gas"]
df = df.drop(columns_to_delete, axis=1)

In [None]:
# Filling in missing values
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a text
# column (gas_type is still present at this point).
column_means = df.mean(numeric_only=True)

# fillna with a Series fills each column with the value whose label matches
# it, so non-numeric columns are left untouched. Rebinding instead of using
# inplace=True avoids mutating the frame object in place.
df = df.fillna(column_means)

In [None]:
# Adding costs and liter values
# Work on an explicit copy: pd.DataFrame(df) does not guarantee independent
# data, and on some pandas versions columns added to the result also appear
# in df, which the modelling cells read later.
df_final = df.copy()

# NOTE(review): if 'consume' is expressed per 100 distance units (e.g.
# L/100km), this product must be divided by 100 to yield litres - confirm the
# units of the source data before relying on this column.
df_final['Liter'] = df_final['consume'] * df_final['distance']

In [None]:
# Defining values by gas type
# Multiplier applied to the 'Liter' column for each gas type.
# NOTE(review): presumably a price per litre - confirm the unit/currency.
gas_ratio = dict(E10=1.38, SP98=1.46)

In [None]:
def apply_multiplier(row):
    """Return the row's 'Liter' value scaled by the multiplier of its gas type.

    The multiplier is looked up in the module-level ``gas_ratio`` mapping
    using ``row['gas_type']``; a gas type missing from the mapping uses 1.
    """
    liters = row['Liter']
    fuel = row['gas_type']
    factor = gas_ratio[fuel] if fuel in gas_ratio else 1
    return liters * factor

In [None]:
# Applying the function
# The cost is computed row by row before the new column is attached.
cost_per_row = df_final.apply(apply_multiplier, axis="columns")
df_final['Cost by Type of Gas'] = cost_per_row

# Persist the enriched frame without the index column.
df_final.to_csv("gas_analysis_cost.csv", index=False)

# Rows whose distance is exactly 28.
is_distance_28 = df_final['distance'].eq(28)
filtered_df = df_final[is_distance_28]

In [None]:
# Preparing for modeling ## This is a regression model
# Numeric columns (renumbered from 0) and text columns are kept in two frames.
numerical_df = df.select_dtypes(include=[np.number]).reset_index(drop=True)
categorical_df = df.select_dtypes(include=['object'])

In [None]:
# Checking for correlation
corr = numerical_df.loc[:, ['distance', 'speed', 'temp_inside', "temp_outside"]].corr()

# Boolean mask that hides the strict upper triangle (k=1 keeps the diagonal
# visible). Building the mask from the correlation values themselves would
# leave any cell whose correlation is exactly 0 unmasked.
matrix = np.triu(np.ones(corr.shape, dtype=bool), k=1)
sns.heatmap(corr, annot=True, mask=matrix)

## There's no high correlation between features

In [None]:
# Categorical data encoding
dummy_nominals = ["gas_type"]
# dtype=int yields 0/1 indicator columns on every pandas version (the default
# is bool on pandas >= 2), matching the 0/1 flags used for prediction input.
categorical_df = pd.get_dummies(categorical_df, columns=dummy_nominals, dtype=int)

# numerical_df was renumbered with reset_index(drop=True); renumber the
# categorical frame the same way so concat(axis=1) pairs rows by position
# and cannot introduce NaN rows through index misalignment.
df_model = pd.concat(
    [numerical_df, categorical_df.reset_index(drop=True)],
    axis=1,
)

df_model.to_csv("analysis_best_gas.csv", index=False)

In [None]:
# X/Y Train-Test Split
# Predictors: trip descriptors plus the two gas-type indicator columns.
X = df_model.loc[:, [
    'distance',
    'speed',
    'temp_inside',
    'temp_outside',
    'AC',
    'rain',
    'sun',
    'gas_type_E10',
    'gas_type_SP98',
]]
# Target: the 'consume' column.
y = df_model['consume']

# 70/30 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Scaling
# The scaler is fitted on the training split only and then applied to both
# splits, so no information from the test rows enters the fit.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaler_train = scaler.transform(X_train)
X_scaler_test = scaler.transform(X_test)

# Linear regression on the scaled features.
model_lr = LinearRegression()
model_lr.fit(X_scaler_train, y_train)

y_predictive = model_lr.predict(X_scaler_test)

# Hold-out metrics; printed so the evaluation is visible in the cell output.
mse = mean_squared_error(y_test, y_predictive)
r2 = r2_score(y_test, y_predictive)
print(f"Test MSE: {mse:.4f}")
print(f"Test R^2: {r2:.4f}")

coefficients = model_lr.coef_

# Feature names are taken from the training frame so they always line up
# with the coefficient order, instead of repeating the list by hand.
a = {
    "independent variables": list(X_train.columns),
    'coefficients': coefficients
}

coefficients_analysis = pd.DataFrame(a)

# Sorted by signed coefficient, largest positive first.
df_orden_importances = coefficients_analysis.sort_values(by='coefficients', ascending=False)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='coefficients', y='independent variables', data=df_orden_importances, ax=ax)
ax.set_xlabel('coefficients')
ax.set_ylabel('Independent Variables')
ax.set_title('Importance of variables in the model')
plt.show()

In [None]:
# Take user input and predict consumption
def _ask_number(prompt, cast=float):
    """Prompt until the reply parses with ``cast`` and return the parsed value.

    A decimal comma is accepted ("21,5" is read as 21.5). Only parse errors
    trigger a retry; any other exception propagates.
    """
    while True:
        reply = input(prompt)
        try:
            return cast(reply.strip().replace(",", "."))
        except ValueError:
            print(f"Could not read {reply!r} as a number, please try again.")


def _ask_flag(prompt):
    """Prompt until the reply is exactly 0 or 1 and return it as an int."""
    while True:
        answer = _ask_number(prompt, cast=int)
        if answer in (0, 1):
            return answer
        print("Please answer 1 for yes or 0 for no.")


# Continuous quantities accept decimals; yes/no questions accept only 0 or 1.
distance = _ask_number("Enter distance: ")
rain = _ask_flag("Is the day going to be rainy? (1 for yes, 0 for no): ")
sun = _ask_flag("Is the day going to be sunny? (1 for yes, 0 for no): ")
speed = _ask_number("What is the speed?: ")
ac = _ask_flag("Are you using air-conditioning? (1 for yes, 0 for no): ")
e10 = _ask_flag("Are you using E10? (1 for yes, 0 for no): ")
sp98 = _ask_flag("Are you using SP98? (1 for yes, 0 for no): ")
temp_inside = _ask_number("What was the temperature inside the car?: ")
temp_outside = _ask_number("What was the temperature outside the car?: ")

# The two gas-type answers feed separate indicator columns; warn when they
# do not describe exactly one gas type.
if e10 + sp98 != 1:
    print("Warning: expected exactly one of E10 / SP98 to be 1.")

In [None]:
# Create a DataFrame with the correct order of columns
# The answers are collected in the column order used for the model inputs,
# then wrapped into a one-row frame.
example_values = {
    "distance": distance,
    "speed": speed,
    "temp_inside": temp_inside,
    "temp_outside": temp_outside,
    "AC": ac,
    "rain": rain,
    "sun": sun,
    "gas_type_E10": e10,
    "gas_type_SP98": sp98,
}
X_example = pd.DataFrame({name: [value] for name, value in example_values.items()})

In [None]:
# Transform the input data using the fitted scaler
# (the scaler was fitted on X_train, so the example is rescaled with the
# training data's per-column ranges, not with its own values)
X_example_scaled = scaler.transform(X_example)

In [None]:
# Make the prediction
y_example = model_lr.predict(X_example_scaled)

# predict() returns one value per input row and X_example has a single row,
# so take that element explicitly. Converting the whole array with int() is
# deprecated in recent NumPy and also cut the decimals off the prediction.
print("Consume:", round(float(y_example[0]), 2))