# Importing libraries

In [4]:
!pip install ydata_profiling




[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Data loading and preprocessing

In [6]:
# Loading Data from a JSON File
with open('couriers_data.json', 'r') as file:
    data = pd.read_json(file)

In [7]:
# Profiling data
report = ProfileReport(data)
report.to_file('data_profile_report.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

From the report, it is evident that the data contains categorical features that need to be processed. To do this, you can use Label Encoding or one-hot encoding. In this task, the mode of transportation can be considered as ordinal variables.

In [8]:
# Handling Categorical Features
category_mapping = {
    "foot": 1,
    "bicycle": 2,
    "scooter": 3,
    "automobile": 4,
}

data['courier_transport'] = data['courier_transport'].map(category_mapping)

It is also visible from the report that we have two columns with timestamps that, according to the task conditions, we will no longer need after performing the required calculations. Let's perform the necessary calculations and drop the two timestamp columns.

In [9]:
# Transformation of timestamp columns
data['work_start'] = pd.to_datetime(data['work_start'])
data['work_finish'] = pd.to_datetime(data['work_finish'])

# Calculating the difference between 'work_finish' and 'work_start' in minutes.
data['hours_engaged'] = (data['work_finish'] - data['work_start']).dt.total_seconds() / 60

# Removing the two columns 'work_start' and 'work_finish'.
data.drop(['work_start', 'work_finish'], axis=1, inplace=True)

After loading and processing the data, we need to split it into features and targets.

In [10]:
# We split the data into features and targets.
X = data.drop('total_deliveries', axis=1)
y = data[['total_deliveries']]


Next, it is necessary to split the data into training and testing datasets in order to conduct the training and performance evaluation of the machine learning model.

In [11]:
# We split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, shuffle=False)

To ensure neutrality towards the weight of different features, we apply data scaling.

In [12]:
# We scale the data.
columns_to_scale = [name for name in X_train.columns if name != 'courier_transport']
scaler = StandardScaler()

scaler.fit(X_train[columns_to_scale])
X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

## Feature selection and creation.

To assess feature importance and select the best features for further training, we will train two models: Lasso and LassoCV. After comparing their results, we will be able to determine which features can be excluded from our dataset for more effective model training.

In [13]:
# Let's create an instance of the Lasso model:
lasso = Lasso()
lasso.fit(X_train, y_train)

print('Lasso coef', lasso.coef_)

y_pred = lasso.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Coefficient of determination  R2:', metrics.r2_score(y_test, y_pred))

Lasso coef [-0.          0.          3.3185418  -0.         -0.          0.91680957]
Mean Absolute Error: 2.6547052575996535
Root Mean Squared Error: 3.6618011933926153
Coefficient of determination  R2: 0.5532542199934868


In [14]:
# Let's create an instance of LassoCV with cross-validation:
lasso_cv = LassoCV()
lasso_cv.fit(X_train, y_train)

print('LassoCV coef', lasso_cv.coef_)

y_pred = lasso_cv.predict(X_test)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('Coefficient of determination  R2:', metrics.r2_score(y_test, y_pred))

LassoCV coef [ 0.          0.2633966   3.94543263 -0.08829676 -0.24219562  1.52772364]
Mean Absolute Error: 2.2554598370650445
Root Mean Squared Error: 3.447522622445247
Coefficient of determination  R2: 0.6040091195153425


  y = column_or_1d(y, warn=True)


Based on the results:

Lasso has an MAE (Mean Absolute Error) of approximately 2.710, an RMSE (Root Mean Squared Error) of approximately 3.880, and an R-squared (R2) of approximately 0.593.

LassoCV has an MAE of approximately 2.400, an RMSE of approximately 3.648, and an R2 of approximately 0.640.

From these metrics, we can conclude that LassoCV performs better, as it has lower MAE and RMSE values and a higher R2 value, indicating more accurate and explainable predictions.

However, it's worth noting that LassoCV shows only a slight improvement compared to Lasso, while Lasso removes a larger number of features. This can be useful for simplifying the training of future models and increasing their interpretability.

In [15]:
# Obtaining coefficients for each feature
feature_coefficients = lasso.coef_
print(lasso.coef_)

# We create a list of selected features.
selected_features = X_train.columns[feature_coefficients != 0]

# We create a mask for the selected features.
selected_feature_mask = feature_coefficients != 0

# We use the mask to select the features.
X_train_selected = X_train.iloc[:, selected_feature_mask]
X_test_selected = X_test.iloc[:, selected_feature_mask]

[-0.          0.          3.3185418  -0.         -0.          0.91680957]


## Visualization of Data

In [16]:
# Applying PCA to the Data
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_selected)

# Splitting the data into two subsets, for example, based on a condition
X_pca1 = X_pca[:5600,:]
X_pca2 = X_pca[5600:,:]

# Creating a scatter plot for the first subset (blue color)
plt.scatter(X_pca1[:, 0], X_pca1[:, 1], c='blue', cmap='viridis', label='Group 1')

# Creating a scatter plot for the second subset (red color)
plt.scatter(X_pca2[:, 0], X_pca2[:, 1], c='red', cmap='viridis', label='Group 2')

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.colorbar(label='Target Variable')
plt.title('PCA Visualization')
plt.legend()  # Adding a legend to label the groups.
plt.show()

  plt.scatter(X_pca1[:, 0], X_pca1[:, 1], c='blue', cmap='viridis', label='Group 1')
  plt.scatter(X_pca2[:, 0], X_pca2[:, 1], c='red', cmap='viridis', label='Group 2')
  plt.show()


On this graph, we can observe a linear relationship.

## Machine Learning Models for Regression

In [17]:
# Create a list of models
models = [
    LinearRegression(),# Example of a Linear Regression model
    Lasso(),  # Example of a Lasso regularization model
    Ridge(),  # Example of a Ridge regularization model
]

# We train and evaluate each model.
for model in models:
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f'Model: {model.__class__.__name__}')
    print(f'Mean Absolute Error: {mae:.2f}')
    print(f'Root Mean Squared Error: {rmse:.2f}')
    print(f'R-squared (R2): {r2:.2f}')
    print('-' * 40)


Model: LinearRegression
Mean Absolute Error: 2.37
Root Mean Squared Error: 3.53
R-squared (R2): 0.58
----------------------------------------
Model: Lasso
Mean Absolute Error: 2.65
Root Mean Squared Error: 3.66
R-squared (R2): 0.55
----------------------------------------
Model: Ridge
Mean Absolute Error: 2.37
Root Mean Squared Error: 3.53
R-squared (R2): 0.58
----------------------------------------
