In [1]:
import requests
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer

# Configuration pour afficher toutes les colonnes
pd.set_option('display.max_columns', 50)

In [2]:
import glob

# Chemin vers le dossier contenant les fichiers CSV
folder_path = "data/"

# Modèle de recherche pour les fichiers CSV
file_pattern = "*.csv"

# Liste pour stocker les DataFrames
datasets = []

# Rechercher tous les fichiers correspondant au modèle de recherche
csv_files = glob.glob(folder_path + file_pattern)

# Lire chaque fichier CSV et ajouter le DataFrame à la liste
for file in csv_files:
    df = pd.read_csv(file)
    datasets.append(df)

In [3]:
len(datasets)

15

In [4]:
# Concaténer tous les DataFrames de la liste "datasets"
concatenated_df = pd.concat(datasets, ignore_index=True)

In [5]:
concatenated_df.shape

(21910, 30)

In [6]:
concatenated_df.columns

Index(['stationcode', 'name', 'nom_arrondissement_communes', 'capacity',
       'latitude', 'longitude', 'is_station_open', 'is_installed',
       'numdocksavailable', 'numbikesavailable', 'mechanical', 'ebike',
       'is_renting', 'is_returning', 'duedate', 'year', 'month', 'day_name',
       'day', 'hour', 'minute', 'second', 'time_period', 'season',
       'is_holiday', 'availability_rate', 'utilization_rate', 'theo_capacity',
       'theo_utilization_rate', 'theo_availability_rate'],
      dtype='object')

In [7]:
target = "theo_utilization_rate"

features = ['nom_arrondissement_communes', 'latitude', 'longitude',
             'numdocksavailable', 'mechanical', 'ebike',
             'is_renting', 'hour', 'day', 'day_name',
              'minute', 'second', 'time_period']

dataset = concatenated_df.dropna().reset_index(drop=True)
dataset = dataset[(dataset.is_station_open==1) & (dataset.is_installed==1)]

X = dataset[features]
y = dataset[target]


In [8]:
X.describe()

Unnamed: 0,latitude,longitude,numdocksavailable,mechanical,ebike,is_renting,hour,day,minute,second
count,12889.0,12889.0,12889.0,12889.0,12889.0,12889.0,12889.0,12889.0,12889.0,12889.0
mean,48.858135,2.34103,19.767243,6.874699,3.920708,0.990457,15.31329,16.22011,39.154395,29.52184
std,0.030493,0.056427,12.408197,8.56138,3.758138,0.097225,3.886512,0.858401,3.665489,16.902677
min,48.764615,2.165597,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0
25%,48.837668,2.302569,10.0,1.0,1.0,1.0,16.0,16.0,38.0,15.0
50%,48.858367,2.343698,18.0,3.0,3.0,1.0,16.0,16.0,40.0,30.0
75%,48.879331,2.377885,27.0,10.0,5.0,1.0,17.0,16.0,41.0,43.0
max,48.951432,2.538242,66.0,58.0,40.0,1.0,23.0,30.0,55.0,59.0


In [9]:
# Divid dataset into Train and test sets
print("Dividing into train and test sets ...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("... Done.")
print()

Dividing into train and test sets ...
... Done.



In [10]:
features

['nom_arrondissement_communes',
 'latitude',
 'longitude',
 'numdocksavailable',
 'mechanical',
 'ebike',
 'is_renting',
 'hour',
 'day',
 'day_name',
 'minute',
 'second',
 'time_period']

In [11]:
# Création du pipeline pour les variables catégorielles
categorical_transformer = OneHotEncoder(categories='auto', drop='first', handle_unknown='ignore')

# Création du pipeline pour les variables ordinales
ordinal_transformer = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Utilisation du ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, ['nom_arrondissement_communes', 'is_renting', "time_period", 'day_name']),
        ('ord', ordinal_transformer, ['numdocksavailable', 'mechanical', 'ebike', 'day', 'hour', 'minute', 'second'])
    ]
)

In [12]:
# Preprocessings on train set
print("Performing preprocessing on train set ...")
print(X.head())
X_train = preprocessor.fit_transform(X_train)
print("... Done.")

# Preprocessing on test set
print("Performing preprocessing on test set ...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print("... Done.")
print(X_test[:2,:])

Performing preprocessing on train set ...
  nom_arrondissement_communes   latitude  longitude  numdocksavailable  \
0                       Paris  48.835093   2.353468                 17   
1                       Paris  48.856452   2.334852                 17   
2                       Paris  48.853148   2.326391                 10   
3             Vitry-sur-Seine  48.796288   2.417212                 12   
4                       Paris  48.880222   2.285468                 36   

   mechanical  ebike  is_renting  hour  day day_name  minute  second  \
0          24      6           1    18   16   Sunday      40      14   
1           2      0           1    18   16   Sunday      36      39   
2           2      3           1    18   16   Sunday      37      42   
3          12      6           1    18   16   Sunday      38      11   
4           0      2           1    18   16   Sunday      40      32   

  time_period  
0     Evening  
1     Evening  
2     Evening  
3     Evening  


In [13]:
# Train model
print("Train model ...")
regressor = LinearRegression()
regressor.fit(X_train, y_train)
print("... Done.")

Train model ...
... Done.


In [14]:
print("R2 score on training set: ", regressor.score(X_train, y_train))
print("R2 score on test set: ", regressor.score(X_test, y_test))

R2 score on training set:  0.8945386873203037
R2 score on test set:  0.8915556090079564


In [15]:
print("Random Forest with default hyperpameters ...")
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)
print("... Done.")

Random Forest with default hyperpameters ...
... Done.


In [16]:
print("R2 score on training set: ", regressor.score(X_train, y_train))
print("R2 score on test set: ", regressor.score(X_test, y_test))

R2 score on training set:  0.9998715742341968
R2 score on test set:  0.9911297133217641
