# import des modules

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from joblib import dump

# import et nettoyage des données

In [13]:
# Load the bike-sharing dataset (day.csv).
# Source: https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset
# Data for the city of Washington DC, 2011 and 2012.

df = pd.read_csv(filepath_or_buffer='Bike-Sharing-Dataset/day.csv')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
# Dimensions of the loaded data as (rows, columns)
df.shape

(731, 16)

In [4]:
# Keep only the columns used in the rest of the notebook.
# The explicit .copy() makes `df` an independent frame, so the column
# assignments performed on it afterwards do not raise pandas'
# SettingWithCopyWarning (a column selection is flagged as a potential copy).
df = df[['dteday', 'holiday', 'weekday', 'weathersit', 'temp', 'hum', 'windspeed', 'cnt']].copy()
df.head()

Unnamed: 0,dteday,holiday,weekday,weathersit,temp,hum,windspeed,cnt
0,2011-01-01,0,6,2,0.344167,0.805833,0.160446,985
1,2011-01-02,0,0,2,0.363478,0.696087,0.248539,801
2,2011-01-03,0,1,1,0.196364,0.437273,0.248309,1349
3,2011-01-04,0,2,1,0.2,0.590435,0.160296,1562
4,2011-01-05,0,3,1,0.226957,0.436957,0.1869,1600


In [5]:
# Column clean-up: parse the date, rescale temp / hum / windspeed, and replace
# the weekday column with the day name derived from the parsed date.
# NOTE(review): the factors 41, 100 and 67 are presumably the maxima the
# dataset's normalised columns were divided by — confirm against the dataset README.
# .assign builds a new frame instead of writing columns into `df` in place,
# which avoids SettingWithCopyWarning when `df` comes from a column selection.
# Caution: re-running this cell multiplies the already rescaled columns again.
df = df.assign(
    dteday=pd.to_datetime(df['dteday'], format='%Y-%m-%d'),
    temp=df['temp'] * 41,
    hum=df['hum'] * 100,
    windspeed=df['windspeed'] * 67,
    # Callable so that it reads the dteday column parsed just above.
    weekday=lambda frame: frame['dteday'].dt.day_name(),
)
df.head()

Unnamed: 0,dteday,holiday,weekday,weathersit,temp,hum,windspeed,cnt
0,2011-01-01,0,Saturday,2,14.110847,80.5833,10.749882,985
1,2011-01-02,0,Sunday,2,14.902598,69.6087,16.652113,801
2,2011-01-03,0,Monday,1,8.050924,43.7273,16.636703,1349
3,2011-01-04,0,Tuesday,1,8.2,59.0435,10.739832,1562
4,2011-01-05,0,Wednesday,1,9.305237,43.6957,12.5223,1600


In [6]:
# One-hot encode the categorical weekday column (needed for modelling):
# the original column is removed and weekday_<name> indicator columns are
# appended after the remaining columns.
df = pd.get_dummies(df, columns=['weekday'], prefix='weekday')
df.head()

Unnamed: 0,dteday,holiday,weathersit,temp,hum,windspeed,cnt,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,2011-01-01,0,2,14.110847,80.5833,10.749882,985,0,0,1,0,0,0,0
1,2011-01-02,0,2,14.902598,69.6087,16.652113,801,0,0,0,1,0,0,0
2,2011-01-03,0,1,8.050924,43.7273,16.636703,1349,0,1,0,0,0,0,0
3,2011-01-04,0,1,8.2,59.0435,10.739832,1562,0,0,0,0,0,1,0
4,2011-01-05,0,1,9.305237,43.6957,12.5223,1600,0,0,0,0,0,0,1


In [7]:
# Columns used for modelling: the target, the columns to leave out, and
# every other column of df (in df's column order) as a feature.
prediction_col = 'cnt'
ignore_cols = ['dteday']
features_cols = [
    name
    for name in df.columns
    if name not in (prediction_col, *ignore_cols)
]

# entraînement d'un modèle de RF sur les données de 2011

In [8]:
# Training set: rows dated up to and including 2011-12-31
train = df.loc[df['dteday'] <= pd.Timestamp('2011-12-31')]

In [9]:
# Split the training rows into the feature matrix and the target vector
train_X = train.loc[:, features_cols]
train_y = train.loc[:, prediction_col]

In [10]:
# Fit a random forest regressor on the training data (random_state fixed to 0).
# fit() returns the estimator itself; it is kept as the last expression so the
# cell still displays the fitted model.
model = RandomForestRegressor(random_state=0).fit(train_X, train_y)
model

In [11]:
# Persist the fitted model to disk
dump(value=model, filename='model_rf_2011.joblib')

['model_rf_2011.joblib']

# entraînement d'un modèle d'arbre de régression

In [12]:
# Fit a decision tree regressor on the training data (random_state fixed to 0),
# then persist it to disk. Note that this rebinds `model`.
model = DecisionTreeRegressor(random_state=0).fit(train_X, train_y)
dump(value=model, filename='model_tree_2011.joblib')

['model_tree_2011.joblib']