In [1]:
# bibliotheque
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,StandardScaler,RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

import joblib

In [2]:
# Load the raw sales records and preview the first rows.
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,date,Unit,Quantity,Returned,Revenu,Region,Ville,ProductName,Categorie,Price,manufacturer
0,2023-10-05,634263.31,11,0,6976896.41,Est,Bertoua,Apple iPhone Model 103,Électronique,530754,Johnson & Johnson
1,2022-05-21,177400.67,16,0,2838410.72,Est,Bertoua,Samsung Galaxy Model 146,Électronique,168451,Johnson & Johnson
2,2023-01-26,113909.61,13,0,1480824.93,Est,Bertoua,LG Velvet Model 148,Électronique,89353,L'Oréal
3,2021-04-14,1909.67,6,0,11458.02,Est,Bertoua,Oxford Notebook 17,Fournitures,1647,Adidas
4,2023-06-24,3829.84,13,0,49787.92,Est,Bertoua,Oxford Notebook 30,Fournitures,3608,Philips


In [3]:
# Parse the date column, then derive month and year features from it
# (the day of month is not extracted).
parsed_dates = pd.to_datetime(df['date'])
df = df.assign(
    date=parsed_dates,
    Mois=parsed_dates.dt.month,
    Annee=parsed_dates.dt.year,
)

In [4]:
# Remove, in a single step, the columns that are not kept for modelling:
#   - "date": the raw date; month and year were already extracted into Mois / Annee.
#   - "Revenu": in the preview rows it equals Unit * Quantity, so it would presumably
#     leak the Quantity target — TODO confirm on the full data.
#   - "Returned", "Price": dropped by the original analysis; they are not used as features.
df = df.drop(columns=["date", "Returned", "Price", "Revenu"])

In [8]:
# Check which columns remain after the drops.
df.columns

Index(['Unit', 'Quantity', 'Region', 'Ville', 'ProductName', 'Categorie',
       'manufacturer', 'Mois', 'Annee'],
      dtype='object')

In [9]:
# Collapse the data to one row per (city, product, month, year).
# Quantity is summed over the group; every other column keeps the first value
# seen in the group.
# NOTE(review): "first" assumes Unit, Categorie, Region and manufacturer are
# constant within a group — confirm, especially for Unit.
group_keys = ["Ville", "ProductName", "Mois", "Annee"]
aggregations = {
    "Quantity": "sum",
    "Categorie": "first",
    "Unit": "first",
    "Region": "first",
    "manufacturer": "first",
}
df_grouper = df.groupby(group_keys, as_index=False).agg(aggregations)

In [10]:
# Only the last expression of a cell is rendered automatically, so the preview
# must be displayed explicitly; otherwise the head() result is silently discarded.
display(df_grouper.head())
df_grouper.shape

(85479, 9)

In [11]:
# Persist the aggregated table (without the index) so it can be reloaded later.
df_grouper.to_csv("data_grouper.csv",index=False)

In [12]:
# Reload the aggregated table from the CSV file written by the previous cell.
df_final = pd.read_csv("data_grouper.csv")

In [13]:
# Sanity check: the reloaded table should have the same shape as df_grouper.
df_final.shape

(85479, 9)

In [14]:
def sample_city(frame, city, n=3000, seed=42):
    """Return a reproducible random sample of the rows of ``frame`` for one city.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a "Ville" column.
    city : str
        Value of "Ville" to keep.
    n : int, default 3000
        Number of rows to draw (without replacement).
    seed : int, default 42
        Seed passed to ``DataFrame.sample`` as ``random_state``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when the city has fewer than ``n`` rows.
    """
    return frame[frame["Ville"] == city].sample(n=n, random_state=seed)


# Draw the same number of rows for each of the six cities so the modelling
# set is balanced across cities.
df_yde = sample_city(df_final, "Yaounde")
df_dla = sample_city(df_final, "Douala")
df_lmb = sample_city(df_final, "Limbe")
df_bfs = sample_city(df_final, "Bafoussam")
df_Ds = sample_city(df_final, "Dschang")
df_bt = sample_city(df_final, "Bertoua")

# From here on df_final is the balanced subsample (6 cities x 3000 rows),
# no longer the full aggregated table.
df_final = pd.concat([df_yde,df_dla,df_lmb,df_bfs,df_Ds,df_bt])

In [21]:
# Size of the balanced subsample built from the per-city samples.
df_final.shape

(18000, 9)

In [22]:

def preprocessing():
    """Build the full modelling pipeline: column preprocessing followed by a decision tree.

    Despite its name, the returned object is a complete estimator, not just a
    preprocessor.

    Returns
    -------
    Pipeline
        Two steps:
        - "preprocessing": a ColumnTransformer applying RobustScaler to the
          numeric columns and OneHotEncoder to the categorical columns;
        - "regressor": a DecisionTreeRegressor seeded with random_state=42.

    Notes
    -----
    Columns absent from both lists (e.g. "Region") are not given to either
    transformer; presumably ColumnTransformer's default remainder handling
    drops them — confirm.
    """
    numeric_cols = ['Unit', 'Mois', 'Annee']
    categorical_cols = ['ProductName', 'Categorie', 'manufacturer', 'Ville']

    # Numeric branch: scaling only.
    scale_numeric = Pipeline([('scaler', RobustScaler())])

    # Categorical branch: dense one-hot encoding; categories unseen during fit
    # are ignored at transform time instead of raising.
    encode_categorical = Pipeline([
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    column_steps = [
        ('num', scale_numeric, numeric_cols),
        ('cat', encode_categorical, categorical_cols),
    ]

    model_steps = [
        ('preprocessing', ColumnTransformer(column_steps)),
        ('regressor', DecisionTreeRegressor(random_state=42)),
    ]
    return Pipeline(model_steps)

In [23]:
# Untrained pipeline (preprocessing + decision tree) to be tuned later.
pipeline = preprocessing()

# Features are every column except the target; the target is the quantity sold.
features = df_final.drop(columns="Quantity")
target = df_final["Quantity"]

# 80/20 split, stratified on the city so each city keeps the same share
# in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=df_final["Ville"],
    random_state=42,
)

In [24]:
def train_model_CV(model,param,xtrain,ytrain,xtest,ytest):
    """Grid-search ``model`` over ``param`` and score the best estimator on a test set.

    Parameters
    ----------
    model : estimator
        Estimator (or pipeline) handed to GridSearchCV.
    param : dict
        Parameter grid handed to GridSearchCV.
    xtrain : DataFrame-like
        Training features. Must expose a 'Ville' column: the cross-validation
        folds are stratified on it.
    ytrain : array-like
        Training target.
    xtest, ytest : array-like
        Held-out features / target used for the final metrics.

    Returns
    -------
    tuple
        ``(best_model, ypred, square_error, abs_error, score_r2)`` where
        ``best_model`` is ``grid.best_estimator_``, ``ypred`` its predictions
        on ``xtest``, and the last three are the mean squared error, mean
        absolute error and R² of those predictions against ``ytest``.
    """
    print("", model)

    # The target is continuous, so the folds are stratified on the city
    # instead and passed to GridSearchCV as explicit (train, test) index pairs.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Build the folds from the xtrain argument (not from a notebook-level
    # X_train global) so the indices always match the data being fitted.
    folds = list(cv.split(xtrain, xtrain['Ville']))

    grid = GridSearchCV(model, param, cv=folds, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid.fit(xtrain, ytrain)
    # Best parameters found by the search
    print("Meilleurs paramètres :", grid.best_params_)

    # Best model, evaluated on the held-out data
    best_model = grid.best_estimator_
    ypred = best_model.predict(xtest)

    square_error = mean_squared_error(ytest, ypred)
    abs_error = mean_absolute_error(ytest, ypred)
    score_r2 = r2_score(ytest, ypred)
    return best_model, ypred, square_error, abs_error, score_r2

In [25]:
# Hyper-parameter grids, keyed by the plain estimator argument names.
param_random = {
    "n_estimators": [100, 400, 300],
    "max_depth": [30, 20, 10, None],
    "min_samples_split": [10, 5, 2],
    "max_features": ["sqrt", "log2", None, 0.5],
}
param_svc = {
    "C": [0.1, 1, 10, 100],
    "kernel": ["linear", "poly", "sigmoid", "rbf"],
    "gamma": ["scale", "auto", 0.01, 0.05],
}
param_reg = {"fit_intercept": [True, False]}

# Grid for the decision tree; it already uses the "regressor__" prefix because
# it targets the "regressor" step of the pipeline.
param_tree = {
    "regressor__max_depth": [20, 30, 50, 40],  # earlier grid: [None, 5, 10, 20]
    "regressor__min_samples_split": [20, 10, 5],
    "regressor__max_features": ["log2", "sqrt", 3],
    "regressor__min_samples_leaf": [7, 10, 15, 5],
}


def _as_pipeline_grid(estimator, grid):
    """Pair ``estimator`` with ``grid`` in a form usable on the raw feature table.

    The estimator replaces the "regressor" step of a fresh ``preprocessing()``
    pipeline, so it receives scaled / one-hot encoded inputs instead of raw
    string columns. The grid keys get the "regressor__" prefix needed to reach
    that step.

    Returns the ``(pipeline, prefixed_grid)`` tuple.
    """
    model = preprocessing().set_params(regressor=estimator)
    prefixed = {
        (key if key.startswith("regressor__") else f"regressor__{key}"): values
        for key, values in grid.items()
    }
    return model, prefixed


# (estimator, grid) pairs, each element ready to be unpacked into train_model_CV.
svm_model = _as_pipeline_grid(SVR(), param_svc)
forest_model = _as_pipeline_grid(RandomForestRegressor(random_state=42), param_random)
tree_model = ( pipeline , param_tree )
reg_model = _as_pipeline_grid(LinearRegression(), param_reg)

models =[ tree_model,forest_model , reg_model,svm_model]

In [26]:

# Bare estimators with default hyper-parameters, collected in models2.
# NOTE(review): these assignments rebind svm_model, forest_model, tree_model and
# reg_model, which held (estimator, grid) tuples after the previous cell —
# confirm the shadowing is intended. models2 is not used in the cells that follow.
svm_model = SVR()
forest_model = RandomForestRegressor(random_state=42)
tree_model =  DecisionTreeRegressor(random_state=42)
reg_model = LinearRegression()
models2 =[ forest_model, tree_model, reg_model,svm_model]

In [27]:
# Tuned estimators are collected here, one per (estimator, grid) pair.
Models = []

# tree_model is rebuilt as an (estimator, grid) pair before the search.
tree_model = (pipeline, param_tree)

for model in [tree_model]:
    mod, para = model
    results = train_model_CV(
        model=mod,
        param=para,
        xtrain=X_train,
        ytrain=y_train,
        xtest=X_test,
        ytest=y_test,
    )
    best_model, ypred, square_error, abs_error, score_r2 = results
    print('erreur absolue au test', abs_error)

    Models.append(best_model)

 Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   RobustScaler())]),
                                                  ['Unit', 'Mois', 'Annee']),
                                                 ('cat',
                                                  Pipeline(steps=[('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse_output=False))]),
                                                  ['ProductName', 'Categorie',
                                                   'manufacturer',
                                                   'Ville'])])),
                ('regressor', DecisionTreeRegressor(random_state=42))])
Meilleurs paramètres : {'regressor__m

In [28]:

# Predictions of the tuned model on both splits
y_pred = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

# Training-set metrics (r2 is computed but not printed below)
mae = mean_absolute_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

# Held-out metric
mae_test = mean_absolute_error(y_test, y_pred_test)

# Report train vs. test MAE side by side to expose over-fitting
report = (
    ("Erreur absolue moyenne (MAE) à l'entraînement :", mae),
    ("Erreur absolue moyenne (MAE) au test :", mae_test),
)
for label, value in report:
    print(label, value)

Erreur absolue moyenne (MAE) à l'entraînement : 60.538233973349875
Erreur absolue moyenne (MAE) au test : 66.29633727294008


In [29]:
# Output file for the fitted model. This is a bare filename, so it is resolved
# relative to the current working directory (not a full path to a Drive folder).
chemin = 'meilleurModeles.joblib'
joblib.dump(best_model, chemin)

['meilleurModeles.joblib']