In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [65]:
data = pd.read_csv("data/Prediction.csv")
data.head()

Unnamed: 0,cart_type,entreprise_email,entreprise_name,entreprise_city,day_publication,month_publication,hour_publication,quantity_published,quantity_reserved,quantity_take
0,frais,entreprise14@example.com,Carrefour,Nice,4,6,0,59,59,59
1,sec,entreprise13@example.com,Monoprix,Lyon,3,9,0,111,110,105
2,frais,entreprise7@example.com,Carrefour,Toulouse,4,3,0,194,155,155
3,frais,entreprise3@example.com,Auchan,Marseille,6,12,0,350,344,336
4,mixte,entreprise14@example.com,Carrefour,Nice,1,2,0,96,26,26


In [66]:
# Ajouter une colonne cible : `taken`
data['taken'] = (data['quantity_take']/data['quantity_published'] > 0.75).astype(int)

In [67]:
# Encodage des variables catégorielles
categorical_cols = ['cart_type', 'entreprise_email', 'entreprise_name', 'entreprise_city']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

In [68]:
# Variables explicatives et cible
X = data[['cart_type', 'entreprise_email', 'entreprise_name', 'entreprise_city', 'day_publication', 'month_publication', 'hour_publication', 'quantity_published', 'quantity_reserved']]
y = data['taken']

# Diviser en ensemble d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
# Choisir et entraîner un modèle
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [70]:
# Prédictions
y_pred = model.predict(X_test)

# Évaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

Accuracy: 0.885
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.58      0.73        53
           1       0.87      0.99      0.93       147

    accuracy                           0.89       200
   macro avg       0.92      0.79      0.83       200
weighted avg       0.90      0.89      0.87       200

ROC-AUC Score: 0.8922474650237453


In [71]:
# Exemple de cart à tester
new_cart = {
    "cart_type": "frais",
    "entreprise_name": "Carrefour",
    "entreprise_city": "Nice",
    "entreprise_email": "entreprise14@example.com",
    "day_publication": 2,
    "month_publication": 7,
    "hour_publication": 10,
    "quantity_published": 70,
    "quantity_reserved": 23
}

def predict_if_will_be_take(cart):
    # Encodage des colonnes catégorielles
    for col in ['cart_type', 'entreprise_email', 'entreprise_name', 'entreprise_city']:
        if cart[col] not in label_encoders[col].classes_:
            raise ValueError(f"La valeur '{cart[col]}' pour la colonne '{col}' n'existe pas dans l'encodage.")
        cart[col] = label_encoders[col].transform([cart[col]])[0]
    
    # Préparer les données pour le modèle
    test_input = np.array([[cart['cart_type'], 
                            cart['entreprise_name'], 
                            cart['entreprise_city'], 
                            cart['entreprise_email'], 
                            cart['day_publication'], 
                            cart['month_publication'], 
                            cart['hour_publication'], 
                            cart['quantity_published'],
                            cart['quantity_reserved']]])
    
    # Prédire la probabilité que la cart soit prise
    probability = model.predict_proba(test_input)[0][1]
    print(f"Probabilité que 75% de la quantité disponible soit prise : {probability:.2%}")

predict_if_will_be_take(new_cart)


Probabilité que 75% de la quantité disponible soit prise : 46.00%


