# Étape 1 : Importer et Explorer les Données
Charger les données, supprimer les colonnes inutiles, renommer les colonnes, convertir les dates et vérifier les données.

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import pickle

# Load the dataset from the local CSV file
data = pd.read_csv('avocado.csv')

# Drop unnecessary columns
data = data.drop(columns=['Unnamed: 0', 'Total Volume', 'Total Bags'])

# Give the numerically named columns descriptive names
data = data.rename(columns={'4046': 'Quality1', '4225': 'Quality2', '4770': 'Quality3'})

# Parse the date strings into datetime values
data['Date'] = pd.to_datetime(data['Date'])

# Check for missing values: report per-column counts only when some are
# present, so a clean dataset produces no extra output
missing_counts = data.isna().sum()
if missing_counts.any():
    print("Missing values per column:")
    print(missing_counts[missing_counts > 0])

# Remove exact duplicate rows
data = data.drop_duplicates()

# Preview the cleaned data
print(data.head())

# Feature groups, by the kind of preprocessing each one receives.
# Columns listed in neither group are not routed to any transformer.
numeric_features = [
    'Quality1',
    'Quality2',
    'Quality3',
    'Small Bags',
    'Large Bags',
    'XLarge Bags',
    'year',
]
categorical_features = ['type', 'region']

# Numeric columns are standardized
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; categories not seen during
# fitting are ignored at transform time instead of raising an error
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each feature group to its transformer
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(column_transformers)

# Full pipeline: preprocessing followed by the XGBoost regressor
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor()),
])

# Separate the target (AveragePrice) from the remaining columns
y = data['AveragePrice']
X = data.drop('AveragePrice', axis=1)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the regressor on the training portion
model_pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk with pickle
with open('avocado_model.pkl', 'wb') as model_file:
    pickle.dump(model_pipeline, model_file)

        Date  AveragePrice  Quality1   Quality2  Quality3  Small Bags  \
0 2015-12-27          1.33   1036.74   54454.85     48.16     8603.62   
1 2015-12-20          1.35    674.28   44638.81     58.33     9408.07   
2 2015-12-13          0.93    794.70  109149.67    130.50     8042.21   
3 2015-12-06          1.08   1132.00   71976.41     72.58     5677.40   
4 2015-11-29          1.28    941.48   43838.39     75.78     5986.26   

   Large Bags  XLarge Bags          type  year  region  
0       93.25          0.0  conventional  2015  Albany  
1       97.49          0.0  conventional  2015  Albany  
2      103.14          0.0  conventional  2015  Albany  
3      133.76          0.0  conventional  2015  Albany  
4      197.69          0.0  conventional  2015  Albany  
