In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the forage table from the Excel workbook; `header=1` takes the column
# names from the second row of the sheet.
# Not applied here: dtype={'Libellé 4': 'string'}
fourrage = pd.read_excel(
    "../INRA2018_TablesFourrages_etude_prediction_20241121.xlsx",
    header=1,
    engine="openpyxl",
)

In [3]:
# Preview the first two rows of the loaded table.
fourrage.head(2)

Unnamed: 0,No,Etat,Code INRA,Libellé 0,Libellé 1,Libellé 2,Libellé 3,Libellé 4,MS,UFL,...,GlyDI,AlaDI,TyrDI,C14:0,C16:0,C16:1,C18:0,C18:1,C18:2,C18:3
0,1,1,FV0010,FOURRAGES VERTS,"PRAIRIES PERMANENTES, PLAINE (NORMANDIE)",1er cycle,"15-25 avril, déprimage, ST = 172°C",,15.5,1.07,...,6.94,6.87,4.13,1.0,15.68,1.4,2.3,4.04,16.34,54.5
1,2,1,FV0020,FOURRAGES VERTS,"PRAIRIES PERMANENTES, PLAINE (NORMANDIE)",1er cycle,"1-10 mai, pâturage, ST = 298°C",,16.6,1.02,...,6.93,6.88,4.14,1.0,16.36,1.4,2.38,4.19,16.89,52.75


In [3]:
# Inspect the Python type of 'Libellé 4' on row label 1: the cell is empty in
# the preview above, so pandas stores it as NaN, which is a float.
type(fourrage.loc[1,'Libellé 4'])

float

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.dummy import DummyRegressor
from xgboost import XGBRegressor
# import prince 

%load_ext autoreload
%autoreload 1
%aimport utils

# Column groups: free-text label columns, numeric feature columns, and the
# columns to predict.
text_cols = ['Libellé 0', 'Libellé 1', 'Libellé 2' ,'Libellé 3', 'Libellé 4']
num_cols = ['MS', 'MM', 'MAT', 'CB', 'NDF', 'ADF', 'EE']
targets = ['UFL', 'UFV', 'BPR','PDI', 'PDIA']

#X = fourrage[cat_cols]
#print(utils.SplitTransformer().fit(X).transform(X)['Libellé 0-1'].unique())
#bug identified, only 1 comma in first columns...


# Label columns: project-specific split step from `utils`, then one-hot
# encoding. handle_unknown='ignore' encodes categories unseen at fit time as
# all-zero rows instead of raising at predict time.
text_pipeline = Pipeline([
    ('spliting libelle', utils.SplitTransformer()),
    ("one hot encoding", OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    #("MCA",prince.MCA())
])

# Numeric columns: standardize, then fill missing values with KNNImputer.
num_pipeline = Pipeline([
    ('num_standardization', StandardScaler()),
    ('num_imputation', KNNImputer())
])

# Full preprocessing: text and numeric branches side by side.
preprocessing = ColumnTransformer([
    ("categorical_preproc", text_pipeline, text_cols),
    ("num_preproc", num_pipeline, num_cols)
])

# Numeric-only preprocessing: columns outside `num_cols` are dropped
# (ColumnTransformer's default remainder behaviour).
num_only_preprocessing = ColumnTransformer([
    ("num_preproc", num_pipeline, num_cols)
])

# NOTE: `model`, `model_xgb` and `model_dummy` are built around the same
# `preprocessing` object.

# Baseline: one gradient-boosting regressor per target.
model = Pipeline([
    ('preprocessing', preprocessing),
    ("regressor", MultiOutputRegressor(GradientBoostingRegressor(random_state=42), n_jobs=2))
])

# One XGBoost regressor per target, on text + numeric features.
model_xgb = Pipeline([
    ('preprocessing', preprocessing),
    ('regressor', MultiOutputRegressor(XGBRegressor(random_state=42), n_jobs=2))
])

# Reference model predicting the training mean of each target.
model_dummy = Pipeline([
    ('preprocessing', preprocessing),
    ("regressor", MultiOutputRegressor(DummyRegressor(strategy='mean'), n_jobs=2))
])

# One XGBoost regressor per target, on numeric features only.
# `strategy` is a DummyRegressor option, not an XGBRegressor parameter; the
# estimator is seeded like the one in `model_xgb` instead.
model_xgb_num_only = Pipeline([
    ('preprocessing', num_only_preprocessing),
    ("regressor", MultiOutputRegressor(XGBRegressor(random_state=42), n_jobs=2))
])


# Ask the text branch to return pandas DataFrames from transform().
text_pipeline.set_output(transform='pandas')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Display the numeric-only XGBoost pipeline.
model_xgb_num_only

In [11]:
# Display the dummy (mean-strategy) pipeline.
model_dummy

In [15]:
# Display the XGBoost pipeline.
model_xgb

# Train-test split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# reproducible across kernel restarts, so saved models and saved predictions
# always refer to the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    fourrage[text_cols+num_cols], fourrage[targets],
    test_size=0.2, shuffle=True, random_state=42)

# Numeric-only view of the data. Using the same random_state on the same number
# of rows selects the same train/test rows as the split above.
X_train_num_only, X_test_num_only, y_train_num_only, y_test_num_only = train_test_split(
    fourrage[num_cols], fourrage[targets],
    test_size=0.2, shuffle=True, random_state=42)

In [47]:
# Look at the fourth distinct value of 'Libellé 4' in the training split.
X_train['Libellé 4'].unique()[3]


nan

# Model saving

In [18]:
import torch
from pickle import dump

In [50]:
from pickle import dump

# Fit the baseline pipeline on the training split, then pickle it to disk.
model.fit(X_train, y_train)
with open('baseline.pkl', 'wb') as baseline_file:
    dump(model, baseline_file, protocol=5)

In [None]:
# Fit the XGBoost pipeline, then save the model and its test-set predictions.
model_xgb.fit(X_train, y_train)
yhat = model_xgb.predict(X_test)

# Predictions are stored next to the true targets of the test split.
dico_pred_model_xgb = {
    "yhat": yhat,
    "ytrue": y_test,
}

# Fitted pipeline -> pickle.
with open('model_xgb.pkl', 'wb') as model_file:
    dump(model_xgb, model_file, protocol=5)

# Predictions dictionary -> torch.save.
with open('predictions_model_xgb.pth', 'wb') as predictions_file:
    torch.save(dico_pred_model_xgb, predictions_file)

In [None]:
# Fit the dummy pipeline, then save the model and its test-set predictions.
model_dummy.fit(X_train, y_train)
yhat = model_dummy.predict(X_test)

# Predictions are stored next to the true targets of the test split.
dico_pred_model_dummy = {
    "yhat": yhat,
    "ytrue": y_test,
}

# Fitted pipeline -> pickle.
with open('model_dummy.pkl', 'wb') as model_file:
    dump(model_dummy, model_file, protocol=5)

# Predictions dictionary -> torch.save.
with open('predictions_model_dummy.pth', 'wb') as predictions_file:
    torch.save(dico_pred_model_dummy, predictions_file)

In [55]:
# Build a MinMaxScaler for the targets, for use with the ML and dummy models.
from sklearn.preprocessing import MinMaxScaler
import pickle as pkl

# Scale each target column to [-1, 1], using the training targets only.
target_normalizer = MinMaxScaler(feature_range=(-1, 1)).fit(y_train)

# Saved as a nested dictionary: {'Normalizer': {'target': <fitted scaler>}}.
normalizer_payload = {'Normalizer' : {'target' : target_normalizer}}
with open('target_normalizer_ML.pkl', 'wb') as normalizer_file:
    pkl.dump(normalizer_payload, normalizer_file)

In [19]:
# Fit the numeric-only XGBoost pipeline, then save the model and its test-set
# predictions. The full X_train / X_test frames are passed in; the pipeline's
# ColumnTransformer is configured with `num_cols` only.
model_xgb_num_only.fit(X_train, y_train)
yhat = model_xgb_num_only.predict(X_test)

# Predictions are stored next to the true targets of the test split.
dico_pred_model_xgb_num_only = {
    "yhat": yhat,
    "ytrue": y_test,
}

# Fitted pipeline -> pickle.
with open('model_xgb_num_only.pkl', 'wb') as model_file:
    dump(model_xgb_num_only, model_file, protocol=5)

# Predictions dictionary -> torch.save.
with open('predictions_model_xgb_num_only.pth', 'wb') as predictions_file:
    torch.save(dico_pred_model_xgb_num_only, predictions_file)