In [1]:
import mlflow 

### Connecting

In [2]:
# Turn on scikit-learn autologging (with dataset logging disabled), then point
# MLflow at the tracking server and select the experiment used by this notebook.
mlflow.sklearn.autolog(log_datasets = False )
mlflow.set_tracking_uri("http://127.0.0.1:5000")  # tracking server on the loopback address
mlflow.set_experiment("model_design_3")

<Experiment: artifact_location='mlflow-artifacts:/864443857530066018', creation_time=1699977548211, experiment_id='864443857530066018', last_update_time=1699977548211, lifecycle_stage='active', name='model_design_3', tags={}>

### Build model function 

In [12]:
def build_model(
    dataset,
    dataset_valid,
    pipeline,
    mlflow_run_tags=None,
    mlflow_run_parameters=None,
    mlflow_run_description=None,
):
    """
    Fit a sentiment analysis pipeline inside a new MLflow run and evaluate it.

    The pipeline is fitted on the ``review`` / ``polarity`` columns of ``dataset``
    and then evaluated on the same columns of ``dataset_valid``. The validation
    accuracy is printed and logged to the run as the ``valid_accuracy`` metric.

    @param: dataset: pandas dataframe containing the training set (columns ``review`` and ``polarity``)
    @param: dataset_valid: pandas dataframe containing the validation set (same columns as ``dataset``)
    @param: pipeline: scikit-learn pipeline that will be fitted on the training data (modified in place)
    @param: mlflow_run_tags: dict of tags that will be stored in the MLFlow run
    @param: mlflow_run_parameters: dict of parameters that will be stored in the MLFlow run
    @param: mlflow_run_description: textual description of the run
    @return: the fitted pipeline (the same object that was passed in)
    """
    with mlflow.start_run(description=mlflow_run_description or ""):
        # Batch calls store the whole dict at once; None and {} are both skipped.
        if mlflow_run_tags:
            mlflow.set_tags(mlflow_run_tags)
        if mlflow_run_parameters:
            mlflow.log_params(mlflow_run_parameters)

        pipeline.fit(dataset["review"], dataset["polarity"])

        X_valid = dataset_valid["review"]
        y_valid = dataset_valid["polarity"]
        y_pred = pipeline.predict(X_valid)

        # Fraction of validation rows whose predicted polarity matches the label.
        valid_accuracy = float((y_valid == y_pred).mean())
        mlflow.log_metric("valid_accuracy", valid_accuracy)
        print(f"Validation accuracy: {valid_accuracy:.4f}")

    return pipeline
                




In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as stopwords
from sklearn.linear_model import LogisticRegression
import pandas as pd

## First pipeline

In [9]:
# Materialise the imported French stop words as a list before handing them to the vectorizer.
stopwords = list(stopwords)

# Baseline text classifier: TF-IDF features followed by a logistic regression.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words=stopwords)),
        ("log", LogisticRegression(max_iter=2000)),
    ]
)

### Data loading

In [10]:
# Read the training and validation splits (in that order) from the data directory.
df_t, df_v = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/valid.csv")
)

In [14]:
# First tracked run: fit the baseline pipeline and record its tags/parameters in MLflow.
build_model(
    df_t,
    df_v,
    pipe,
    mlflow_run_tags={"model_type": "logistic regression", "model_version": "v1"},
    mlflow_run_parameters={"max_iter": 2000},
    mlflow_run_description="model 3 to try the function build_model",
)



## Second pipeline :

This second pipeline uses a different `max_iter` so that we can check how a new run with different settings shows up on the MLflow side.

In [22]:
# Same architecture as the first pipeline, with a lower iteration cap for the solver.
pipe2 = Pipeline(
    [
        ("tfidf", TfidfVectorizer(stop_words=stopwords)),
        ("log", LogisticRegression(max_iter=1300)),
    ]
)

# build_model has no model-name parameter: passing "model_3_bis" positionally bound it to
# mlflow_run_tags and clashed with the keyword of the same name (TypeError: multiple values).
# The name is kept as a run tag instead.
build_model(
    df_t,
    df_v,
    pipe2,
    mlflow_run_tags={"model_name": "model_3_bis", "model_version": "v1"},
    mlflow_run_description="model 3 to try second time the function build_model",
)



**As tested above, the `build_model` function works as intended with MLflow, and the results are as expected.**

# Hyperopt And Mlflow

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
# Vectorise once, outside the hyperparameter search: the TF-IDF vocabulary is learned
# on the training reviews and then applied unchanged to the validation reviews.
stopwords = list(stopwords)
tfidf = TfidfVectorizer(stop_words=stopwords)

# Labels for both splits.
y, y_valid = df_t["polarity"], df_v["polarity"]

# Feature matrices for both splits.
X = tfidf.fit_transform(df_t["review"])
X_valid = tfidf.transform(df_v["review"])




In [29]:
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Search space: regularisation strength C sampled uniformly between 0.1 and 10,
# and the penalty type picked from the two options below (order matters for hp.choice).
space = dict(
    C=hp.uniform("C", 0.1, 10),
    penalty=hp.choice("penalty", ["l2", "l1"]),
)




# Objective minimised by hyperopt: one MLflow run per candidate configuration.
def objective(params):
    """
    Train a logistic regression with the sampled hyperparameters and return its loss.

    The model is fitted on the module-level ``X`` / ``y`` and evaluated on
    ``X_valid`` / ``y_valid``. The run is tagged ``hyperopt_candidate``.

    @param: params: mapping providing at least the keys ``C`` and ``penalty``
    @return: ``1 - accuracy`` measured on the validation set
    """
    with mlflow.start_run():
        # Only forward the two tuned hyperparameters; any other key is ignored.
        candidate = LogisticRegression(
            C=params["C"],
            penalty=params["penalty"],
            max_iter=2000,
            solver="liblinear",
        )
        candidate.fit(X, y)
        predictions = candidate.predict(X_valid)
        accuracy = accuracy_score(y_valid, predictions)
        mlflow.set_tag("hyperopt_candidate", True)
    return 1 - accuracy

# Launch the search: 20 evaluations of the objective proposed by the TPE algorithm,
# with the history of every evaluation kept in `trials`.
trials = Trials()
best = fmin(objective, space, algo=tpe.suggest, max_evals=20, trials=trials)

100%|██████████| 20/20 [01:49<00:00,  5.47s/trial, best loss: 0.08045000000000002]


In [30]:
# Show the best assignment found by fmin. NOTE(review): for the hp.choice dimension
# ('penalty') the reported value looks like the index into the list of options
# (0 -> 'l2'), not the option itself — confirm before using it as a model argument.
print(best) 

{'C': 5.318920759871554, 'penalty': 0}


In [20]:
from hyperopt import space_eval

# Retrain the selected configuration as a full text pipeline and register it in MLflow.
with mlflow.start_run():
    # fmin reports hp.choice dimensions as an index into the list of options, so decode
    # `best` against the search space. Previously only C was forwarded and the tuned
    # penalty was silently replaced by the estimator default.
    params = space_eval(space, best)
    model = LogisticRegression(**params, max_iter=2000, solver="liblinear")
    pipe = Pipeline([("tfidf", TfidfVectorizer(stop_words=stopwords)), ("model", model)])
    pipe.fit(df_t["review"], df_t["polarity"])

    # Record how the retrained pipeline performs on the validation split.
    y_pred = pipe.predict(df_v["review"])
    mlflow.log_metric("valid_accuracy", accuracy_score(df_v["polarity"], y_pred))

    mlflow.set_tag("hyperopt_selected_new", True)
    mlflow.sklearn.log_model(pipe, "model_3_hyperopt_new", registered_model_name="model_3_hyperopt")

Registered model 'model_3_hyperopt' already exists. Creating a new version of this model...
2023/11/21 15:32:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: model_3_hyperopt, version 2
Created version '2' of model 'model_3_hyperopt'.
