In [1]:
import mlflow
import pandas as pd
from transformers import pipeline
from tqdm.notebook import tqdm
import numpy as np
tqdm.pandas()
import datetime
from src.helpers import zero_shot_predict_single_model, get_top_n_label_and_score, save_dataframe_with_timestamp

imported src


## 1. Load and Prepare gold data

In [4]:
inference_input_data_path = '../temp_training/medallion/gold/gold_COMBINED.xlsx'
inference_df = pd.read_excel(inference_input_data_path)
inference_df.shape

(3016, 14)

In [7]:
inference_df.groupby(['TOPIC','RELEVANCE_CLASS']).size()

TOPIC                 RELEVANCE_CLASS
air                   -1                  51
                       0                 119
                       1                 146
cybersecurity         -1                   2
forced_labor          -1                   3
                       1                   1
marine                 0                 190
                       1                 134
maritime              -1                  56
                       0                   7
                       1                  11
material               0                 118
                       1                   3
protest_riot           0                 421
                       1                  39
road                   0                  22
strike                 0                 189
                       1                  23
train                 -1                 134
                       0                  85
                       1                 156
warehouse_fire   

## 2. Listing the experiments recorded 

--> use one for sampling new codes, one for full data

In [2]:
EXPERIMENT_NAME = "zero_shot_prediction"
# EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
mlflow.search_experiments()

[<Experiment: artifact_location='file:///c:/Users/JOSHUALE/Documents/Github/mlflow_tutorial/experiments/mlruns/227815192018524595', creation_time=1697818753176, experiment_id='227815192018524595', last_update_time=1697818753176, lifecycle_stage='active', name='zero_shot_prediction_L14D_001', tags={}>,
 <Experiment: artifact_location='file:///c:/Users/JOSHUALE/Documents/Github/mlflow_tutorial/experiments/mlruns/677968303451148623', creation_time=1697816832103, experiment_id='677968303451148623', last_update_time=1697816832103, lifecycle_stage='active', name='zero_shot_prediction_001', tags={}>,
 <Experiment: artifact_location='file:///c:/Users/JOSHUALE/Documents/Github/mlflow_tutorial/experiments/mlruns/0', creation_time=1697816832085, experiment_id='0', last_update_time=1697816832085, lifecycle_stage='active', name='Default', tags={}>]

## Sample experiment on small df

In [10]:
# for sample experiment:
EXPERIMENT_ID = '677968303451148623'
RUN_DESCRIPTION = ''
with mlflow.start_run(experiment_id=EXPERIMENT_ID, description=RUN_DESCRIPTION) as run:
    RUN_ID = run.info.run_id
    print(f"Started run: {RUN_ID}")
    # change path accordingly
    class_labels_data_path = '../data/input/class_label_by_topic.csv'
    class_labels_data = pd.read_csv(class_labels_data_path)
    candidate_labels = list(class_labels_data['CLASS_DESCRIPTION'])

    inference_df = pd.read_csv(inference_input_data_path)
    inference_df = inference_df.sample(5, random_state=99) # getting a sample only

    model_path = '../models/pretrained/bart-large-mnli/'
    loaded_classifier = pipeline("zero-shot-classification", model=model_path)
    
    mlflow.log_param("model_path", model_path)
    mlflow.log_artifact(class_labels_data_path, "class_labels_data")
    mlflow.log_artifact(inference_input_data_path, "inference_input_data")
    
    inference_df['ZERO_SHOT_PREDICTION'] = inference_df.progress_apply(lambda row: 
                                                   zero_shot_predict_single_model(
                                                       classifier=loaded_classifier, 
                                                       sequence_to_classify=row['BODY_SUMMARY'], 
                                                       candidate_labels=candidate_labels), 
                                                    axis=1)
    inference_df['FIRST_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 1)[0])
    inference_df['FIRST_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 1)[1])
    inference_df['SECOND_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 2)[0])
    inference_df['SECOND_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 2)[1])
    # Save the DataFrame to a CSV file
    current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    predictions_path = f"./output/temp_mlflow/zero_shot_prediction_{current_datetime}.csv"
    inference_df.to_csv(predictions_path, index=False)
    # Log the CSV file as an artifact
    mlflow.log_artifact(predictions_path, f"zero_shot_prediction")
mlflow.end_run() 

Started run: cdb489738fe74707b16ee5f81f00aa7a


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/5 [00:00<?, ?it/s]

## Prediction on full dataset

In [6]:
# for time consuming experiment:
EXPERIMENT_ID = '227815192018524595'
RUN_DESCRIPTION = ''
with mlflow.start_run(experiment_id=EXPERIMENT_ID, description=RUN_DESCRIPTION) as run:
    RUN_ID = run.info.run_id
    print(f"Started run: {RUN_ID}")
    # change path accordingly
    class_labels_data_path = '../data/input/class_label_by_topic.csv'
    class_labels_data = pd.read_csv(class_labels_data_path)
    candidate_labels = list(class_labels_data['CLASS_DESCRIPTION'])
    print("Candidate labels in use:")
    print(candidate_labels)

    inference_df = pd.read_csv(inference_input_data_path)
    # inference_df = inference_df.sample(5, random_state=99) # getting a sample only

    model_path = '../models/pretrained/bart-large-mnli/'
    loaded_classifier = pipeline("zero-shot-classification", model=model_path)
    
    mlflow.log_param("model_path", model_path)
    mlflow.log_artifact(class_labels_data_path, "class_labels_data")
    mlflow.log_artifact(inference_input_data_path, "inference_input_data")
    
    inference_df['ZERO_SHOT_PREDICTION'] = inference_df.progress_apply(lambda row: 
                                                   zero_shot_predict_single_model(
                                                       classifier=loaded_classifier, 
                                                       sequence_to_classify=row['BODY_SUMMARY'], 
                                                       candidate_labels=candidate_labels), 
                                                    axis=1)
    inference_df['FIRST_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 1)[0])
    inference_df['FIRST_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 1)[1])
    inference_df['SECOND_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 2)[0])
    inference_df['SECOND_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 2)[1])
    inference_df['THIRD_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 3)[0])
    inference_df['THIRD_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 3)[1])
    # Save the DataFrame to a CSV file
    current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    predictions_path = f"./output/temp_mlflow/zero_shot_prediction_{current_datetime}.csv"
    inference_df.to_csv(predictions_path, index=False)
    # Log the CSV file as an artifact
    mlflow.log_artifact(predictions_path, f"zero_shot_prediction")
mlflow.end_run() 

    

Started run: 431def949bda4214a597e04b53ad217c
Candidate labels in use:
['very recent breaking news on major railway transportation disruption, bad news', 'very recent breaking news on major maritime transportation disruption, bad news', 'very recent breaking news on warehouse and storage facilities disruption or destruction, bad news', 'very recent breaking news on major air transportation or airport disruption, bad news', 'very recent breaking news on severe and extreme weather causing disruption, bad news ', 'very recent breaking news on major and large scale worker strike actions causing disruption, bad news', 'very recent breaking news on major and large scale cyber attacks and security breaches, bad news', 'very recent breaking news on forced labor and sweatshop', 'later reports of past transportation disruption event, bad news', 'lawsuits, legal or insurance impact of past event, bad news', 'general social, business, economic reports, studies and trends', 'leisure or other news']

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/120 [00:00<?, ?it/s]

### For adding more n-th pred score

In [5]:
import ast 

inference_df = pd.read_csv('./output/temp_mlflow/zero_shot_prediction_L14D.csv')
inference_df['ZERO_SHOT_PREDICTION'] = inference_df['ZERO_SHOT_PREDICTION'].apply(ast.literal_eval)
inference_df['FIRST_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 1)[0])
inference_df['FIRST_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 1)[1])
inference_df['SECOND_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 2)[0])
inference_df['SECOND_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 2)[1])
inference_df['THIRD_PREDICTION_CLASS'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 3)[0])
inference_df['THIRD_PREDICTION_SCORE'] = inference_df['ZERO_SHOT_PREDICTION'].apply(lambda result_dict: get_top_n_label_and_score(result_dict, 3)[1])

current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
predictions_path = f"./output/temp_mlflow/zero_shot_prediction_{current_datetime}.csv"
inference_df.to_csv(predictions_path, index=False)