In [77]:
import re
import os
from datasets import Dataset, DatasetDict
from datasets import ClassLabel
import pandas as pd
from sentence_transformers.losses import CosineSimilarityLoss
import numpy as np
from setfit import SetFitModel, SetFitTrainer
from sklearn.model_selection import train_test_split

In [66]:
def remove_leading_numbering(input_string, delimiter=". "):
    """Strip a leading list number such as ``"12. "`` from a line of text.

    The text is only shortened when the part before the first ``delimiter``
    is purely numeric. Lines without a leading number are returned unchanged,
    so an ordinary sentence containing ". " keeps its first sentence.

    Args:
        input_string: The line to clean.
        delimiter: The separator that follows the leading number.

    Returns:
        The text after the leading number and delimiter, or ``input_string``
        itself when it does not start with a number followed by ``delimiter``.
    """
    prefix, separator, remainder = input_string.partition(delimiter)
    # `separator` is empty when the delimiter does not occur at all.
    if separator and prefix.strip().isdigit():
        return remainder
    return input_string

In [67]:
# Authenticate this session with the Hugging Face Hub; in the notebook the call
# renders a widget (see the VBox output of this cell).
# NOTE(review): `logout` is imported but not called in any of the visible cells.
from huggingface_hub import login, logout
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### 1. Creating custom dataset

In [68]:
# Build a flat {column: list} dict from the per-class text files in `parent_folder`.
# Data files are named "<number>_<class_name>..."; the class name is taken from the
# file name and every line containing at least one letter becomes one text sample.
parent_folder = '../temp_training/ai_generated_para'
# os.listdir() returns entries in arbitrary order; sorting keeps the row order (and
# therefore the seeded train/valid/test split further down) reproducible across runs.
file_names = sorted(os.listdir(parent_folder))
dataset_dict = {"class_name": [],
                "class_name_for_onehot": [],
                "text": []}
for file_name in file_names:
    file_path = os.path.join(parent_folder, file_name)
    match = re.match(r'^(\d+)_(\w+)', file_name)
    # Skip stray entries (hidden files, sub-directories, ...) instead of failing with
    # an AttributeError on `match.group` or an error from open().
    if match is None or not os.path.isfile(file_path):
        print(f"Skipping unexpected entry: {file_name}")
        continue
    class_name = match.group(2)
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Keep only lines that contain at least one letter (drops blank/separator
        # lines), then strip surrounding whitespace and any leading list numbering.
        sentences = [remove_leading_numbering(line.strip())
                     for line in file
                     if any(c.isalpha() for c in line)]
    class_names = [class_name] * len(sentences)
    dataset_dict['class_name'].extend(class_names)
    # Duplicate of class_name: this copy is consumed by pd.get_dummies later on.
    dataset_dict['class_name_for_onehot'].extend(class_names)
    dataset_dict['text'].extend(sentences)

In [132]:
# Turn the collected columns into a DataFrame, then expand the helper column
# 'class_name_for_onehot' into one 0/1 integer column per class. The empty prefix and
# separator make each indicator column carry the bare class name.
dataset_df = pd.DataFrame(data=dataset_dict)
encoded_df = pd.get_dummies(
    dataset_df,
    columns=['class_name_for_onehot'],
    prefix="",
    prefix_sep='',
    dtype=int,
)

In [133]:
# Configure the label names and their integer indices from the encoded table.
non_label_columns = ['text', 'class_name']
# 'class_index' is excluded explicitly: this cell adds it to encoded_df on its last
# line, so without the exclusion a re-run of the cell would count it as a label.
label_columns = [column for column in encoded_df.columns
                 if column not in non_label_columns and column != 'class_index']
label_indices = range(len(label_columns))
# class name -> position of its one-hot column
label_dict = dict(zip(label_columns, label_indices))
# Vectorised lookup of each row's class name (replaces a row-wise DataFrame.apply).
encoded_df['class_index'] = encoded_df['class_name'].map(label_dict)

In [134]:
# Preview the first rows: class_name, text, the one-hot class columns and class_index.
encoded_df.head(5)

Unnamed: 0,class_name,text,aviation,cybersecurity,domestic_unrest_violence,extreme_weather,forced_labor,general_biz_trend,individual_accidents_tragedies,later_report,...,leisure_other_news,maritime,pandemics_large_scale_diseases,railway,strike,trade_war_embargos_bans,transportation_trends_projects,war_conflict,warehouse_fire,class_index
0,lawsuit_legal_insurance,In the aftermath of the catastrophic fire that...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
1,lawsuit_legal_insurance,Residents of Phoenix are up in arms following ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
2,lawsuit_legal_insurance,Denver's public transportation system faces le...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
3,lawsuit_legal_insurance,Houston's skyline was marred by chaos on Novem...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8
4,lawsuit_legal_insurance,The serene coastal town of Wilmington is rattl...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8


In [135]:
# Convert the frame into a {column name: list of values} dict, which is what
# Dataset.from_dict receives in the following cell.
result_dataset_dict = encoded_df.to_dict('list')

In [136]:
dataset = Dataset.from_dict(result_dataset_dict)
# Every column except the raw text / class name is cast to a ClassLabel feature.
label_features = [feature for feature in dataset.features if feature not in non_label_columns]
new_features = dataset.features.copy()
for label_feature in label_features:
    if label_feature == 'class_index':
        # class_index holds one integer per class (0 .. number of classes - 1), not a
        # 0/1 flag, so it needs one ClassLabel entry per class. It must be a ClassLabel
        # because the split below stratifies on it. The names follow label_columns,
        # the same order label_dict used to assign the indices.
        new_features[label_feature] = ClassLabel(names=list(label_columns))
    else:
        new_features[label_feature] = ClassLabel(num_classes=2)  # binary for each one-hot
dataset = dataset.cast(new_features)

Casting the dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

In [137]:
# Two-stage stratified split (train_test_split returns a DatasetDict and shuffles by default):
#   stage 1 - 80% train / 20% held out
#   stage 2 - the held-out part is halved into validation and test
# Both stages stratify on "class_index" and use the same seed.
train_validtest = dataset.train_test_split(
    test_size=0.2,
    seed=99,
    stratify_by_column="class_index",
)
valid_test = train_validtest['test'].train_test_split(
    test_size=0.5,
    seed=99,
    stratify_by_column="class_index",
)
train_valid_test_dataset = DatasetDict(
    {
        'train': train_validtest['train'],
        'valid': valid_test['train'],
        'test': valid_test['test'],
    }
)

In [141]:
# Sanity check on the stratification: count the rows per class in every split,
# first by integer class_index and then by readable class_name.
import collections

separator = "=" * 30
split_counters = []
for column in ('class_index', 'class_name'):
    counter_train, counter_valid, counter_test = (
        collections.Counter(train_valid_test_dataset[split][column])
        for split in ('train', 'valid', 'test')
    )
    split_counters += [counter_train, counter_valid, counter_test]

# One counter per line, with a separator line between consecutive counters.
print(*split_counters, sep="\n" + separator + "\n")

Counter({2: 40, 0: 40, 13: 40, 6: 40, 9: 40, 3: 40, 16: 40, 12: 40, 1: 40, 7: 40, 14: 40, 10: 40, 11: 40, 8: 40, 17: 40, 15: 40, 5: 40, 4: 40})
Counter({1: 5, 5: 5, 14: 5, 3: 5, 0: 5, 15: 5, 12: 5, 11: 5, 16: 5, 6: 5, 4: 5, 9: 5, 10: 5, 8: 5, 7: 5, 17: 5, 2: 5, 13: 5})
Counter({13: 5, 4: 5, 15: 5, 2: 5, 5: 5, 11: 5, 9: 5, 6: 5, 10: 5, 3: 5, 14: 5, 0: 5, 8: 5, 7: 5, 1: 5, 16: 5, 12: 5, 17: 5})
Counter({'domestic_unrest_violence': 40, 'aviation': 40, 'strike': 40, 'individual_accidents_tragedies': 40, 'leisure_other_news': 40, 'extreme_weather': 40, 'war_conflict': 40, 'railway': 40, 'cybersecurity': 40, 'later_report': 40, 'trade_war_embargos_bans': 40, 'maritime': 40, 'pandemics_large_scale_diseases': 40, 'lawsuit_legal_insurance': 40, 'warehouse_fire': 40, 'transportation_trends_projects': 40, 'general_biz_trend': 40, 'forced_labor': 40})
Counter({'cybersecurity': 5, 'general_biz_trend': 5, 'trade_war_embargos_bans': 5, 'extreme_weather': 5, 'aviation': 5, 'transportation_trends_proje

In [142]:
# Names of the one-hot label columns: every column of the train split except the
# text and the two class bookkeeping columns.
labels = [
    column
    for column in train_valid_test_dataset['train'].column_names
    if column not in ('text', 'class_index', 'class_name')
]


def encode_labels(record):
    """Gather a row's one-hot values, in `labels` order, under a single "label" key."""
    one_hot = []
    for feature in labels:
        one_hot.append(record[feature])
    return {"label": one_hot}


# Adds the "label" column to every split of the DatasetDict.
train_valid_test_dataset = train_valid_test_dataset.map(encode_labels)

Map:   0%|          | 0/720 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

In [144]:
# Spot check on one training row: class_name, the one-hot columns, class_index and the
# "label" list should all point at the same class.
train_valid_test_dataset['train'][0]

{'class_name': 'domestic_unrest_violence',
 'text': 'Buenos Aires, Argentina, grappled with civic unrest as citizens protested against economic austerity measures. The demonstrations, marked by clashes with security forces, exposed the deepening economic challenges facing the South American nation.',
 'aviation': 0,
 'cybersecurity': 0,
 'domestic_unrest_violence': 1,
 'extreme_weather': 0,
 'forced_labor': 0,
 'general_biz_trend': 0,
 'individual_accidents_tragedies': 0,
 'later_report': 0,
 'lawsuit_legal_insurance': 0,
 'leisure_other_news': 0,
 'maritime': 0,
 'pandemics_large_scale_diseases': 0,
 'railway': 0,
 'strike': 0,
 'trade_war_embargos_bans': 0,
 'transportation_trends_projects': 0,
 'war_conflict': 0,
 'warehouse_fire': 0,
 'class_index': 2,
 'label': [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [145]:
# pushing to hub:
# train_valid_test_dataset.push_to_hub("joshuapsa/gpt-generated-news-paragraphs-v1.1")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [40]:
# we can save and reload the dataset by:
# dataset.save_to_disk('./custom_datasets/sample_dataset')
# from datasets import load_from_disk
# reloaded_dataset = load_from_disk("./custom_datasets/sample_dataset")
# reloaded_dataset

### 2. Multi-label text classification

In [55]:
# getting pretrained model from Huggingface and save it
# model_id = "sentence-transformers/paraphrase-mpnet-base-v2"
# model = SetFitModel.from_pretrained(model_id, # use any Sentence transformer model
#                                     multi_target_strategy="one-vs-rest" 
#                                     )
# model.save_pretrained('../models/pretrained/paraphrase-mpnet-base-v2/')

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [56]:
# Load the locally saved pretrained checkpoint (any Sentence Transformer model can be
# used here) and request a one-vs-rest head for the multi-target setup.
loaded_model = SetFitModel.from_pretrained(
    '../models/pretrained/paraphrase-mpnet-base-v2/',
    multi_target_strategy="one-vs-rest",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [107]:
# Define the parameters for the fine-tuning process.
# 'accuracy' is the default metric; it is stated explicitly for clarity.
trainer = SetFitTrainer(
    model=loaded_model,
    train_dataset=train_valid_test_dataset['train'],
    eval_dataset=train_valid_test_dataset['valid'],
    loss_class=CosineSimilarityLoss,
    num_iterations=5,
    # IMPORTANT for SetFit: column_mapping goes from the dataset's column names (keys)
    # to the names SetFit expects ("text" and "label"). The label column produced by
    # encode_labels is called "label", so the key must be "label", not "labels".
    column_mapping={"text": "text", "label": "label"},
    num_epochs=1,
    batch_size=5,
    metric='accuracy'
)

In [108]:
# training (75min for 2000 summaries):
# The trainer was constructed with model=loaded_model, so that is the model being fitted.
trainer.train()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/5 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 6400
  Num epochs = 1
  Total optimization steps = 1280
  Total train batch size = 5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1280 [00:00<?, ?it/s]

In [109]:
# we can save the finetuned model:
# loaded_model.save_pretrained('../models/finetuned/paraphrase-mpnet-base-v2/')

In [110]:
# Get performance on the eval dataset (the 'valid' split passed as eval_dataset),
# using the metric configured on the trainer.
trainer.evaluate()

Applying column mapping to evaluation dataset
***** Running evaluation *****


{'accuracy': 0.9875}

In [5]:
# Load both checkpoints from disk, fine-tuned first and pretrained second.
# The fine-tuned directory has to exist already: the save_pretrained call that would
# write it is commented out in this notebook.
finetuned_model, pretrained_model = map(
    SetFitModel.from_pretrained,
    (
        '../models/finetuned/paraphrase-mpnet-base-v2/',
        '../models/pretrained/paraphrase-mpnet-base-v2/',
    ),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Display the classification head attached to the fine-tuned model.
finetuned_model.model_head

In [11]:
# Smoke test: predict on one hand-written sentence.
# Prediction over the whole test split is kept here only as a commented-out line:
# preds = finetuned_model.predict(train_valid_test_dataset['test']['text'])
sample_text = 'violence broke out on the streets of Paris yesterday when demonstrators clashed with the city police'
finetuned_pred = finetuned_model.predict([sample_text]) # ok, as the head is fitted
# The same call on the pretrained model fails because its head has not been fitted:
# pretrained_pred = pretrained_model.predict([sample_text]) # NotFittedError: This OneVsRestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [125]:
# post-processing: turn the one-hot rows of the test split into label names.
texts = list(train_valid_test_dataset['test']['text'])
# `preds` was previously assigned only in a commented-out line, which raises a
# NameError on a fresh kernel; compute it here from the same texts.
preds = finetuned_model.predict(texts)
# The one-hot column created by encode_labels is named "label" (not "labels").
true_labels = [label_features[int(np.array(label).argmax())]
               for label in train_valid_test_dataset['test']['label']]
# NOTE: argmax returns position 0 for an all-zero prediction row, so a text for which
# no class fired is reported as the first label.
pred_labels = [label_features[int(pred.argmax())] for pred in preds]

In [128]:
# Side-by-side table: each test text with its true and its predicted label name.
result_df = pd.DataFrame(
    data=dict(
        texts=texts,
        true_labels=true_labels,
        pred_labels=pred_labels,
    )
)

In [130]:
# result_df.to_csv('sample_setfit_test.csv')

In [158]:
# Load the real-world articles to classify (Excel export; the CSV variant and the
# 200-row sample are kept as commented-out alternatives).
# test_df = pd.read_csv('../temp_training/medallion/gold/gold_COMBINED_sentences_body_summary.csv')
test_df = pd.read_excel('../temp_training/medallion/gold/gold_COMBINED.xlsx')
# test_df = test_df.sample(200)
# NOTE(review): the preview shows 'Unnamed: 0' / 'Unnamed: 0.1' columns, presumably
# index columns written by earlier exports - confirm whether they can be dropped.
test_df.head(2)

Unnamed: 0.1,Unnamed: 0,INSERT_DATETIME,URI,TOPIC,TOPIC_URI,TITLE,BODY,URL,RELEVANCE_CLASS,BODY_SUMMARY,EVENTURI,SOURCE,METADATA,ARTICLE_HIERARCHY
0,0,09:07.6,7501230662,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,"Fire destroys Sangre Grande block factory, house",Fire officers were up to late yesterday trying...,https://www.cnc3.co.tt/fire-destroys-sangre-gr...,1,Fire destroyed a block factory in Sangre Grand...,,,,
1,1,02:53.0,7463581898,marine,d4dfd2e7-86fc-4098-975e-32dcf4eea142,Another rail strike in Germany to add to Europ...,A rail strike in Germany on Monday is expected...,https://theloadstar.com/another-rail-strike-in...,1,Rail strike in Germany expected to cause delay...,,,,


In [159]:
# Predict on every BODY_SUMMARY. `loaded_model` is the object that was handed to
# SetFitTrainer, so its head is fitted only if trainer.train() ran in this kernel session.
# NOTE(review): missing BODY_SUMMARY values would be passed to predict as-is - confirm
# the column has no blanks.
preds_2 = loaded_model.predict(test_df['BODY_SUMMARY'].to_list())

In [160]:
# post-processing: pair every summary with the label name found at the argmax position
# of its prediction row.
texts_2 = test_df['BODY_SUMMARY'].to_list()
pred_labels_2 = [
    label_features[position]
    for position in (int(row.argmax()) for row in preds_2)
]
result_df_2 = pd.DataFrame(
    data={
        "texts": texts_2,
        'pred_labels': pred_labels_2,
    }
)

In [161]:
# Write the predictions to a CSV in the current working directory. to_csv keeps its
# default index=True, so the DataFrame index is written as the first column.
result_df_2.to_csv('sample_setfit_actual_data_body_summary_all.csv')