# Zero-Shot Event Classification

In [1]:
import json
import torch
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sentence_transformers import SentenceTransformer, util

def read_dataset(path):
    """Load a TSV dataset from the CASE 2021 shared task.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line must hold exactly three tab-separated fields:
    id, text and label. Blank lines (e.g. a trailing newline at the end of
    the file) are ignored instead of aborting the load.

    Args:
        path: Path to a UTF-8 encoded TSV file.

    Returns:
        A list of dicts with the keys "id", "text" and "label", in file order.

    Raises:
        ValueError: If a non-blank data line does not split into exactly
            three tab-separated fields.
    """
    dataset = []
    with open(path, encoding="utf8") as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            line = line.strip()
            if not line:
                # Blank line: nothing to parse.
                continue
            # `item_id` rather than `id` so the builtin is not shadowed.
            item_id, text, label = line.split("\t")
            dataset.append({"id": item_id, "text": text, "label": label})
    return dataset

## Implementing Simple Zero-Shot Classifier

Our approach in a nutshell:
* We use a sentence encoder from `sentence-transformers` to convert both label descriptions and texts to predict into embeddings that live in the same embedding space.
* At test time, we embed a new text and compare it to each label embedding via cosine similarity.
* We assign the label with the highest similarity to the item.
* Optionally, we define a minimum similarity threshold that a label needs to pass. If no label passes this threshold, we assign the "OTHER" class.


In [7]:
class ZeroShotClassifier:
    """Zero-shot classifier that matches texts to label descriptions.

    Label descriptions and input texts are embedded with the same encoder.
    Each input receives the label whose description embedding has the highest
    cosine similarity; if a threshold is set and the best similarity is below
    it, the null label is assigned instead.
    """

    def __init__(self, model=None, threshold=None, null_label="OTHER"):
        """Store the encoder and the decision settings.

        Args:
            model: Object exposing ``encode(texts)``; used to embed label
                descriptions and input texts.
            threshold: Optional minimum similarity the best label must reach.
                ``None`` disables the fallback to ``null_label``.
            null_label: Label assigned when no label reaches ``threshold``.
        """
        self.model = model
        self.labels = []
        self.label_embeddings = None
        self.threshold = threshold
        self.null_label = null_label

    def train(self, labels, descriptions):
        """Embed the label descriptions and remember the label names.

        Args:
            labels: Label names, aligned by position with ``descriptions``.
            descriptions: One textual description per label.

        Raises:
            ValueError: If the classifier was created without a model.
        """
        if self.model is None:
            raise ValueError("ZeroShotClassifier needs a model to embed label descriptions.")
        self.labels = labels
        # Use the instance's own encoder, not whatever global `model` happens
        # to be bound in the notebook at call time.
        self.label_embeddings = self.model.encode(descriptions)

    def predict(self, input_texts=None, input_embeddings=None, output_scores=False):
        """Assign the most similar label to every input.

        Args:
            input_texts: Texts to classify; ignored when ``input_embeddings``
                is given.
            input_embeddings: Precomputed embeddings of the inputs.
            output_scores: When true, also return the per-input list of
                ``(label, score)`` pairs sorted by descending score.

        Returns:
            A list of predicted labels, or a ``(labels, scores)`` tuple when
            ``output_scores`` is true.

        Raises:
            RuntimeError: If ``train`` has not been called yet.
            ValueError: If neither texts nor embeddings are provided, or if
                texts are provided but the classifier has no model.
        """
        if self.label_embeddings is None:
            raise RuntimeError("Classifier has no label embeddings; call train() first.")

        if input_embeddings is None:
            if input_texts is None:
                raise ValueError("Provide either input_texts or input_embeddings.")
            if self.model is None:
                raise ValueError("ZeroShotClassifier needs a model to embed input_texts.")
            input_embeddings = self.model.encode(input_texts)

        S = util.pytorch_cos_sim(input_embeddings, self.label_embeddings)

        predicted_labels = []
        predicted_scores = []
        # Iterate over the rows of the similarity matrix rather than over
        # input_embeddings.shape[0], so plain lists and a single 1-D embedding
        # are handled as well.
        for row in S:
            scored = sorted(
                zip(self.labels, row.tolist()),
                key=lambda x: x[1],
                reverse=True
            )
            pred, score = scored[0]
            if self.threshold is not None and score < self.threshold:
                pred = self.null_label

            predicted_scores.append(scored)
            predicted_labels.append(pred)

        if output_scores:
            return predicted_labels, predicted_scores
        return predicted_labels

## Initializing Classifier

In [None]:
# Trains a sentence encoder with the denoising auto-encoder objective and
# saves it to 'output/tsdae-model'.
# NOTE(review): this cell has no execution count (In [None]), i.e. it was not
# run in the saved notebook; the same code appears again in cell In [33].
# NOTE(review): it rebinds the notebook-global `model`, which the next cell
# (In [8]) rebinds again to "paraphrase-mpnet-base-v2".
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

import nltk
# presumably needed by the denoising dataset for tokenization — TODO confirm
nltk.download('punkt')

# Define your sentence transformer model using CLS pooling
model_name = 'bert-base-uncased'
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences)
# NOTE(review): the four sentences below are placeholders, far fewer than the
# amount recommended in the comment above; replace them with real data.
train_sentences = ["Your set of sentences",
                   "Model will automatically add the noise", 
                   "And re-construct it",
                   "You should provide at least 1k sentences"]

# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the denoising auto-encoder loss
# The decoder is initialised from the same checkpoint name as the encoder.
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

# Call the fit method: one epoch, 'constantlr' scheduler, lr 3e-5, no weight decay
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True
)

# Persist the trained encoder under a relative path.
model.save('output/tsdae-model')

In [8]:
# Load the pretrained sentence encoder and bind it to the notebook-global
# `model`, which the classifier cells below pass to ZeroShotClassifier.
device = "cpu" # set as "cuda" instead if you have a GPU set up
# the first time this line runs the model will be downloaded
model = SentenceTransformer("paraphrase-mpnet-base-v2", device=device)

In [9]:
# Build a classifier without a similarity threshold (threshold defaults to
# None, so the null label is never assigned) and embed the label descriptions.
# NOTE(review): `label_names` and `label_descriptions` are not defined in any
# cell visible here, and the execution counts jump from In [1] to In [7];
# confirm that the cell defining them still exists, otherwise this fails on a
# fresh kernel.
zs_classifier = ZeroShotClassifier(model=model)
zs_classifier.train(labels=label_names, descriptions=label_descriptions)

## Bonus: Building your own Zero-Shot Classifier

You can build a custom zero-shot classifier in a few lines of code!

Let's say we're interested in a small number of natural disasters mentioned in news headlines: earthquakes, wildfires and floods. <br>
We want our classifier to detect and classify these and label everything else as "OTHER".

To do this, we set our classifier up with embeddings of very simple label descriptions ("earthquake", "wildfire", "floods"):

In [15]:
# Natural-disaster classifier: the best label must reach a cosine similarity
# of at least 0.3, otherwise the input is labelled "OTHER".
my_classifier = ZeroShotClassifier(
    model=model,
    threshold=0.3,    
    null_label="OTHER"
)

# Labels and descriptions are aligned by position; each description is a
# single word that gets embedded by the encoder.
my_classifier.train(
    labels=["EARTHQUAKE", "WILDFIRE", "FLOODS"],
    descriptions=["earthquake", "wildfire", "floods"]
)

Let's apply the classifier to some examples:

In [16]:
# Classify five news headlines; the call is the cell's last expression, so the
# returned list of labels is displayed as the cell output.
my_classifier.predict([
    "Death toll from Hurricane Ida floods rises to 65 in US",
    "As California burns, some ecologists say it’s time to rethink forest management",
    "Maharashtra: Tremor in Kolhapur, no casualty",
    "Leaked Guntrader firearms data file shared. Worst case scenario?",
    "Taliban take control of last holdout in Panjshir Valley"
])

['FLOODS', 'WILDFIRE', 'EARTHQUAKE', 'OTHER', 'OTHER']

Results look good!

The test examples for `WILDFIRE` and `EARTHQUAKE` above demonstrate that we can correctly classify based on semantic proximity rather than literal word match.

This is not going to work perfectly in all cases! But it's a good start for 1 minute of effort. To improve this approach you can tweak the label descriptions and the threshold. 

You can also use this approach to mine candidate examples for each class you're interested in; after manual verification, these can be used to build a dataset of ground-truth examples.

#### Another example with fine-grained event types

In [17]:
# Rebinds `my_classifier` to a new classifier for two fine-grained event
# types, with a lower similarity threshold (0.2) than the previous example.
my_classifier = ZeroShotClassifier(
    model=model,
    threshold=0.2,    
    null_label="OTHER"
)

# Here the descriptions are short sentences rather than single words.
my_classifier.train(
    labels=["COMP-ACQUISITION", "STAKE-ACQUISITION"],
    descriptions=[
        "Company acquires other company",
        "Company buys stocks/stake in other company"
    ]
)

In [18]:
# Classify six business headlines; the returned labels are shown as the cell
# output.
my_classifier.predict([
    "Galetech Group buys majority stake in Optinergy",
    "SoftBank acquires minor stake in Deutsche Telekom in new 'long-term partnership'",
    "EQT buys stake in Sweden's Storytel, becomes second largest shareholder",
    "UK’s Digital 9 Infrastructure acquires Verne Global for €269.1M; here’s why",
    "French technology company Lectra acquires Gemini CAD systems",
    "Quercus buys Arcadia Books as Bielenberg named publisher",
])

['STAKE-ACQUISITION',
 'STAKE-ACQUISITION',
 'STAKE-ACQUISITION',
 'COMP-ACQUISITION',
 'COMP-ACQUISITION',
 'COMP-ACQUISITION']

In [25]:
import pandas as pd 

# Read the log file from a relative path, keep only the 'Time' and 'Log'
# columns, and limit the frame to its first 100 rows.
df = pd.read_csv('unlabeled.csv')[['Time', 'Log']].head(100)
# Display the first 10 rows as the cell output.
df.head(10)

Unnamed: 0,Time,Log
0,1940,"T451 no ATO doors at R30-2, 311."
1,1957,T507 A10-1 BPD hold for loud music
2,1957,"T371 no ATO doors at M16-1, 311."
3,2000,Medic10 and Medic16 checked out
4,2003,T507 released ATO. 2 min delay
5,2004,"T365 no ATO doors at M90-2, 311."
6,2022,T369 Y10-2 double dashes.\r\nNo call from Central
7,2033,"T223 no ATO doors at S20-2, 311."
8,2052,T445 R10-1 possible medical emergency.TO to ch...
9,2055,A99 is at R10 and checking on the patron.


In [26]:
# Rebinds `my_classifier` to a four-class classifier for the log entries
# loaded into `df`; entries whose best similarity is below 0.3 become "OTHER".
my_classifier = ZeroShotClassifier(
    model=model,
    threshold=0.3,    
    null_label="OTHER"
)

# Labels and descriptions are aligned by position.
my_classifier.train(
    labels=["Medical", "Police", 'Delays', 'Mechanical'],
    descriptions=[
        "medical emergency or injury",
        "police activity or disturbances",
        "delays or late departure",
        "Doors not working or mechanical failure"
    ]
)

# Classify every entry of the 'Log' column in one call.
pred = my_classifier.predict(
    df['Log'].tolist()
)
# NOTE(review): this adds a column to the `df` created by an earlier cell, so
# that cell's displayed output no longer reflects the frame's current columns.
df['Pred'] = pred
df.head(20)

Unnamed: 0,Time,Log,Pred
0,1940,"T451 no ATO doors at R30-2, 311.",Mechanical
1,1957,T507 A10-1 BPD hold for loud music,OTHER
2,1957,"T371 no ATO doors at M16-1, 311.",Mechanical
3,2000,Medic10 and Medic16 checked out,Medical
4,2003,T507 released ATO. 2 min delay,Delays
5,2004,"T365 no ATO doors at M90-2, 311.",Mechanical
6,2022,T369 Y10-2 double dashes.\r\nNo call from Central,OTHER
7,2033,"T223 no ATO doors at S20-2, 311.",Mechanical
8,2052,T445 R10-1 possible medical emergency.TO to ch...,Medical
9,2055,A99 is at R10 and checking on the patron.,OTHER


In [33]:
# Trains a sentence encoder with the denoising auto-encoder objective and
# saves it to 'output/tsdae-model'.
# NOTE(review): same code as the unexecuted cell under "Initializing
# Classifier"; consider keeping only one copy.
# NOTE(review): this rebinds the notebook-global `model`, which until here
# referred to the "paraphrase-mpnet-base-v2" encoder loaded in cell In [8].
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from torch.utils.data import DataLoader

import nltk
# presumably needed by the denoising dataset for tokenization — TODO confirm
nltk.download('punkt')

# Define your sentence transformer model using CLS pooling
model_name = 'bert-base-uncased'
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Define a list with sentences (1k - 100k sentences)
# NOTE(review): the four sentences below are placeholders, far fewer than the
# amount recommended in the comment above; replace them with real data.
train_sentences = ["Your set of sentences",
                   "Model will automatically add the noise", 
                   "And re-construct it",
                   "You should provide at least 1k sentences"]

# Create the special denoising dataset that adds noise on-the-fly
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

# DataLoader to batch your data
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Use the denoising auto-encoder loss
# The decoder is initialised from the same checkpoint name as the encoder.
train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)

# Call the fit method: one epoch, 'constantlr' scheduler, lr 3e-5, no weight decay
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True
)

# Persist the trained encoder under a relative path; cell In [34] loads it
# back from this location.
model.save('output/tsdae-model')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jason\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Whe

The following encoder weights were not tied to the decoder ['bert/pooler']


HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=1.0), HTML(value='')))





In [34]:
# Drop the name bound by the training cell above, then rebind `model` to the
# encoder saved at 'output/tsdae-model'.
# NOTE(review): classifiers created in earlier cells keep the model object
# they were constructed with; rebinding `model` here does not change them.
del model
device = "cpu" # set as "cuda" instead if you have a GPU set up
# loads from the relative path written by model.save() in the previous cell
model = SentenceTransformer("output/tsdae-model", device=device)