In [3]:
# Suppress every Python warning so that cell outputs stay uncluttered.
import warnings
warnings.simplefilter('ignore')

Imports & Config

In [1]:
import pandas as pd
from transformers import pipeline
import numpy as np
from transformers import BertTokenizer, Trainer, TrainingArguments,AutoModelForSequenceClassification
from datasets import Dataset
from sklearn.model_selection import train_test_split
import evaluate
import spacy
import torch
import yaml

# Load the run settings (GPU switch, pipeline/model names, batch sizes) from
# config.yaml in the working directory.
# The encoding is pinned to UTF-8 so the file is not decoded with the
# platform's locale codec (e.g. cp1252 on Windows).
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)


  from .autonotebook import tqdm as notebook_tqdm
W0708 23:00:30.259000 55984 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
# Report the PyTorch / CUDA environment the notebook is running in.
cuda_ready = torch.cuda.is_available()
for caption, value in (
    ("PyTorch version:", torch.__version__),
    ("CUDA version PyTorch was built with:", torch.version.cuda),
    ("Is CUDA available:", cuda_ready),
):
    print(caption, value)

# Device name and the configured GPU switch are only shown when CUDA is present.
if cuda_ready:
    print("CUDA device:", torch.cuda.get_device_name(0))
    print("GPU Opt: ", config['use_available_gpus'])

PyTorch version: 2.7.1+cu118
CUDA version PyTorch was built with: 11.8
Is CUDA available: True
CUDA device: Quadro P2000 with Max-Q Design
GPU Opt:  True


Read in Dataset

In [4]:
# Load the events table from 'Nat Cat Events.csv' (path is relative to the working directory).
events_df = pd.read_csv('Nat Cat Events.csv')

Only Consider Titles

In [5]:
# Keep only the 'title' column; the following steps work on titles alone.
titles = events_df['title']

### __Step 1__: Remove *duplicates*, *null values* & *surrounding whitespace*

In [6]:
# Remove repeated titles and renumber the index from 0.
titles = titles.drop_duplicates().reset_index(drop=True)

In [7]:
# Remove missing titles and renumber the index from 0.
titles = titles.dropna().reset_index(drop=True)

In [8]:
# Promote the title Series to a one-column DataFrame so further columns can be added.
titles_df = titles.to_frame()

In [9]:
# Trim leading/trailing whitespace from every title.
titles_df["title"] = titles_df["title"].str.strip()
# Stripping happens after de-duplication, so titles that differed only by
# surrounding whitespace are now identical, and whitespace-only titles are now
# empty strings. Remove both so Step 1 really leaves unique, non-empty titles.
titles_df = (
    titles_df[titles_df["title"] != ""]
    .drop_duplicates(subset="title")
    .reset_index(drop=True)
)

In [10]:
# Print a summary of the cleaned titles frame (entry count, non-null count, dtype, memory usage).
titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65158 entries, 0 to 65157
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   65158 non-null  object
dtypes: object(1)
memory usage: 509.2+ KB


### __Step 2__: Find articles containing a *location*.

Select NLP Pipeline depending on hardware availability


`en_core_web_trf`: Uses a Transformer for word & context embeddings


`en_core_web_sm`: Token-to-Vector (Tok2Vec) based pipeline that uses CNNs or LSTMs for word & context embeddings

See documentation [here](https://spacy.io/models/en#benchmarks)


In [None]:
# Choose the spaCy pipeline and batch size: the GPU variant is used only when
# CUDA is present AND the config enables it; otherwise fall back to the CPU variant.
use_gpu = bool(torch.cuda.is_available() and config['use_available_gpus'])

if not use_gpu:
    # CPU path
    nlp = spacy.load(config['ner_pipeline_cpu'])
    batch_size = config['batch_size_cpu']
    print("Using sm")
else:
    # GPU path: spaCy must be switched to the GPU before the pipeline is loaded.
    spacy.require_gpu()
    nlp = spacy.load(config['ner_pipeline_gpu'])
    batch_size = config['batch_size_gpu']
    print("Using trf (GPU Supported)")

Using trf (GPU Supported)


##### Find articles whose titles match the following criteria using *spaCy*, an *NLP* library, for *Named-Entity Recognition*
- Contains a geopolitical entity like a country, city or state (Entity GPE) OR
- Contains a Non-GPE location like a mountain or body of water (Entity LOC)

The entity labels come from the [OntoNotes5](https://catalog.ldc.upenn.edu/LDC2013T19) dataset, on which many English models are trained.

In [None]:
# Run the spaCy pipeline over every title in batches. `batch_size` is the value
# chosen together with the pipeline (GPU or CPU setting from the config).
docs = list(nlp.pipe(titles_df['title'].values, batch_size=batch_size))
# Flag a title when at least one recognised entity is a geopolitical entity (GPE)
# or a non-GPE location (LOC).
titles_df['has_location'] = [
    any(ent.label_ in ("GPE", "LOC") for ent in doc.ents)
    for doc in docs
]

In [None]:
# Release cached CUDA memory held by PyTorch.
# Guard on actual CUDA availability as well as the config switch (the same
# condition used when selecting the pipeline): torch.cuda.ipc_collect()
# initialises CUDA and raises on a CPU-only machine.
if torch.cuda.is_available() and config['use_available_gpus']:
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

In [14]:
# Persist the titles with their has_location flag (without the index) so the NER step
# does not have to be repeated.
titles_df.to_csv('titles_containing_locations.csv', index = False)

In [2]:
# Reload the cached NER result and keep only the titles that mention a location.
titles_df = pd.read_csv('titles_containing_locations.csv')
# Take an explicit copy: a later cell adds a column to this frame, and writing to a
# boolean-mask slice of titles_df triggers pandas' SettingWithCopy behaviour.
titles_location_df = titles_df[titles_df['has_location']].copy()

In [3]:
# Print a summary of the location-filtered titles (entry count, non-null counts, dtypes).
titles_location_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40989 entries, 0 to 65156
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         40989 non-null  object
 1   has_location  40989 non-null  bool  
dtypes: bool(1), object(1)
memory usage: 680.5+ KB


### __Step 3__: Use a *Zero-Shot-Classifier* to capture the confidence that a title implies a *natural catastrophe event*

In [None]:
# Single hypothesis label; each title is scored against it.
labels = ["natural catastrophe event that has occurred"]

# Pick the config keys and device index once, then build and run the pipeline in one place.
# The GPU settings are used only when CUDA is present AND the config enables it.
if torch.cuda.is_available() and config['use_available_gpus']:
    print("Using GPU")
    model_key, batch_key, device_index = 'zero_shot_model_gpu', 'batch_size_gpu', 0
else:
    print("Using CPU")
    model_key, batch_key, device_index = 'zero_shot_model_cpu', 'batch_size_cpu', -1

classifier = pipeline("zero-shot-classification", model=config[model_key], device=device_index)
results = classifier(
    titles_location_df['title'].tolist(),
    candidate_labels=labels,
    batch_size=config[batch_key],
)

Using GPU




In [None]:
# Collect the first score of every classifier result and store it as a new column.
first_scores = []
for prediction in results:
    first_scores.append(prediction['scores'][0])
titles_location_df['zero_shot_score'] = first_scores

In [None]:
# Persist the location-filtered titles together with their zero-shot scores (without the index).
titles_location_df.to_csv('titles_zero_shot.csv', index=False)

In [7]:
# Drop the references to the zero-shot pipeline and its raw results so their memory
# can be reclaimed.
del classifier
del results
# Release cached CUDA memory. Guard on actual CUDA availability as well as the config
# switch: torch.cuda.ipc_collect() initialises CUDA and raises on a CPU-only machine.
if torch.cuda.is_available() and config['use_available_gpus']:
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

#### Analyse the Distribution of scores

In [None]:
# Histogram (100 bins) of the zero-shot scores. Title and axis labels are set so the
# figure can be read on its own; the trailing ';' suppresses the Axes repr in the output.
ax = titles_location_df['zero_shot_score'].hist(bins=100)
ax.set(
    title="Distribution of zero-shot scores",
    xlabel="zero_shot_score",
    ylabel="Number of titles",
);