In [None]:
# Filter warnings for readability
import warnings
warnings.filterwarnings('ignore')

Imports

In [None]:
import pandas as pd
from transformers import pipeline
import torch
from wordcloud import WordCloud
import yaml

# Load the notebook settings from config.yaml into `config`.
# The encoding is pinned so parsing does not depend on the platform's default locale.
with open("config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

Read Preprocessed Data

In [None]:
# Load the preprocessed titles; the `title` column is what gets classified below.
preprocessed_df = pd.read_csv('titles_preprocessed.csv')

In [None]:
# Preview the first rows of the loaded data.
preprocessed_df.head()

In [None]:
# Show column dtypes and non-null counts to check the data loaded as expected.
preprocessed_df.info()

### __Step 1__: Build a *Multi-Class Zero-Shot Classifier*

In [None]:
# Candidate event types the zero-shot model chooses between for each title.
labels = ["Earthquake", "Floods", "Volcano", "Tornado", "Wildfire"]

# Pick the hardware-specific settings once, then build and run a single
# pipeline: the GPU path uses device 0 with batches of 32, the CPU path uses
# device -1 with batches of 8, each with its own model name from `config`.
use_gpu = torch.cuda.is_available()
print("Using GPU" if use_gpu else "Using CPU")

model_key = "zero_shot_model_gpu" if use_gpu else "zero_shot_model_cpu"
device_id = 0 if use_gpu else -1
titles_per_batch = 32 if use_gpu else 8

classifier = pipeline("zero-shot-classification", model=config[model_key], device=device_id)
# NOTE: `multi_label` is not passed, so the pipeline's default scoring mode applies.
results = classifier(preprocessed_df["title"].tolist(), candidate_labels=labels, batch_size=titles_per_batch)

Example Result

In [None]:
# Inspect the raw pipeline output for the first title; its "labels" and
# "scores" entries are what the parsing cell further down reads.
results[0]

In [None]:
# Release the model so its memory can be reclaimed. `results` is deliberately
# kept: the following cell parses it into the DataFrame columns.
del classifier

# The CUDA housekeeping calls only apply when a GPU is present; guarding them
# keeps the CPU path from touching a CUDA runtime that does not exist.
if torch.cuda.is_available():
    torch.cuda.empty_cache()   # hand cached, unused GPU memory back to the driver
    torch.cuda.ipc_collect()   # clean up GPU memory released by CUDA IPC

In [None]:
# Keep only the first label/score pair from each prediction.
# NOTE(review): assumes the pipeline returns labels ordered by descending
# score, so index 0 is the best match — confirm against the transformers docs.
parsed = [
    dict(
        natural_catastrophe_event=prediction["labels"][0],
        zero_shot_score=prediction["scores"][0],
    )
    for prediction in results
]

In [None]:
# Attach the top label and its score to the titles. Building the frame on
# preprocessed_df's own index pairs row i of `parsed` with row i of the titles
# regardless of the index labels, and raises immediately if the lengths differ
# instead of silently filling the new columns with NaN.
top_predictions = pd.DataFrame(parsed, index=preprocessed_df.index)
preprocessed_df[["natural_catastrophe_event", "zero_shot_score"]] = top_predictions

In [None]:
# Preview the data with the new natural_catastrophe_event and zero_shot_score columns.
preprocessed_df.head()

### Post-Modelling Analysis

In [None]:
# Count how many titles were assigned to each predicted event type.
preprocessed_df['natural_catastrophe_event'].value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for this figure.
sns.set(style="whitegrid")

# Bars follow the value_counts() ordering of the predicted events.
event_order = preprocessed_df["natural_catastrophe_event"].value_counts().index

# Bar chart: number of titles per predicted event type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(
    data=preprocessed_df,
    x="natural_catastrophe_event",
    order=event_order,
    palette="Set2",
    ax=ax,
)

# Titles, axis labels and tilted tick labels so the event names do not overlap.
ax.set_title("Distribution of Predicted Natural Catastrophe Events", fontsize=16)
ax.set_xlabel("Event Type", fontsize=12)
ax.set_ylabel("Number of Articles", fontsize=12)
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for this figure.
sns.set(style="whitegrid")

# Box plot: spread of the top zero-shot score within each predicted event type.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(
    data=preprocessed_df,
    x="natural_catastrophe_event",
    y="zero_shot_score",
    palette="Set3",
    ax=ax,
)

# Titles, axis labels and tilted tick labels so the event names do not overlap.
ax.set_title("Zero-Shot Score Distribution by Natural Catastrophe Event", fontsize=16)
ax.set_xlabel("Natural Catastrophe Event", fontsize=12)
ax.set_ylabel("Zero-Shot Score", fontsize=12)
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Density of the top zero-shot score across all titles.
plt.figure(figsize=(10, 6))
# `fill=True` is the supported spelling of the deprecated `shade=True`.
sns.kdeplot(preprocessed_df["zero_shot_score"], fill=True, color="purple")

plt.title("Density of Zero-Shot Classification Scores", fontsize=16)
plt.xlabel("Zero-Shot Score", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Titles whose top score falls below this value are treated as low confidence.
# The same constant drives both the filter and the plot title.
LOW_SCORE_THRESHOLD = 0.4

# Select the low-confidence titles with a single .loc lookup; missing titles
# are dropped so they do not show up in the cloud as the literal word "nan".
low_score_mask = preprocessed_df["zero_shot_score"] < LOW_SCORE_THRESHOLD
low_score_titles = preprocessed_df.loc[low_score_mask, "title"].dropna()

# Combine all titles into one string
text_blob = " ".join(low_score_titles.astype(str).tolist())

if text_blob.strip():
    # Generate word cloud
    wordcloud = WordCloud(width=1000, height=500, background_color="white", colormap="viridis").generate(text_blob)

    # Plot
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud of Titles with Zero-Shot Score < {LOW_SCORE_THRESHOLD}", fontsize=16)
    plt.tight_layout()
    plt.show()
else:
    # WordCloud.generate raises on empty text, so report instead of crashing
    # when no title falls below the threshold.
    print(f"No titles with zero-shot score < {LOW_SCORE_THRESHOLD}; nothing to plot.")
