# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import pandas as pd

from constants import PROJECT_ROOT
from utils import read_en_humanitarian_data as read_human_data
from utils import read_en_informativeness_data as read_info_data

# Dataset directory: <PROJECT_ROOT>/data/all_data_en.
# NOTE(review): not referenced by the cells visible here — the readers below are
# called without arguments — so confirm whether anything still uses it.
DATA_PATH = PROJECT_ROOT / "data" / "all_data_en"

# Data Loading

In [None]:
""" READ DATA """
# Each reader is unpacked into exactly three objects, bound in train / dev / test order.
# NOTE(review): presumably pandas DataFrames (later cells call .shape, .copy() and
# .head() on them) — confirm against utils.
humanitarian_train_df, humanitarian_dev_df, humanitarian_test_df = read_human_data()

# Same three-way unpacking for the informativeness task.
informativeness_train_df, informativeness_dev_df, informativeness_test_df = read_info_data()

In [None]:
""" ENSURE CORRECT SPLIT PROPORTIONS"""
def _report_split_proportions(title, train_df, dev_df, test_df):
    """Print each split's share of the dataset and return the total row count.

    Args:
        title: Dataset name used in the "<title> splits:" header line.
        train_df: Training split (any object supporting ``len``).
        dev_df: Development split.
        test_df: Test split.

    Returns:
        The combined number of rows across the three splits.

    Raises:
        ValueError: If all three splits are empty, because no proportion
            can be computed.
    """
    split_sizes = {"Train": len(train_df), "Dev": len(dev_df), "Test": len(test_df)}
    total_rows = sum(split_sizes.values())
    if total_rows == 0:
        # Fail with an explicit message instead of a bare ZeroDivisionError.
        raise ValueError(f"{title} dataset is empty; cannot compute split proportions.")

    print(f"{title} splits:")
    print("------------------------")
    # Join the lines explicitly: passing strings that end in "\n" as separate
    # print() arguments makes the default sep=" " indent every line after the first.
    report_lines = [
        f"{name} split: {size / total_rows * 100:.1f}%" for name, size in split_sizes.items()
    ]
    print("\n".join(report_lines))
    return total_rows


len_humanitarian = _report_split_proportions(
    "Humanitarian", humanitarian_train_df, humanitarian_dev_df, humanitarian_test_df
)

print()  # blank line between the two reports
len_informativeness = _report_split_proportions(
    "Informativeness", informativeness_train_df, informativeness_dev_df, informativeness_test_df
)

# Exploratory Data Analysis

### Humanitarian

In [None]:
# Explore a copy so the EDA cells below operate on their own object rather than on
# humanitarian_train_df itself (assuming a pandas DataFrame, whose .copy() is deep by default).
eda_human_train_df = humanitarian_train_df.copy()

In [None]:
# Preview the first five rows; as the cell's last expression it is rendered as output.
eda_human_train_df.head(5)

In [None]:
# Pie chart of the "event" column in the humanitarian training copy: the N most
# frequent events each get a wedge, everything else is pooled into "Other".
top_events_n_human = 10

# value_counts() orders by descending frequency, so a positional cut at N
# separates the most frequent events from the tail.
event_counts_human = eda_human_train_df["event"].value_counts()
top_events_human = event_counts_human.iloc[:top_events_n_human]
other_sum_human = event_counts_human.iloc[top_events_n_human:].sum()

other_events_human = pd.Series({"Other": other_sum_human})
clipped_events = pd.concat([top_events_human, other_events_human])

# One Set3 colour per wedge, selected by integer index.
event_colors_human = plt.cm.Set3(range(len(clipped_events)))

event_pie_kwargs_human = {
    "colors": event_colors_human,
    "textprops": {"fontsize": 12, "fontweight": "bold"},
    "autopct": "%1.1f%%",
    "figsize": (10, 10),
    "legend": False,
    "ylabel": "",
}
ax = clipped_events.plot.pie(**event_pie_kwargs_human)

event_title_human = f"Top {top_events_n_human} Events + Other (Humanitarian)"
# Trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_title(event_title_human, fontsize=20, fontweight="bold", pad=20);

In [None]:
# Pie chart of the "class_label" column in the humanitarian training copy: the N
# most frequent labels each get a wedge, the rest are pooled into "Other".
top_labels_n_human = 7

# value_counts() orders by descending frequency, so a positional cut at N
# separates the most frequent labels from the tail.
label_counts_human = eda_human_train_df["class_label"].value_counts()
top_labels_human = label_counts_human.iloc[:top_labels_n_human]
other_labels_human = label_counts_human.iloc[top_labels_n_human:].sum()

other_label_row_human = pd.Series({"Other": other_labels_human})
clipped_labels_human = pd.concat([top_labels_human, other_label_row_human])

# One Set3 colour per wedge, selected by integer index.
label_colors_human = plt.cm.Set3(range(len(clipped_labels_human)))

label_pie_kwargs_human = {
    "colors": label_colors_human,
    "textprops": {"fontsize": 12, "fontweight": "bold"},
    "autopct": "%1.1f%%",
    "figsize": (10, 10),
    "legend": False,
    "ylabel": "",
}
ax = clipped_labels_human.plot.pie(**label_pie_kwargs_human)

label_title_human = f"Top {top_labels_n_human} Label Values + Other (Humanitarian)"
# Trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_title(label_title_human, fontsize=20, fontweight="bold", pad=20);

### Informativeness

In [None]:
# Explore a copy so the EDA cells below operate on their own object rather than on
# informativeness_train_df itself (assuming a pandas DataFrame, whose .copy() is deep by default).
eda_info_train_df = informativeness_train_df.copy()

In [None]:
# Preview the first five rows; as the cell's last expression it is rendered as output.
eda_info_train_df.head(5)

In [None]:
# Pie chart of the "event" column in the informativeness training copy: the N most
# frequent events each get a wedge, everything else is pooled into "Other".
top_events_n_info = 10

# value_counts() orders by descending frequency, so a positional cut at N
# separates the most frequent events from the tail.
event_counts_info = eda_info_train_df["event"].value_counts()
top_events_info = event_counts_info.iloc[:top_events_n_info]
other_sum_info = event_counts_info.iloc[top_events_n_info:].sum()

other_events_info = pd.Series({"Other": other_sum_info})
clipped_events_info = pd.concat([top_events_info, other_events_info])

# One Set3 colour per wedge, selected by integer index.
event_colors_info = plt.cm.Set3(range(len(clipped_events_info)))

event_pie_kwargs_info = {
    "colors": event_colors_info,
    "textprops": {"fontsize": 12, "fontweight": "bold"},
    "autopct": "%1.1f%%",
    "figsize": (10, 10),
    "legend": False,
    "ylabel": "",
}
ax = clipped_events_info.plot.pie(**event_pie_kwargs_info)

event_title_info = f"Top {top_events_n_info} Events + Other (Informativeness)"
# Trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_title(event_title_info, fontsize=20, fontweight="bold", pad=20);

In [None]:
# Pie chart of the "class_label" column in the informativeness training copy.
# Every label gets its own wedge; no "Other" bucket is built here.
label_counts_info = eda_info_train_df["class_label"].value_counts()

# One Set3 colour per label, selected by integer index.
label_colors_info = plt.cm.Set3(range(len(label_counts_info)))

label_pie_kwargs_info = {
    "colors": label_colors_info,
    "textprops": {"fontsize": 16, "fontweight": "bold"},
    "autopct": "%1.1f%%",
    "figsize": (10, 10),
    "legend": False,
    "ylabel": "",
}
ax = label_counts_info.plot.pie(**label_pie_kwargs_info)

label_title_info = "Label Values (Informativeness)"
# Trailing semicolon keeps the notebook from echoing the returned Text object.
ax.set_title(label_title_info, fontsize=20, fontweight="bold", pad=20);