# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import pandas as pd

from constants import PROJECT_ROOT

# Directory holding the TSV split files read in the data-loading cell,
# built from PROJECT_ROOT (imported from the project's `constants` module).
DATA_PATH = PROJECT_ROOT / "data" / "all_data_en"

# Data Loading

In [None]:
""" READ DATA """
# File-name stems of the two datasets; each has a _train/_dev/_test TSV file.
humanitarian_data = "crisis_consolidated_humanitarian_filtered_lang_en"
informativeness_data = "crisis_consolidated_informativeness_filtered_lang_en"

# Parser settings shared by every split: tab-separated, Python parsing engine,
# and rows that cannot be parsed are skipped instead of raising.
tsv_read_options = {"sep": "\t", "engine": "python", "on_bad_lines": "skip"}
split_names = ("train", "dev", "test")

# Humanitarian task: read train, dev and test in that order.
humanitarian_train_df, humanitarian_dev_df, humanitarian_test_df = (
    pd.read_csv(DATA_PATH / f"{humanitarian_data}_{split}.tsv", **tsv_read_options)
    for split in split_names
)

# Informativeness task: same three splits, same parser settings.
informativeness_train_df, informativeness_dev_df, informativeness_test_df = (
    pd.read_csv(DATA_PATH / f"{informativeness_data}_{split}.tsv", **tsv_read_options)
    for split in split_names
)

In [None]:
""" ENSURE CORRECT SPLIT PROPORTIONS"""
# Total row count over the three humanitarian splits; denominator for the
# percentages printed below.
len_humanitarian = len(humanitarian_train_df) + len(humanitarian_dev_df) + len(humanitarian_test_df)

print("Humanitarian splits:")
print("------------------------")
# sep="": the first two strings already end in "\n", so print's default " "
# separator would put a stray leading space on the Dev and Test lines.
print(
    f"Train split: {humanitarian_train_df.shape[0] / len_humanitarian * 100:.1f}%\n",
    f"Dev split: {humanitarian_dev_df.shape[0] / len_humanitarian * 100:.1f}%\n",
    f"Test split: {humanitarian_test_df.shape[0] / len_humanitarian * 100:.1f}%",
    sep="",
)

# Total row count over the three informativeness splits.
len_informativeness = (
    len(informativeness_train_df) + len(informativeness_dev_df) + len(informativeness_test_df)
)

print("\nInformativeness splits:")
print("------------------------")
# Same sep="" fix as above so the three lines are left-aligned.
print(
    f"Train split: {informativeness_train_df.shape[0] / len_informativeness * 100:.1f}%\n",
    f"Dev split: {informativeness_dev_df.shape[0] / len_informativeness * 100:.1f}%\n",
    f"Test split: {informativeness_test_df.shape[0] / len_informativeness * 100:.1f}%",
    sep="",
)

# Exploratory Data Analysis

In [None]:
# Work on a separate (deep) copy of the humanitarian training frame for the EDA cells.
eda_human_train_df = humanitarian_train_df.copy(deep=True)

In [None]:
# Preview the first five rows of the EDA frame.
eda_human_train_df.head(n=5)

In [None]:
# Number of most frequent events that get their own slice; all remaining
# events are pooled into a single "Other" slice.
top_events_n = 10

# value_counts() returns counts sorted from most to least frequent, so the
# first top_events_n entries are the largest ones.
event_counts = eda_human_train_df["event"].value_counts()
top_events = event_counts.iloc[:top_events_n]
other_sum = event_counts.iloc[top_events_n:].sum()

other_events_slice = pd.Series({"Other": other_sum})
clipped_events = pd.concat([top_events, other_events_slice])

# One colour per slice, taken from Matplotlib's Set3 colormap.
event_colors = plt.cm.Set3(range(len(clipped_events)))

event_pie_options = {
    "autopct": "%1.1f%%",
    "figsize": (10, 10),
    "ylabel": "",
    "legend": False,
    "colors": event_colors,
    "textprops": {"fontsize": 12, "fontweight": "bold"},
}
ax = clipped_events.plot.pie(**event_pie_options)

# Trailing semicolon keeps the notebook from echoing set_title's return value.
events_title = f"Top {top_events_n} Events + Other"
ax.set_title(events_title, fontsize=18, fontweight="bold", pad=20);

In [None]:
# Number of most frequent class labels that get their own slice; all
# remaining labels are pooled into a single "Other" slice.
top_labels_n = 7

# value_counts() returns counts sorted from most to least frequent, so the
# first top_labels_n entries are the largest ones.
label_counts = eda_human_train_df["class_label"].value_counts()
top_labels = label_counts.iloc[:top_labels_n]
other_labels = label_counts.iloc[top_labels_n:].sum()

other_labels_slice = pd.Series({"Other": other_labels})
clipped_labels = pd.concat([top_labels, other_labels_slice])

# One colour per slice, taken from Matplotlib's Set3 colormap.
label_colors = plt.cm.Set3(range(len(clipped_labels)))

label_pie_options = {
    "autopct": "%1.1f%%",
    "figsize": (10, 10),
    "ylabel": "",
    "legend": False,
    "colors": label_colors,
    "textprops": {"fontsize": 12, "fontweight": "bold"},
}
ax = clipped_labels.plot.pie(**label_pie_options)

# Trailing semicolon keeps the notebook from echoing set_title's return value.
labels_title = f"Top {top_labels_n} Values + Other"
ax.set_title(labels_title, fontsize=18, fontweight="bold", pad=20);