# Imports

In [None]:
%load_ext autoreload
%autoreload 2

import re

from constants import PROJECT_ROOT
from utils import read_en_humanitarian_data as read_human_data
from utils import read_en_informativeness_data as read_info_data

# Directory under the project root used as the base for data paths in this
# notebook (the cleaned CSVs are written to a subdirectory of it below).
DATA_PATH = PROJECT_ROOT / "data"

# Data Cleaning

### Cleaning Functions

In [None]:
def clean_text(text: str) -> str:
    """Normalize a raw tweet-style string.

    The steps run in this order:

    1. Lowercase the text.
    2. Remove URLs (any token starting with ``http``).
    3. Remove ``@mentions``.
    4. Remove ``#`` signs while keeping the hashtag word itself.
    5. Remove the standalone retweet marker ``rt`` and the whitespace after it.
    6. Drop every non-ASCII character.
    7. Collapse whitespace runs to a single space and trim both ends.
    8. Strip a leading ``": "``, which is what remains of ``"rt @user: ..."``
       once the marker and the mention have been removed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased, ASCII-only string (possibly empty).
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = text.replace("#", "")
    # The leading \b keeps the match to the standalone token "rt"; without it
    # the pattern also ate the tail of ordinary words such as "start " or
    # "support ", gluing them onto the following word.
    text = re.sub(r"\brt\s+", "", text)
    text = text.encode("ascii", errors="ignore").decode()
    text = re.sub(r"\s+", " ", text).strip()
    if text.startswith(": "):
        text = text[2:]
    return text

### Humanitarian

In [None]:
# Load the humanitarian data for the "all_data_en" dataset and unpack the
# three returned objects as train / dev / test.
# NOTE(review): presumably pandas DataFrames with a "text" column (the next
# cell indexes that column) -- confirm against utils.read_en_humanitarian_data.
human_train_df, human_dev_df, human_test_df = read_human_data(dataset="all_data_en")

In [None]:
# Run clean_text over the "text" column of each humanitarian split,
# overwriting the column in place.
for human_split_df in (human_train_df, human_dev_df, human_test_df):
    human_split_df["text"] = human_split_df["text"].apply(clean_text)

### Informativeness

In [None]:
# Load the informativeness data for the "all_data_en" dataset and unpack the
# three returned objects as train / dev / test.
# NOTE(review): presumably pandas DataFrames with a "text" column (the next
# cell indexes that column) -- confirm against utils.read_en_informativeness_data.
info_train_df, info_dev_df, info_test_df = read_info_data(dataset="all_data_en")

In [None]:
# Run clean_text over the "text" column of each informativeness split,
# overwriting the column in place.
for info_split_df in (info_train_df, info_dev_df, info_test_df):
    info_split_df["text"] = info_split_df["text"].apply(clean_text)

# Save Clean Data

In [None]:
import os

splits = ["train", "dev", "test"]

CLEAN_DATA_PATH = DATA_PATH / "clean_en_data"

# Make sure the output directory exists; an already-existing one is fine.
os.makedirs(CLEAN_DATA_PATH, exist_ok=True)

# Frames are listed in the same order as `splits` so zip pairs them correctly.
human_frames = [human_train_df, human_dev_df, human_test_df]
info_frames = [info_train_df, info_dev_df, info_test_df]

# Write the humanitarian splits first, one CSV per split, without the index.
for split, human_df in zip(splits, human_frames):
    human_csv = CLEAN_DATA_PATH / f"human_{split}.csv"
    human_df.to_csv(human_csv, index=False)

# Then the informativeness splits, using the same naming scheme.
for split, info_df in zip(splits, info_frames):
    info_csv = CLEAN_DATA_PATH / f"info_{split}.csv"
    info_df.to_csv(info_csv, index=False)