## Imports

In [2]:
!pip install -q -U emoji pandas wordsegment

[33m  DEPRECATION: emoji is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559[0m[33m
[0m

In [1]:
import re
from pathlib import Path

import emoji
import pandas as pd
from wordsegment import load, segment

In [2]:
# Initialise wordsegment once, up front: preprocess_input() below relies on
# segment(), and the library expects load() to have been called first
# (presumably it reads the bundled word-frequency data — see wordsegment docs).
load()

## Auxiliary functions

In [3]:
def preprocess_input(text: str) -> str:
    """Normalise a raw tweet and return the cleaned text.

    Steps, applied in this order:

    1. Surround every ``/`` with spaces so slash-joined words become
       separate tokens.
    2. Replace each emoji with its textual name via ``emoji.demojize``,
       using spaces as delimiters.
    3. Replace every underscore with a space.
    4. Replace each hashtag (``#`` followed by word characters) with the
       space-joined words returned by ``segment``.
    5. Lower-case the result.

    ``wordsegment.load()`` is expected to have been called before this
    function is used (the notebook does so right after its imports).

    Args:
        text: The raw tweet text.

    Returns:
        The preprocessed text.
    """
    text = re.sub("/", " / ", text)  # split slashes
    text = emoji.demojize(text, delimiters=(" ", " "))  # change emoji
    text = re.sub("_", " ", text)  # split underscores

    def _segment_hashtag(match):
        """Return the words of one matched hashtag, joined by spaces."""
        return " ".join(segment(match.group(0)))

    # Split hashtags in a single pass, each match replaced in place. A
    # per-hashtag ``str.replace`` would also rewrite the start of any longer
    # hashtag sharing the same prefix (e.g. "#love" inside "#lovely"), leaving
    # the longer one unsegmented.
    text = re.sub(r"#\w+", _segment_hashtag, text)

    return text.lower()  # lower text

## Process data

In [4]:
# Directory holding the input CSVs; the preprocessed copies are written here too.
# Use Path('.') instead if the CSV files sit next to the notebook.
DATA_FOLDER = Path('datasets')

# Fail early, with a readable message, if the folder is missing or is not a
# directory (it is joined with file names in the cells below).
assert DATA_FOLDER.is_dir(), f"Data folder not found: {DATA_FOLDER.resolve()}"

In [5]:
# Load each raw split exactly once. These frames are left unmodified so they
# can still be inspected after the preprocessed copies have been written.
df_train = pd.read_csv(DATA_FOLDER / 'train.csv')
df_val = pd.read_csv(DATA_FOLDER / 'val.csv')
df_test = pd.read_csv(DATA_FOLDER / 'test.csv')

# For every split, write `<split>_preprocessed.csv` into DATA_FOLDER with the
# `tweet` column passed through preprocess_input; other columns are unchanged.
for split_name, df_raw in [('train', df_train), ('val', df_val), ('test', df_test)]:
    # assign() returns a new frame, so the raw frame is not mutated.
    df_preprocessed = df_raw.assign(tweet=df_raw['tweet'].apply(preprocess_input))
    df_preprocessed.to_csv(DATA_FOLDER / f'{split_name}_preprocessed.csv', index=False)