# Data Preprocessing

This notebook assumes that data have been collected by running `notebooks/01-scrape_data.ipynb`.

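Run these commands before running the following cells (the second one downloads the NLTK stop-word list used in the cleaning step):
```
python -m spacy download en_core_web_sm
python -m nltk.downloader stopwords
```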

In [17]:
import re
import string
from pathlib import Path

import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [18]:
# Directory layout: raw inputs are read from data/raw, outputs go to data/processed.
DATA_DIR = Path("../data/")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

# Create the output directory (and any missing parents) if it does not exist yet.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# English stop words from NLTK, kept in a set for membership tests during cleaning.
stop_words = set(stopwords.words("english"))

Read data and then convert labels into integers.

In [19]:
# Integer class id assigned to each subreddit name.
# Names missing from this mapping become NaN in the "label" column.
LABEL_BY_SUBREDDIT = {
    "MachineLearning": 0,
    "LearnMachineLearning": 1,
}

df = pd.read_csv(RAW_DATA_DIR / "reddit_posts.csv")
df["label"] = df["subreddit_name"].map(LABEL_BY_SUBREDDIT)

Clean the text: merge title and body, strip tags, URLs and punctuation, lowercase, and remove stop words.

In [20]:
# Translation table mapping every ASCII punctuation character to a space.
# Built once at definition time instead of on every call.
_PUNCT_TO_SPACE = str.maketrans(
    string.punctuation, " " * len(string.punctuation)
)


def clean_text(post, stopword_set=None):
    """Build a normalized text string from a post's title and body.

    Steps: remove upper-case bracket tags (e.g. "[D]") from the title, join
    title and selftext, remove URLs, replace punctuation with spaces,
    lowercase, and drop stop words. Tokens are joined by single spaces.

    Args:
        post: Mapping-like object (e.g. a DataFrame row or a dict) with
            string values under the "title" and "selftext" keys. It is
            not modified.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the notebook-level ``stop_words``.

    Returns:
        The cleaned text; an empty string if no tokens remain.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Remove tags; use a local variable so the caller's row is left untouched
    title = re.sub(r"\[[A-Z]+\]", "", post["title"])

    # Concatenate title and selftext
    text = title + " " + post["selftext"]

    # Remove URLs
    text = re.sub(r"http\S+", "", text)

    # Replace punctuation with space and transform to lowercase
    text = text.translate(_PUNCT_TO_SPACE).lower()

    # str.split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no separate collapse/strip step is needed.
    return " ".join(word for word in text.split() if word not in stopword_set)

In [21]:
# Build the cleaned frame as one non-mutating chain:
#   1. fill missing values only in the two text columns that clean_text reads,
#      so other columns (e.g. "label") are not rewritten to "";
#   2. compute the cleaned "text" column row by row;
#   3. keep only rows whose cleaned text is non-empty;
#   4. drop the columns that are no longer needed.
# Avoiding inplace=True on the filtered frame also avoids a possible
# SettingWithCopyWarning.
df = (
    df.fillna({"title": "", "selftext": ""})
    .assign(text=lambda frame: frame.apply(clean_text, axis=1))
    .loc[lambda frame: frame["text"] != ""]
    .drop(columns=["created_utc", "title", "selftext", "subreddit_name"])
)

Split the dataset into train (90%) and test (10%) sets, stratified by label.

In [22]:
# Hold out 10% of the rows as the test set, stratifying on the label column
# and fixing the seed so the split is reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["label"],
    random_state=45,
)

Save processed data.

In [23]:
# Write each split to the processed-data directory without the index column.
for split_df, file_name in ((train_df, "train.csv"), (test_df, "test.csv")):
    split_df.to_csv(PROCESSED_DATA_DIR / file_name, index=False)