# In [13]:
import re

def convert_to_lower(match_obj):
    """
    Return the lowercased text of a regex match.

    Intended to be passed as the replacement callback to ``re.sub`` /
    ``Pattern.sub``: it receives the match object and returns the string
    that should replace the matched span.
    Code reference: https://pynative.com/python-regex-replace-re-sub/
    """
    matched_text = match_obj.group()
    # Guard kept from the original implementation: nothing to lowercase
    # if the whole-match group is missing.
    return None if matched_text is None else matched_text.lower()


# Patterns are compiled once at import time because clean_text() is typically
# called once per document. All of them assume the text is already lowercased.

# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r'\s+')

# http(s) URLs, including query strings; the final character class keeps
# sentence punctuation that directly follows a URL out of the match.
_URL_RE = re.compile(r'https?://[^\s<>]*[^\s<>.,;:!?)\]}]')

# Email addresses: local part may contain digits, dots, underscores etc.;
# the domain needs at least one dot and never ends on a dot.
_EMAIL_RE = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9\-]+(?:\.[a-z0-9\-]+)+')

# Dates such as 2018-01-25 (first hyphen optional, as before), optionally
# followed by a time such as " 16:17", " 16:17:44" or " 16:17:44.789555"
# (separator may also be a lowercased ISO "t").
_DATE_RE = re.compile(
    r'[0-9]{4}-?[0-9]{2}-[0-9]{2}'
    r'(?:[ t][0-9]{1,2}:[0-9]{2}(?::[0-9]{2}(?:\.[0-9]+)?)?)?'
)

# Integers and floats; the decimal point is only consumed when digits follow,
# so a sentence-ending period is left in place.
_NUM_RE = re.compile(r'[0-9]+(?:\.[0-9]+)?')


def clean_text(text: str) -> str:
    """
    Normalize a raw text string for downstream processing.

    The text is lowercased (including non-ASCII letters), every run of
    whitespace (spaces, tabs, new lines) is collapsed to a single space,
    and URLs, emails, dates and numbers are replaced by the placeholders
    <URL>, <EMAIL>, <DATE> and <NUM>.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    # str.lower() handles the full Unicode range, not only A-Z.
    text = text.lower()

    # Collapse spaces, tabs and new lines into single spaces.
    text = _WHITESPACE_RE.sub(' ', text)

    # Order matters: URLs go first so a URL containing '@' is not split by
    # the email pattern; emails and dates go before numbers so their digits
    # are not turned into <NUM> prematurely.
    text = _URL_RE.sub("<URL>", text)
    text = _EMAIL_RE.sub("<EMAIL>", text)
    text = _DATE_RE.sub("<DATE>", text)
    text = _NUM_RE.sub("<NUM>", text)

    return text

# In [45]:
import pandas as pd

import os

# Load the news_sample.csv file from the FakeNewsCorpus git source.
df = pd.read_csv('https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv')

# Clean up the text in the 'content' column. Series.apply runs clean_text on
# every value directly, so it does not depend on the index labels being
# exactly 0..n-1 the way a range(len(...)) + df.at[i, ...] loop does.
df['content'] = df['content'].apply(clean_text)

# Save the cleaned data to a csv file. to_csv does not create missing
# directories, so make sure the target directory exists first.
os.makedirs("data", exist_ok=True)
df.to_csv("data/news_sample_cleaned.csv")