In [1]:
import re

def convert_to_lower(match_obj):
    """
    Callback for ``re.sub``: return the matched text in lower case.

    ``match_obj`` is the match object handed over by ``re.sub``. When the
    whole-match group is ``None`` the function returns ``None``.
    Code reference: https://pynative.com/python-regex-replace-re-sub/
    """
    matched_text = match_obj.group()
    if matched_text is None:
        return None
    return matched_text.lower()


# Patterns are compiled once at import time instead of on every clean_text call.
# They are written for text that has already been lower-cased.

# Any run of white space: spaces, tabs, new lines, carriage returns, ...
_WHITESPACE_RE = re.compile(r'\s+')

# A URL must start with "http://" or "https://" so ordinary words that merely
# begin with "http" are left alone. The last character may not be sentence
# punctuation, so "see https://a.b/c." keeps its final period.
_URL_RE = re.compile(r'''https?://[^\s<>"']*[^\s<>"'.,;:!?()\[\]{}]''')

# The local part needs at least one character and may contain digits, dots,
# underscores, etc.; the domain needs at least one ".label" suffix.
_EMAIL_RE = re.compile(r'[a-z0-9._%+\-]+@[a-z0-9\-]+(?:\.[a-z0-9\-]+)+')

# yyyy-mm-dd (first hyphen optional, as before), optionally followed by a
# time of day "hh:mm[:ss[.ffffff]]". The separating space is only consumed
# when a time actually follows.
_DATE_RE = re.compile(
    r'[0-9]{4}-?[0-9]{2}-[0-9]{2}'
    r'(?: ?[0-9]{1,2}:[0-9]{2}(?::[0-9]{2}(?:\.[0-9]+)?)?)?'
)

# Integers and floats; a dot is only part of the number when digits follow it.
_NUM_RE = re.compile(r'[0-9]+(?:\.[0-9]+)?')


def clean_text(text: str) -> str:
    """
    Normalise a raw text string for further processing.

    The text is lower-cased (including non-ASCII letters), every run of white
    space (spaces, tabs, new lines) is collapsed into a single space, and
    URLs, emails, dates and numbers are replaced by the placeholders <URL>,
    <EMAIL>, <DATE> and <NUM>. Punctuation that follows a replaced item
    (e.g. the period ending a sentence) is preserved.

    Parameters:
        text: the raw input string.

    Returns:
        The cleaned string. Leading/trailing white space is collapsed to a
        single space, not stripped.

    Raises:
        AttributeError: if ``text`` is not a string (e.g. a NaN cell).
    """
    # To lowercase
    text = text.lower()

    # Collapse multiple white spaces, tabs, and new lines into one space
    text = _WHITESPACE_RE.sub(' ', text)

    # Replace URLs, emails, dates, and numbers with <URL>, <EMAIL>, <DATE>, <NUM>.
    # URLs go first so that a URL containing "@" or digits is replaced as a whole;
    # numbers go last so digits inside the other items are not replaced early.
    text = _URL_RE.sub("<URL>", text)
    text = _EMAIL_RE.sub("<EMAIL>", text)
    text = _DATE_RE.sub("<DATE>", text)
    text = _NUM_RE.sub("<NUM>", text)

    return text

In [2]:
import os

import pandas as pd

# load news_sample.csv file from git source
df = pd.read_csv('https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv')

# cleanup text on 'content' column in a single pass; unlike a positional loop
# over range(len(df)), this does not rely on the index labels being 0..n-1
df['content'] = df['content'].map(clean_text)

# make sure the output directory exists so the cell also runs on a fresh checkout
os.makedirs("data", exist_ok=True)

# save cleaned up data to csv file; index=False keeps the row index out of the
# file, so it is not read back as an extra unnamed column by later cells
df.to_csv("data/news_sample_cleaned.csv", index=False)

## NLTK

In [3]:
import nltk
# Resources used further down: the Punkt tokenizer models (word_tokenize) and
# the English stop word list (stopwords.words).
nltk.download('punkt')
# NOTE(review): recent NLTK releases (3.9+) load the tokenizer from
# 'punkt_tab' rather than 'punkt'; fetching both keeps word_tokenize working
# on old and new installations - confirm against the pinned NLTK version.
nltk.download('punkt_tab')
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/kristian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kristian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.tokenize import word_tokenize

# tokenize the text
def tokenize_text(text: str) -> list:
    """Return the tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    tokens = word_tokenize(text)
    return tokens


In [5]:
from nltk.corpus import stopwords


# remove stopwords
def remove_stopwords(filename: str) -> None:
    """
    Add a 'stopword_filtered_content' column to the CSV file ``filename``.

    The file is read into a dataframe, every text in the 'content' column is
    tokenized with ``word_tokenize``, English stop words are dropped, and the
    remaining tokens are joined with single spaces. The result is written
    back to the same file, overwriting it.

    Parameters:
        filename: path of a CSV file that has a 'content' column.

    Raises:
        KeyError: if the file has no 'content' column.
    """
    df = pd.read_csv(filename)
    stop_words = set(stopwords.words('english'))

    def drop_stopwords(text: str) -> str:
        """Return ``text`` with all stop word tokens removed."""
        return ' '.join(w for w in word_tokenize(text) if w not in stop_words)

    # Empty cells are loaded by read_csv as NaN, which the tokenizer cannot
    # handle; they are tokenized as '' here (the 'content' column itself is
    # left untouched).
    df['stopword_filtered_content'] = df['content'].fillna('').map(drop_stopwords)

    # write dataframe back to the csv file; index=False stops the row index
    # from being stored as an additional unnamed column on every round trip
    df.to_csv(filename, index=False)

# Add the 'stopword_filtered_content' column to the cleaned sample file
# (the file is read and then overwritten in place).
remove_stopwords("data/news_sample_cleaned.csv")

In [6]:
# compute stopword reduction rate, by adding vocabulary columns and reduction rate column
def stopword_reduction_rate(filename: str) -> None:
    """
    Add vocabulary statistics to the CSV file ``filename``.

    For every row three columns are computed and the file is overwritten:

    * 'vocabulary_size': number of distinct tokens in 'content'
    * 'filtered_vocabulary_size': number of distinct tokens in
      'stopword_filtered_content'
    * 'reduction_rate': percentage by which the vocabulary shrank, rounded to
      3 decimals (0.0 when 'content' has no tokens at all)

    Parameters:
        filename: path of a CSV file with 'content' and
            'stopword_filtered_content' columns.

    Raises:
        KeyError: if one of the two input columns is missing.
    """
    df = pd.read_csv(filename)

    def vocabulary_size(text: str) -> int:
        """Return the number of distinct tokens in ``text``."""
        return len(set(word_tokenize(text)))

    # A text that is empty (e.g. it consisted of stop words only) is loaded by
    # read_csv as NaN, which the tokenizer cannot handle; treat it as ''.
    content_sizes = [vocabulary_size(text) for text in df['content'].fillna('')]
    filtered_sizes = [
        vocabulary_size(text) for text in df['stopword_filtered_content'].fillna('')
    ]

    df['vocabulary_size'] = content_sizes
    df['filtered_vocabulary_size'] = filtered_sizes
    # Guard against an empty vocabulary, which would otherwise divide by zero.
    df['reduction_rate'] = [
        round((total - kept) / total * 100, 3) if total else 0.0
        for total, kept in zip(content_sizes, filtered_sizes)
    ]

    # index=False stops the row index from being stored as an additional
    # unnamed column every time the file is read and written again
    df.to_csv(filename, index=False)
    
# Add the vocabulary size and reduction rate columns to the cleaned sample
# file (the file is read and then overwritten in place).
stopword_reduction_rate('data/news_sample_cleaned.csv')



In [7]:
# remove word variations and compute size of vocabulary. Add column with 'stem vocabulary size' to the csv file