# Part 1: Data Processing

## Task 1:

### Clean the text

In [1]:
import re

def convert_to_lower(match_obj):
    """
    Callback for ``re.sub``: return the text of the whole match in lowercase.

    Returns None if the match object reports no text for the whole match.
    Code reference: https://pynative.com/python-regex-replace-re-sub/
    """
    matched = match_obj.group()
    return None if matched is None else matched.lower()


# Patterns are compiled once at import time instead of on every call.
# They are applied to text that has already been lowercased.
_WHITESPACE_RE = re.compile(r'\s+')
# Scheme-prefixed or "www." addresses; the final character class keeps
# sentence punctuation that directly follows a URL out of the match.
_URL_RE = re.compile(r'\b(?:https?://|www\.)[^\s<>]*[^\s<>.,;:!?)]')
# Non-empty local part, and a domain with at least one dot-separated label.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+')
# ISO-style date (YYYY-MM-DD), optionally followed by a time of day.
_DATE_RE = re.compile(
    r'(?<![0-9])[0-9]{4}-[0-9]{2}-[0-9]{2}'
    r'(?:[ t][0-9]{1,2}:[0-9]{2}(?::[0-9]{2}(?:\.[0-9]+)?)?)?'
)
# Integers and decimals, with optional thousands separators (1,000.50).
_NUM_RE = re.compile(r'[0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?')


def clean_text(text: str) -> str:
    """
    Takes a raw input data string and returns a normalised version of it.

    All letters are lowercased (including non-ASCII ones) and every run of
    whitespace (spaces, tabs, new lines) is collapsed into a single space.
    URLs, emails, dates, and numbers are replaced by <URL>, <EMAIL>, <DATE>,
    and <NUM>.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Punctuation that merely follows a replaced item
        (for example the period ending a sentence) is kept.
    """
    # To lowercase (str.lower also handles non-ASCII letters)
    text = text.lower()

    # Collapse spaces, tabs, and new lines into single spaces
    text = _WHITESPACE_RE.sub(' ', text)

    # Order matters: URLs go first because they may contain '@' and digits,
    # and numbers go last so they cannot break up URLs, emails, or dates.
    text = _URL_RE.sub("<URL>", text)
    text = _EMAIL_RE.sub("<EMAIL>", text)
    text = _DATE_RE.sub("<DATE>", text)
    text = _NUM_RE.sub("<NUM>", text)

    return text

In [12]:
import pandas as pd

# Load the news_sample.csv file from the upstream git repository.
df = pd.read_csv('https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv')

# Clean the text in the 'content' column and store it in a new
# 'content_clean' column (one vectorised pass instead of per-cell writes).
df['content_clean'] = df['content'].apply(clean_text)

# Save the cleaned data to a CSV file. to_csv does not create missing
# directories, so make sure the target folder exists first.
import os
os.makedirs("data", exist_ok=True)
df.to_csv("data/news_sample_cleaned.csv")

# Show every row and allow up to 150 characters per cell when printing.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 150)

print(df["content_clean"])

0      sometimes the power of christmas will make you do wild and wonderful things. you do not need to believe in the holy trinity to believe in the posi...
1      awakening of <NUM> strands of dna – “reconnecting with you” movie % of readers think this story is fact. add your two cents. headline: bitcoin & b...
2      never hike alone: a friday the <NUM>th fan film usa | <NUM> | <NUM> min a fan tribute to friday the <NUM>th, never hike alone follows an adventure...
3      when a rare shark was caught, scientists were left blundering for answers. this shark has a unique feature. and it’s not that it can survive at ex...
4      donald trump has the unnerving ability to ability to create his own reality and convince millions of americans that what he says it is true. the p...
5                 “republicans and democrats alike are willing to turn over government coffers to bezos and his ilk and the rights of the people be damned.”
6      could you imagine waking up in the morgue? i for on

## NLTK

In [3]:
import nltk
# Fetch the NLTK resources this notebook needs; packages that are already
# present are reported as up-to-date.
# 'punkt': tokenizer data, presumably what word_tokenize relies on.
nltk.download('punkt')
# 'stopwords': the corpus read through nltk.corpus.stopwords.
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /Users/kristian/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kristian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.tokenize import word_tokenize

# Thin wrapper around NLTK's tokenizer.
# NOTE: not in use for now.
def tokenize_text(text: str) -> list:
    """Split ``text`` into a list of tokens using ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens


### Tokenize the text and remove stopwords

In [5]:
from nltk.corpus import stopwords


def remove_stopwords(filename: str) -> None:
    """
    Add a 'content_stopword' column: 'content_clean' without English stopwords.

    The CSV at ``filename`` is read, every 'content_clean' entry is tokenized
    with ``word_tokenize``, tokens found in NLTK's English stopword list are
    dropped, and the remaining tokens are joined with single spaces. The file
    is then overwritten in place.

    Args:
        filename: Path of a CSV file containing a 'content_clean' column.
    """
    df = pd.read_csv(filename)
    stop_words = set(stopwords.words('english'))

    def strip_stopwords(text) -> str:
        # read_csv loads empty cells as NaN (a float), which word_tokenize
        # cannot handle; treat any non-string cell as empty text.
        if not isinstance(text, str):
            return ''
        return ' '.join(w for w in word_tokenize(text) if w not in stop_words)

    # Add the stopword-filtered text as a new column.
    df['content_stopword'] = df['content_clean'].apply(strip_stopwords)

    # index=False: otherwise every read/write round trip appends another
    # unnamed index column to the file.
    df.to_csv(filename, index=False)

# Run stopword removal on the cleaned sample file (the file is rewritten in place).
remove_stopwords("data/news_sample_cleaned.csv")

### Compute the size of the vocabulary and the reduction rate of the vocabulary size after removing stopwords


In [6]:
def stopword_reduction_rate(filename: str) -> None:
    """
    Add vocabulary-size and stopword reduction-rate columns to the CSV file.

    For every row, the vocabulary size is the number of distinct tokens that
    ``word_tokenize`` produces. Three columns are written:
    'content_clean_vocabulary_size', 'content_stopword_vocabulary_size', and
    'content_stopword_reduction_rate' (percentage decrease from the former to
    the latter, rounded to 3 decimals). The file is overwritten in place.

    Args:
        filename: Path of a CSV file containing 'content_clean' and
            'content_stopword' columns.
    """
    df = pd.read_csv(filename)

    def vocabulary_size(text) -> int:
        # Empty cells come back from read_csv as NaN; count them as no words.
        if not isinstance(text, str):
            return 0
        return len(set(word_tokenize(text)))

    clean_sizes = [vocabulary_size(text) for text in df['content_clean']]
    stopword_sizes = [vocabulary_size(text) for text in df['content_stopword']]

    df['content_clean_vocabulary_size'] = clean_sizes
    df['content_stopword_vocabulary_size'] = stopword_sizes
    # A row with an empty vocabulary has nothing to reduce; report 0.0
    # rather than dividing by zero.
    df['content_stopword_reduction_rate'] = [
        round((before - after) / before * 100, 3) if before else 0.0
        for before, after in zip(clean_sizes, stopword_sizes)
    ]

    # index=False: otherwise every read/write round trip appends another
    # unnamed index column to the file.
    df.to_csv(filename, index=False)
    
# Add the vocabulary-size and stopword reduction-rate columns to the cleaned sample file.
stopword_reduction_rate('data/news_sample_cleaned.csv')



### Remove word variations with stemming and compute the size of the vocabulary

In [7]:
from nltk.stem.snowball import SnowballStemmer

def remove_word_variations(filename: str) -> None:
    """
    Add a 'content_stem' column holding the stemmed 'content_stopword' text.

    The CSV at ``filename`` is read, every 'content_stopword' entry is
    tokenized with ``word_tokenize``, each token is stemmed with the English
    Snowball stemmer, and the stems are joined with single spaces. The file
    is then overwritten in place.

    Args:
        filename: Path of a CSV file containing a 'content_stopword' column.
    """
    stemmer = SnowballStemmer("english")
    df = pd.read_csv(filename)

    def stem_text(text) -> str:
        # read_csv loads empty cells as NaN (a float), which word_tokenize
        # cannot handle; treat any non-string cell as empty text.
        if not isinstance(text, str):
            return ''
        return ' '.join(stemmer.stem(w) for w in word_tokenize(text))

    df['content_stem'] = df['content_stopword'].apply(stem_text)

    # index=False: otherwise every read/write round trip appends another
    # unnamed index column to the file.
    df.to_csv(filename, index=False)

# Add the stemmed-text column to the cleaned sample file (rewritten in place).
remove_word_variations('data/news_sample_cleaned.csv')

### Compute the reduction rate of the vocabulary size after stemming

In [8]:
def stemming_reduction_rate(filename: str) -> None:
    """
    Add stem vocabulary-size and stemming reduction-rate columns to the CSV file.

    For every row, the vocabulary size is the number of distinct tokens that
    ``word_tokenize`` produces. Two columns are written:
    'content_stem_vocabulary_size' and 'content_stem_reduction_rate'
    (percentage decrease from the 'content_clean' vocabulary to the
    'content_stem' vocabulary, rounded to 3 decimals). The file is
    overwritten in place.

    Args:
        filename: Path of a CSV file containing 'content_clean' and
            'content_stem' columns.
    """
    df = pd.read_csv(filename)

    def vocabulary_size(text) -> int:
        # Empty cells come back from read_csv as NaN; count them as no words.
        if not isinstance(text, str):
            return 0
        return len(set(word_tokenize(text)))

    stem_sizes = [vocabulary_size(text) for text in df['content_stem']]
    clean_sizes = [vocabulary_size(text) for text in df['content_clean']]

    df['content_stem_vocabulary_size'] = stem_sizes
    # A row with an empty vocabulary has nothing to reduce; report 0.0
    # rather than dividing by zero.
    df['content_stem_reduction_rate'] = [
        round((before - after) / before * 100, 3) if before else 0.0
        for before, after in zip(clean_sizes, stem_sizes)
    ]

    # index=False: otherwise every read/write round trip appends another
    # unnamed index column to the file.
    df.to_csv(filename, index=False)

# Add the stem vocabulary-size and reduction-rate columns to the cleaned sample file.
stemming_reduction_rate('data/news_sample_cleaned.csv')