In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize as to_sentences
import string
import re
from contractions import contractions_dict
from article_summariser import getSummary
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [2]:
def expand_contractions(text, contractions_dict):
    """Replace every known contraction in ``text`` with its expanded form.

    Matching is case-insensitive. After expansion, every remaining
    apostrophe (') is removed from the text.

    Args:
        text: The string to process.
        contractions_dict: Mapping from a contraction (e.g. "can't") to its
            expansion (e.g. "cannot").

    Returns:
        The text with contractions expanded and apostrophes stripped.
    """
    if contractions_dict:
        # Lower-cased view of the mapping so that a match found through
        # re.IGNORECASE (e.g. "I'M") can still be resolved when the key is
        # stored with different casing (e.g. "I'm").
        lowered = {key.lower(): value for key, value in contractions_dict.items()}

        # Longest keys first: regex alternation picks the first alternative
        # that matches, so "can't've" must be tried before its prefix "can't".
        # Keys are escaped because they are literal text, not patterns.
        alternatives = sorted(contractions_dict.keys(), key = len, reverse = True)
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in alternatives)),
            flags = re.IGNORECASE | re.DOTALL,
        )

        def expand_match(contraction):
            match = contraction.group(0)
            expanded_contraction = contractions_dict.get(match)
            if expanded_contraction is None:
                expanded_contraction = lowered.get(match.lower())
            # Fall back to the matched text so that a failed lookup never
            # deletes words from the input.
            return match if expanded_contraction is None else expanded_contraction

        text = contractions_pattern.sub(expand_match, text)

    # strip any apostrophes that are still present
    return re.sub("'", "", text)

In [3]:
def process_article_body(body):
    """Summarise an article body and return the cleaned summary as one string.

    The body is summarised with ``getSummary`` (expected to yield seven
    sentences), split into sentences, and each sentence is normalised:
    line breaks and hyphens become spaces, parenthesised text is dropped,
    contractions are expanded, and punctuation is removed.

    Args:
        body: The raw article text.

    Returns:
        The cleaned sentences joined with ", ".

    Raises:
        ValueError: If ``body`` is not a string.
        AssertionError: If the summary does not split into exactly seven
            sentences.
    """
    if not isinstance(body, str):
        raise ValueError("Expected a string input. Received an object of class " + type(body).__name__ + " instead.")

    # get summary of the article (expected to be seven sentences)
    body_summary = getSummary(body)

    # summary to list of sentences
    sentences = to_sentences(body_summary)
    if len(sentences) != 7:
        # Explicit check instead of try/assert/bare-except: it is not skipped
        # under `python -O` and the diagnostic travels with the exception.
        raise AssertionError(f"Expected length of summary was 7. Found length {len(sentences)} instead. Length of article is {len(to_sentences(body))}.\n\nArticle Body -\n\n{body}\n\nSummary -\n\n{body_summary}")

    # mapping from punctuations to empty string
    table = str.maketrans("", "", string.punctuation)

    # Line breaks become spaces (not "") so neighbouring words are not glued
    # together; hyphens are padded so they tokenise separately and are then
    # removed with the rest of the punctuation.
    replacements = {"\n": " ", "\r": " ", "-": " - "}

    cleaned_sentences = []
    for sentence in sentences:

        # replace characters with replacements
        for to_replace, replacement in replacements.items():
            sentence = sentence.replace(to_replace, replacement)

        # remove anything in brackets
        sentence = re.sub(r"\([^()]*\)", "", sentence)

        # Expand contractions BEFORE tokenising / stripping punctuation: once
        # the apostrophes are gone there is nothing left for the expansion
        # step to match.
        sentence = expand_contractions(sentence, contractions_dict)

        # remove punctuation; tokens that were pure punctuation become empty
        # and are dropped so the result has single spaces only
        words = (word.translate(table) for word in word_tokenize(sentence))
        sentence = " ".join(word for word in words if word)

        # drop anything that cannot be encoded as utf-8
        sentence = sentence.encode(encoding = "utf-8", errors = "ignore").decode("utf-8")

        cleaned_sentences.append(sentence)

    # join sentences comma separated
    return ", ".join(cleaned_sentences)

In [4]:
def transform(bodies, stances):
    """Clean all article bodies and drop articles that are too short.

    Articles with fewer than seven sentences are removed from ``bodies``
    together with every row of ``stances`` that refers to them. All other
    articles have their "articleBody" replaced by the output of
    ``process_article_body``.

    The input frames are not modified; new frames are returned.

    Args:
        bodies: DataFrame with the columns "Body ID" and "articleBody".
        stances: DataFrame with a "Body ID" column.

    Returns:
        A ``(bodies, stances)`` tuple of the transformed frames.
    """
    # process_article_body requires a seven sentence summary, so shorter
    # articles cannot be processed and are flagged for removal instead
    flagged_ids = []
    cleaned_bodies = []

    for body_id, article_body in zip(bodies["Body ID"], bodies["articleBody"]):

        if len(to_sentences(article_body)) < 7:
            flagged_ids.append(body_id)
            # placeholder keeps the list aligned with the rows; the row is
            # filtered out below
            cleaned_bodies.append(article_body)
        else:
            cleaned_bodies.append(process_article_body(article_body))

    # assign returns a new frame, leaving the caller's data untouched
    bodies = bodies.assign(articleBody = cleaned_bodies)

    # drop every flagged article in a single pass
    bodies = bodies[~bodies["Body ID"].isin(flagged_ids)]
    stances = stances[~stances["Body ID"].isin(flagged_ids)]

    return bodies, stances

In [5]:
# Load the raw article bodies and stances for the train and test splits.
train_bodies = pd.read_csv("./data/raw/train_bodies.csv", header = 0)
train_stances = pd.read_csv("./data/raw/train_stances.csv", header = 0)
test_bodies = pd.read_csv("./data/raw/test_bodies.csv", header = 0)
test_stances = pd.read_csv("./data/raw/test_stances.csv", header = 0)

print("Raw Data")

# Bare expressions: each shape is displayed because ast_node_interactivity
# is set to "all" in the first cell.
train_bodies.shape
train_stances.shape
test_bodies.shape
test_stances.shape

# Clean the bodies and drop articles with fewer than seven sentences
# (together with the stances that reference them).
train_bodies_transformed, train_stances_transformed = transform(train_bodies, train_stances)
test_bodies_transformed, test_stances_transformed = transform(test_bodies, test_stances)

print("\nTransformed Data")

# Shapes after the transformation, for comparison with the raw shapes above.
train_bodies_transformed.shape
train_stances_transformed.shape
test_bodies_transformed.shape
test_stances_transformed.shape

Raw Data


(1683, 2)

(49972, 3)

(904, 2)

(25413, 3)

Expected length of summary was 7. Found length 6 instead. Length of article is 15.

Article Body -

An article saying NASA confirmed six days of “total darkness” in December is fake and it’s merely an iteration of an old Internet hoax.

Essentially every year, there’s bogus rumors saying that there will be three days of darkness in December, but they’ve obviously proven not to be true.

They seem to reference the Three Days of Darkness mentioned by Catholic prophets.

However, a “satirical” and entertainment website, Huzlers.com, posted an article about six days of darkness. It uses fake quotes from a NASA official.

“WORLDWIDE – NASA has confirmed that the Earth will experience 6 days of almost complete darkness and will happen from the dates Tuesday the 16 – Monday the 22 in December. The world will remain, during these three days, without sunlight due to a solar storm, which will cause dust and space debris to become plentiful and thus, block 90% sunlight,” it reads in part.

AssertionError: 

In [None]:
# Move half of the test articles into the training set. The random state is
# fixed so that re-running the notebook reproduces the same split.
to_train_bodies, test_bodies_new = train_test_split(test_bodies_transformed, test_size = 0.5, shuffle = True, random_state = 42)
train_bodies_new = pd.concat([train_bodies_transformed, to_train_bodies], ignore_index = True)

# Stances follow their article: rows whose "Body ID" moved to the training
# set are appended to the training stances, the rest stay in the test set.
moved_to_train = test_stances_transformed["Body ID"].isin(to_train_bodies["Body ID"].values)
to_train_stances = test_stances_transformed[moved_to_train]
train_stances_new = pd.concat([train_stances_transformed, to_train_stances], ignore_index = True)
test_stances_new = test_stances_transformed[~moved_to_train]

train_bodies_new.shape
train_stances_new.shape
test_bodies_new.shape
test_stances_new.shape

In [None]:
# Persist the re-split frames; each frame is written without its index.
processed_outputs = [
    (train_bodies_new, "./data/processed/train_bodies.csv"),
    (train_stances_new, "./data/processed/train_stances.csv"),
    (test_bodies_new, "./data/processed/test_bodies.csv"),
    (test_stances_new, "./data/processed/test_stances.csv"),
]
for processed_frame, output_path in processed_outputs:
    processed_frame.to_csv(output_path, index = False)