In [8]:
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize as to_sentences
import string
import re
from contractions import contractions_dict
from article_summariser import getSummary
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [9]:
def expand_contractions(text, contractions_dict):
    """Expand the contractions found in ``text`` and drop any leftover apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_dict : dict
        Maps a contraction (e.g. ``"can't"``) to its expansion (e.g. ``"cannot"``).
        Keys are treated as literal text and matched case-insensitively.

    Returns
    -------
    str
        ``text`` with every known contraction replaced by its expansion and
        every remaining ``'`` character removed.
    """
    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" must not be tried before "can't've".
    keys_longest_first = sorted((key for key in contractions_dict if key), key=len, reverse=True)

    # Case-insensitive lookup table. A key that is already lowercase takes
    # precedence over a differently-cased variant of the same contraction.
    lowercase_lookup = {key.lower(): value for key, value in contractions_dict.items()}
    lowercase_lookup.update(
        (key, value) for key, value in contractions_dict.items() if key == key.lower()
    )

    def expand_match(contraction):
        match = contraction.group(0)
        expansion = contractions_dict.get(match)
        if not expansion:
            # Fall back to a case-insensitive hit; if there is none, keep the
            # matched text instead of silently deleting it.
            expansion = lowercase_lookup.get(match.lower(), match)
        return expansion

    expanded_text = text
    if keys_longest_first:
        # Keys are escaped so they are matched literally, not as regex syntax.
        contractions_pattern = re.compile(
            "({})".format("|".join(re.escape(key) for key in keys_longest_first)),
            flags=re.IGNORECASE | re.DOTALL,
        )
        expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    # Apostrophes that were not part of a known contraction (e.g. possessives) are dropped.
    return expanded_text.replace("'", "")

In [27]:
def process_article_body(body, expected_sentences=7):
    """Summarise an article body and normalise the summary into one cleaned string.

    The body is flattened to a single line, curly quotes are mapped to ASCII
    quotes and parenthesised asides are removed before it is handed to
    ``getSummary``. Each sentence of the summary then has its contractions
    expanded and its punctuation stripped (hyphens become spaces).

    Parameters
    ----------
    body : str
        Raw article text.
    expected_sentences : int, optional
        Number of sentences the summary must contain (default 7).
        NOTE(review): ``getSummary`` is assumed to aim for seven sentences;
        confirm against ``article_summariser`` before changing this.

    Returns
    -------
    str or None
        The cleaned summary sentences joined by ``", "``, or ``None`` (after
        printing a diagnostic) when the summary does not contain exactly
        ``expected_sentences`` sentences.

    Raises
    ------
    ValueError
        If ``body`` is not a string.
    """
    if not isinstance(body, str):
        raise ValueError("Expected a string input. Received an object of class " + type(body).__name__ + " instead.")

    # preprocessing: flatten line breaks and map curly quotes to their ASCII forms
    replacements = {"\n": " ", "\r": " ", "“": '"', "”": '"', "‘": "'", "’": "'"}
    for to_replace, replacement in replacements.items():
        body = body.replace(to_replace, replacement)
    body = re.sub(" +", " ", body)
    # drop parenthesised asides (innermost level only)
    body = re.sub(r"\([^()]*\)", "", body)

    # summarise the article, then split the summary into sentences
    body_summary = getSummary(body)
    sentences = to_sentences(body_summary)

    # Explicit comparison instead of assert/except: asserts are stripped under
    # ``python -O``, which would let short summaries through unnoticed.
    if len(sentences) != expected_sentences:
        print(f"Expected length of summary was {expected_sentences}. Found length {len(sentences)} instead. Length of article is {len(to_sentences(body))}.")
        return None

    # mapping from punctuation to the empty string (a hyphen becomes a space)
    table = str.maketrans({to_replace: (" " if to_replace == "-" else "") for to_replace in string.punctuation})

    cleaned_sentences = []
    for sentence in sentences:

        # expand contractions
        sentence = expand_contractions(sentence, contractions_dict)

        # remove punctuation token by token
        sentence = " ".join([word.translate(table) for word in word_tokenize(sentence)])

        # remove leading/trailing spaces, then collapse runs of spaces left
        # behind by tokens that consisted only of punctuation
        sentence = re.sub(" +", " ", sentence.strip())

        # drop anything that cannot be encoded as utf-8
        sentence = sentence.encode(encoding = "utf-8", errors = "ignore").decode("utf-8")

        cleaned_sentences.append(sentence)

    # join sentences comma separated
    return ", ".join(cleaned_sentences)

In [28]:
def transform(bodies, stances):
    """Clean every article body and drop articles (and their stances) that cannot be processed.

    Parameters
    ----------
    bodies : pandas.DataFrame
        Must contain the columns ``"Body ID"`` and ``"articleBody"``.
    stances : pandas.DataFrame
        Must contain the column ``"Body ID"``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``bodies`` with ``"articleBody"`` replaced by the output of
        ``process_article_body`` and without the rows whose Body ID was
        flagged, and ``stances`` without the rows that refer to a flagged
        Body ID. The input frames are not modified.

    Raises
    ------
    AssertionError
        If a remaining stance refers to a Body ID that is absent from the
        remaining bodies.
    """
    flagged_ids = []
    processed_bodies = []

    # Walk the two columns directly: nothing is written to the frame while it
    # is being iterated, and each row costs O(1) rather than a full-frame mask.
    for body_id, article_body in zip(bodies["Body ID"], bodies["articleBody"]):
        processed_article_body = process_article_body(article_body)
        if not processed_article_body:
            # None (summary had the wrong length) or an empty result
            flagged_ids.append(body_id)
        processed_bodies.append(processed_article_body)

    # Replace the text column by name, so no column order or count is assumed.
    bodies = bodies.assign(articleBody = processed_bodies)

    bodies = bodies[~bodies["Body ID"].isin(flagged_ids)]
    stances = stances[~stances["Body ID"].isin(flagged_ids)]

    # every remaining stance must still point at a remaining body
    orphaned_ids = set(stances["Body ID"]).difference(set(bodies["Body ID"]))
    assert not orphaned_ids, f"Stances refer to Body IDs with no article body: {sorted(orphaned_ids)}"

    return bodies, stances

In [29]:
# Read the four raw CSV tables; the first row of each file is the header.
raw_csv_paths = (
    "./data/raw/train_bodies.csv",
    "./data/raw/train_stances.csv",
    "./data/raw/test_bodies.csv",
    "./data/raw/test_stances.csv",
)
train_bodies, train_stances, test_bodies, test_stances = [
    pd.read_csv(raw_csv_path, header = 0) for raw_csv_path in raw_csv_paths
]

print("Raw Data")

# Bare expressions: each one is displayed because ast_node_interactivity is "all".
train_bodies.shape
train_stances.shape
test_bodies.shape
test_stances.shape

# Work on copies so the raw frames stay available for inspection.
train_bodies_transformed, train_stances_transformed = transform(
    train_bodies.copy(),
    train_stances.copy(),
)
test_bodies_transformed, test_stances_transformed = transform(
    test_bodies.copy(),
    test_stances.copy(),
)

print("\nTransformed Data")

train_bodies_transformed.shape
train_stances_transformed.shape
test_bodies_transformed.shape
test_stances_transformed.shape

Raw Data


(1683, 2)

(49972, 3)

(904, 2)

(25413, 3)

Expected length of summary was 7. Found length 4 instead. Length of article is 5.
Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 5 instead. Length of article is 6.
Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 6 instead. Length of article is 27.
Expected length of summary was 7. Found length 6 instead. Length of article is 24.
Expected length of summary was 7. Found length 5 instead. Length of article is 5.
Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 5 instead. Length of article is 6.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected lengt

Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 5 instead. Length of article is 5.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 5 instead. Length of article is 5.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 5 instead. Length of article is 6.
Expected length 

Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 6 instead. Length of article is 9.
Expected length of summary was 7. Found length 4 instead. Length of article is 5.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 6 instead. Length of article is 9.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 6 instead. Length of article is 11.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length

Expected length of summary was 7. Found length 6 instead. Length of article is 17.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 6 instead. Length of article is 13.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 6 instead. Length of article is 11.
Expected length of summary was 7. Found length 0 instead. Length of article is 1.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 4 instead. Length of article is 4.
Expected leng

Expected length of summary was 7. Found length 3 instead. Length of article is 3.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 4 instead. Length of article is 4.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 1 instead. Length of article is 1.
Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 1 instead. Length of article is 2.
Expected length of summary was 7. Found length 6 instead. Length of article is 7.
Expected length of summary was 7. Found length 4 instead. Length of article is 4.
Expected length of summary was 7. Found length 5 instead. Length of article is 5.
Expected length of summary was 7. Found length 6 instead. Length of article is 6.
Expected length 

(1404, 2)

(41914, 3)

(738, 2)

(20214, 3)

In [30]:
# Fixed seed so the 50/50 split below (and therefore the files written from it)
# is identical on every Restart & Run All.
SPLIT_SEED = 42

# Move half of the transformed test bodies into the training set.
to_train_bodies, test_bodies_new = train_test_split(
    test_bodies_transformed, test_size = 0.5, shuffle = True, random_state = SPLIT_SEED
)
train_bodies_new = pd.concat([train_bodies_transformed, to_train_bodies], ignore_index = True)

# Stances follow their article body: compute the membership mask once and use
# it for both sides of the split.
moved_to_train = test_stances_transformed["Body ID"].isin(to_train_bodies["Body ID"].values)
to_train_stances = test_stances_transformed[moved_to_train]
train_stances_new = pd.concat([train_stances_transformed, to_train_stances], ignore_index = True)
test_stances_new = test_stances_transformed[~moved_to_train]

train_bodies_new.shape
train_stances_new.shape
test_bodies_new.shape
test_stances_new.shape

(1773, 2)

(51716, 3)

(369, 2)

(10412, 3)

In [32]:
# Persist the re-split tables; the DataFrame index is not written.
processed_outputs = {
    "./data/processed/train_bodies.csv": train_bodies_new,
    "./data/processed/train_stances.csv": train_stances_new,
    "./data/processed/test_bodies.csv": test_bodies_new,
    "./data/processed/test_stances.csv": test_stances_new,
}
for processed_path, processed_frame in processed_outputs.items():
    processed_frame.to_csv(processed_path, index = False)