In [19]:
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize as to_sentences
import string
import re
from contractions import contractions_dict
from article_summariser import getSummary
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm
from time import sleep

In [2]:
def expand_contractions(text, contractions_dict):
    """Replace contractions in ``text`` with their expansions, then drop apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_dict : dict
        Maps a contraction (e.g. ``"can't"``) to its expansion (``"cannot"``).
        Matching is case-insensitive and the expansion is inserted exactly as
        stored in the mapping (the case of the matched text is not carried over).

    Returns
    -------
    str
        ``text`` with every known contraction expanded and every remaining
        ``'`` character removed.
    """
    # Fallback table for case-insensitive lookups; setdefault keeps the first
    # entry when two keys differ only by case.
    lowercase_lookup = {}
    for contraction, expansion in contractions_dict.items():
        lowercase_lookup.setdefault(contraction.lower(), expansion)

    def expand_match(contraction):
        match = contraction.group(0)
        # Prefer an exact-case entry, then fall back to the lowercase table.
        expansion = contractions_dict.get(match) or lowercase_lookup.get(match.lower())
        # Never delete text: if no expansion is found, keep the match as-is.
        return expansion if expansion is not None else match

    # Longest keys first so "can't've" is not shadowed by "can't"; empty keys
    # are skipped because they would match at every position.
    keys = sorted((key for key in contractions_dict if key), key = len, reverse = True)

    expanded_text = text
    if keys:
        # Keys are escaped so characters such as "." are matched literally.
        contractions_pattern = re.compile(
            "({})".format("|".join(re.escape(key) for key in keys)),
            flags = re.IGNORECASE,
        )
        expanded_text = contractions_pattern.sub(expand_match, expanded_text)

    # Remove whatever apostrophes are left (possessives, stray quotes).
    expanded_text = re.sub("'", "", expanded_text)

    return expanded_text

In [6]:
def process_article_body(body):
    """Clean one article body into punctuation-free sentences joined by ". ".

    The text is normalised (line breaks become spaces, curly quotes become
    straight quotes, runs of spaces are collapsed, parenthesised spans without
    nested parentheses are removed), split into sentences, and each sentence
    has its contractions expanded and its punctuation stripped (hyphens are
    turned into spaces). Sentences that end up empty are dropped so the result
    contains no stray ". " separators.

    Parameters
    ----------
    body : str
        Raw article text.

    Returns
    -------
    str
        The cleaned sentences joined with ". ".

    Raises
    ------
    ValueError
        If ``body`` is not a string.
    """
    if not isinstance(body, str):
        raise ValueError("Expected a string input. Received an object of class " + type(body).__name__ + " instead.")

    # normalise line breaks and typographic quotes before tokenising
    replacements = {"\n": " ", "\r": " ", "“": '"', "”": '"', "‘": "'", "’": "'"}
    for to_replace, replacement in replacements.items():
        body = body.replace(to_replace, replacement)
    body = re.sub(" +", " ", body)
    # single pass: only parentheses with no nested parentheses are removed
    body = re.sub(r"\([^()]*\)", "", body)

    # every punctuation character is deleted, except "-" which becomes a space
    table = str.maketrans({to_replace: (" " if to_replace == "-" else "") for to_replace in string.punctuation})

    cleaned_sentences = []
    for sentence in to_sentences(body):

        # expand contractions
        sentence = expand_contractions(sentence, contractions_dict)

        # remove punctuation token by token
        sentence = " ".join(word.translate(table) for word in word_tokenize(sentence))

        # remove leading/trailing spaces, then collapse inner runs of spaces
        sentence = re.sub(" +", " ", sentence.strip())

        # round-trip through utf-8, discarding anything that cannot be encoded
        sentence = sentence.encode(encoding = "utf-8", errors = "ignore").decode("utf-8")

        # a sentence made only of punctuation is empty by now; skip it
        if sentence:
            cleaned_sentences.append(sentence)

    # join sentences with a period and a space
    return ". ".join(cleaned_sentences)

In [11]:
def transform(bodies, stances):
    """Clean every article body and check that each stance has a body.

    Parameters
    ----------
    bodies : pandas.DataFrame
        Must contain the columns "Body ID" and "articleBody". The
        "articleBody" column is overwritten in place with the output of
        ``process_article_body``; pass a copy to keep the raw text.
    stances : pandas.DataFrame
        Must contain a "Body ID" column. Returned unchanged.

    Returns
    -------
    tuple
        ``(bodies, stances)`` — the same objects that were passed in.

    Raises
    ------
    AssertionError
        If ``stances`` references a "Body ID" that is absent from ``bodies``.
    """
    # Build the cleaned column in one pass and assign it once, instead of
    # rewriting the frame through a boolean mask for every row.
    bodies["articleBody"] = [
        process_article_body(article_body)
        for article_body in tqdm(bodies["articleBody"], total = len(bodies))
    ]

    missing_body_ids = set(stances["Body ID"]).difference(set(bodies["Body ID"]))
    assert not missing_body_ids, "{} stance Body ID(s) have no matching article body".format(len(missing_body_ids))

    return bodies, stances

In [20]:
# Load the raw bodies and stances for both splits.
train_bodies = pd.read_csv("./data/raw/train_bodies.csv", header = 0)
train_stances = pd.read_csv("./data/raw/train_stances.csv", header = 0)
test_bodies = pd.read_csv("./data/raw/test_bodies.csv", header = 0)
test_stances = pd.read_csv("./data/raw/test_stances.csv", header = 0)

print("Raw Data")

train_bodies.shape
train_stances.shape
test_bodies.shape
test_stances.shape

# transform() overwrites the bodies frame it receives, so pass copies and keep the raw frames intact.
train_bodies_transformed, train_stances_transformed = transform(train_bodies.copy(), train_stances.copy())
test_bodies_transformed, test_stances_transformed = transform(test_bodies.copy(), test_stances.copy())
# NOTE(review): presumably gives the tqdm progress output time to flush before the report below — confirm.
sleep(1)

# Report the post-transform shapes (the recorded cell output shows this section).
print("Transformed Data")

train_bodies_transformed.shape
train_stances_transformed.shape
test_bodies_transformed.shape
test_stances_transformed.shape

Raw Data


(1683, 2)

(49972, 3)

(904, 2)

(25413, 3)

1683it [00:31, 53.00it/s]
904it [00:16, 55.01it/s]



Transformed Data


(1683, 2)

(49972, 3)

(904, 2)

(25413, 3)

In [21]:
# Fixed seed so the train/test re-split (and the CSVs written from it) is the same on every run.
SPLIT_SEED = 42

# Move a random half of the test bodies into the training set.
to_train_bodies, test_bodies_new = train_test_split(test_bodies_transformed, test_size = 0.5, shuffle = True, random_state = SPLIT_SEED)
train_bodies_new = pd.concat([train_bodies_transformed, to_train_bodies], ignore_index = True)

# Stances follow their body: rows whose body moved go to train, the rest stay in test.
moved_to_train = test_stances_transformed["Body ID"].isin(to_train_bodies["Body ID"].values)
to_train_stances = test_stances_transformed[moved_to_train]
train_stances_new = pd.concat([train_stances_transformed, to_train_stances], ignore_index = True)
test_stances_new = test_stances_transformed[~moved_to_train]

train_bodies_new.shape
train_stances_new.shape
test_bodies_new.shape
test_stances_new.shape

(2135, 2)

(62845, 3)

(452, 2)

(12540, 3)

In [22]:
# Write each processed frame to its CSV, without the index column.
processed_outputs = {
    "./data/processed/train_bodies.csv": train_bodies_new,
    "./data/processed/train_stances.csv": train_stances_new,
    "./data/processed/test_bodies.csv": test_bodies_new,
    "./data/processed/test_stances.csv": test_stances_new,
}
for output_path, frame in processed_outputs.items():
    frame.to_csv(output_path, index = False)