In [1]:
import numpy as np
import pandas as pd
from nltk import word_tokenize, sent_tokenize as to_sentences
import string
import re
from contractions import contractions_dict
from article_summariser import getSummary
from sklearn.model_selection import train_test_split

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")

In [2]:
def expand_contractions(text, contractions_dict):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())), flags = re.IGNORECASE | re.DOTALL)
    
    def expand_match(contraction):

        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) if contractions_dict.get(match) else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    
    return expanded_text

In [24]:
def process_article_body(body):
    
    if not isinstance(body, str):
        
        raise ValueError("Expected a string input. Received an object of class " + type(body).__name__ + " instead.")
        
    # get summary of the article in seven sentences
    body = getSummary(body)
    
    # article body to list of sentences
    sentences = to_sentences(body)
    try:
        assert len(sentences) >= 7
    except Exception as e:
        print(f"Expected length >= 7. Found {len(sentences)} instead.\n{body}")
        raise AssertionError
    
    # mapping from punctuations to empty string
    table = str.maketrans('', '', string.punctuation)
    
    for idx in range(len(sentences)):
        
        # get sentence
        sentence = sentences[idx]
        
        # replace characters with replacements
        replacements = {"\n": "", "\r": "", "-": " - "}
        for to_replace, replacement in replacements.items():
            sentence = sentence.replace(to_replace, replacement)
        
        # remove extra spaces
        sentence = re.sub(" +", " ", sentence)
        
        # remove anything in brackets
        sentence = re.sub(r"\([^()]*\)", "", sentence)
        
        # remove punctuation
        sentence = " ".join([word.translate(table) for word in word_tokenize(sentence)])
        
        # expand contractions
        sentence = expand_contractions(sentence, contractions_dict)
        
        # encode to utf-8
        sentence = sentence.encode(encoding = "utf-8", errors = "ignore").decode("utf-8")
        
        # set sentence
        sentences[idx] = sentence
        
    # join sentences comma separated
    body = ", ".join(sentences)
    
    # return cleaned body    
    return body

In [25]:
def transform(bodies, stances):
    
    flagged_ids = []
    
    for idx, row in bodies.iterrows():
        
        body_id, article_body = row["Body ID"], row["articleBody"]
        
        if len(to_sentences(article_body)) < 7:
            flagged_ids.append(body_id)
        else:
            try:
                assert len(to_sentences(article_body)) >= 7
            except:
                print(f"Expected length >= 7. Found {len(to_sentences(article_body))} instead.\n{article_body}")
                raise AssertionError
            bodies[bodies["Body ID"] == body_id] = (body_id, process_article_body(article_body))
            
    for body_id in flagged_ids:
        
        bodies = bodies[bodies["Body ID"] != body_id]
        stances = stances[stances["Body ID"] != body_id]
    
    return bodies, stances

In [26]:
train_bodies = pd.read_csv("./data/raw/train_bodies.csv", header = 0)
train_stances = pd.read_csv("./data/raw/train_stances.csv", header = 0)
test_bodies = pd.read_csv("./data/raw/test_bodies.csv", header = 0)
test_stances = pd.read_csv("./data/raw/test_stances.csv", header = 0)

print("Raw Data")

train_bodies.shape
train_stances.shape
test_bodies.shape
test_stances.shape

train_bodies_transformed, train_stances_transformed = transform(train_bodies, train_stances)
test_bodies_transformed, test_stances_transformed = transform(test_bodies, test_stances)

print("\nTransformed Data")

train_bodies_transformed.shape
train_stances_transformed.shape
test_bodies_transformed.shape
test_stances_transformed.shape

Raw Data


(1683, 2)

(49972, 3)

(904, 2)

(25413, 3)

Expected length >= 7. Found 5 instead.
But instead of confronting him immediately, she planned a little Christmas surprise: She printed out the evidence, wrapped it up, and gave it to him as a present. On the other hand, Cassy has 31,700 tweets that prove she's a real person, and the guy in the photo seems to have fessed up. Don't fucking cheat on Cassy, aka @NessLovnTrey247 (she really likes Trey Songz), because she finds out everything. All's well that ends well, I guess? He thought he was getting his gifts.


AssertionError: 

In [98]:
to_train_bodies, test_bodies_new = train_test_split(test_bodies_transformed, test_size = 0.5, shuffle = True)
train_bodies_new = pd.concat([train_bodies_transformed, to_train_bodies], ignore_index = True)
to_train_stances = test_stances_transformed[test_stances_transformed["Body ID"].isin(to_train_bodies["Body ID"].values)]
train_stances_new = pd.concat([train_stances_transformed, to_train_stances], ignore_index = True)
test_stances_new = test_stances_transformed[~test_stances_transformed["Body ID"].isin(to_train_bodies["Body ID"].values)]

train_bodies_new.shape
train_stances_new.shape
test_bodies_new.shape
test_stances_new.shape

(1846, 2)

(54110, 3)

(378, 2)

(10212, 3)

In [99]:
train_bodies_new.to_csv("./data/processed/train_bodies.csv", index = False)
train_stances_new.to_csv("./data/processed/train_stances.csv", index = False)
test_bodies_new.to_csv("./data/processed/test_bodies.csv", index = False)
test_stances_new.to_csv("./data/processed/test_stances.csv", index = False)