# Part 1. Text summarizing

An Introduction to Text Summarization using the TextRank Algorithm https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/ 

### Import Libraries

In [93]:
import pandas as pd
import numpy as np
import os
import urllib.request

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

#nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx

In [95]:
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## Read Data

### Training dataset

Github dataset: https://github.com/FakeNewsChallenge/fnc-1

In [171]:
def LoadDatasets():
    url1 = 'https://github.com/ivabu/fnc-1/blob/master/train_bodies.csv?raw=true'
    train_bodies = pd.read_csv(url1)

    url2 = 'https://github.com/ivabu/fnc-1/blob/master/train_stances.csv?raw=true'
    train_stances = pd.read_csv(url2)
    
    #merge the training datasets for bodies and headlines
    train_stances_bodies = pd.merge(left=train_bodies, right=train_stances, left_on='Body ID', right_on='Body ID')
    
    url3 = 'https://github.com/ivabu/fnc-1/blob/master/test_bodies.csv?raw=true'
    test_bodies = pd.read_csv(url3)

    url4 = 'https://github.com/ivabu/fnc-1/blob/master/test_stances_unlabeled.csv?raw=true'
    test_stances = pd.read_csv(url4)
    
    #merge the testing datasets for bodies and headlines
    test_stances_bodies = pd.merge(left=test_bodies, right=test_stances, left_on='Body ID', right_on='Body ID')

    
    return train_bodies, train_stances, train_stances_bodies, test_bodies, test_stances, test_stances_bodies

train_bodies,train_stances,train_stances_bodies,test_bodies,test_stances,test_stances_bodies = LoadDatasets()

In [172]:
def DescribeDatasets(full_dataset, stances, bodies):
    print("Count of stances_bodies: \n\n" + str(full_dataset.count().to_string()))
    print("\nCount of stances: \n\n" + str(stances.count().to_string()))
    print("\nCount of bodies: \n\n" + str(bodies.count().to_string()))
    print("\nCount of unique stances: \n\n" + str(stances.nunique().to_string()))
    if 'Stance' in stances.columns:
        proportions = stances["Stance"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
        print("\nStance proportions: \n\n" + str(proportions))

In [173]:
print("\nTesting Dataset Summary: \n")    
DescribeDatasets(train_stances_bodies, train_stances, train_bodies)
print("\nTesting Dataset Summary: \n")
DescribeDatasets(test_stances_bodies, test_stances, test_bodies)


Testing Dataset Summary: 

Count of stances_bodies: 

Body ID        49972
articleBody    49972
Headline       49972
Stance         49972

Count of stances: 

Headline    49972
Body ID     49972
Stance      49972

Count of bodies: 

Body ID        1683
articleBody    1683

Count of unique stances: 

Headline    1648
Body ID     1683
Stance         4

Stance proportions: 

unrelated    73.1%
discuss      17.8%
agree         7.4%
disagree      1.7%
Name: Stance, dtype: object

Testing Dataset Summary: 

Count of stances_bodies: 

Body ID        25413
articleBody    25413
Headline       25413

Count of stances: 

Headline    25413
Body ID     25413

Count of bodies: 

Body ID        904
articleBody    904

Count of unique stances: 

Headline    894
Body ID     904


In [174]:
#taking a small subset to test the summarizer faster
obervation_headlines = train_stances.loc[((train_stances["Body ID"] == 1923 )
                                        |(train_stances["Body ID"] == 722))
                                        & (train_stances["Stance"] != 'unrelated')]
obervation_headlines = obervation_headlines.head(100).reset_index()
obervation_headlines

Unnamed: 0,index,Headline,Body ID,Stance
0,4,Spider burrowed through tourist's stomach and up into his chest,1923,disagree
1,2287,"No, a spider (probably) didn't crawl through a man's body for several days",1923,agree
2,3817,Tropical spider burrowed under man's skin through appendix scar and lived there for THREE DAYS,1923,disagree
3,5685,Dylan Thomas Finds Tropical Spider Burrowed Under Skin,1923,disagree
4,6235,Bali Awry,1923,discuss
5,7799,"Spider burrowed into appendix scar, crawled through WA man's body",1923,disagree
6,11381,Expert casts doubt on Bunbury man Dylan Thomas’s burrowing stomach-spider story,1923,agree
7,12660,Web of confusion as scientists cast doubt on man's claims that a spider burrowed into his stomach through his SCAR,1923,agree
8,12889,Tropical spider burrows under man's skin through scar,1923,disagree
9,16508,The Guy Who Said a Spider Burrowed Under His Skin? Experts Say…,1923,discuss


### Subset dataset for code testing

In [175]:
def SubsetDataset(data): 
    data = data.sample(frac=0.5, random_state = 1)
    return data
train_bodies = SubsetDataset(train_bodies)

In [176]:
train_bodies.shape

(842, 2)

In [13]:
observation_body_headline = train_stances_bodies[(train_stances_bodies['Body ID'].isin([1923,722]))
                                                 & (train_stances_bodies["Stance"] != 'unrelated')]

texts = observation_body_headline[['articleBody','Body ID']].drop_duplicates().reset_index()
texts

Unnamed: 0,index,articleBody,Body ID
0,12050,"When Tim Cook finally announced the long-awaited Apple Watch on September 9, the company promised an “early 2015” release. Since then, it’s really been anybody’s guess as to when, exactly, that might end up being. A few new rumors, however, suggest that we’ll get the Apple Watch sometime in February…but it’s likely that it’ll be later than that.\n\nToday, a post on 9to5Mac points the way back to a report from Chinese site Feng, which itself cites “Taiwanese media” as saying that the Apple Watch is likely to hit sometime in February, though not without a fair bit of challenges to overcome. The reports say that the supply of sapphire crystal – which will compose the displays of the highest-end Apple Watch units – has led to some difficulties in manufacturing.\n\nHowever, it should be noted that if sapphire is the biggest problem facing the Apple Watch, then Apple’s in good shape. The software itself is still in development, and as the 9to5Mac post points out, the company has yet to release the WatchKit SDK for developers to start making all the cool apps that’ll run on the device.\n\nAnother report from Asia this week claims that production on the Apple Watch isn’t set to start at manufacturing partner Quanta until sometime in January – and with only a month of actual production, it seems doubtful that Apple would truly be ready to sell the Watch by February. And last week, an unnamed Apple insider was quoted as saying that the company would be “lucky to ship it by Valentine’s Day.” If that quote is true, then February seems like a tall order.\n\nThat said, March doesn’t seem too out of the question. Nor does April. In fact, considering that Motorola promised the Moto 360 smartwatch by “summer 2014,” and didn’t launch it until early September, it would seem that OEMs are pretty loose about their definitions for launch windows. “Early 2015” is simply anything before the very last day in June. And barring any major disasters, we should start to see Apple Watches on consumers’ wrists long before then.\n\nThe main question, though, is what cool stuff will the competition cook up between then and now…",722
1,35850,"Fear not arachnophobes, the story of Bunbury's ""spiderman"" might not be all it seemed.\n\nPerth scientists have cast doubt over claims that a spider burrowed into a man's body during his first trip to Bali. The story went global on Thursday, generating hundreds of stories online.\n\nEarlier this month, Dylan Thomas headed to the holiday island and sought medical help after experiencing ""a really burning sensation like a searing feeling"" in his abdomen.\n\nDylan Thomas says he had a spider crawl underneath his skin.\n\nThomas said a specialist dermatologist was called in and later used tweezers to remove what was believed to be a ""tropical spider"".\n\nBut it seems we may have all been caught in a web... of misinformation.\n\nArachnologist Dr Volker Framenau said whatever the creature was, it was ""almost impossible"" for the culprit to have been a spider.\n\n""If you look at a spider, the fangs, the mouth parts they have, they are not able to burrow. They can't get through skin,"" he said.\n\n""We thought it may have been something like a mite, there are a few different parasitic mites out there, which can sometimes look a bit like a spider. I can't think of any spider which could do this to a person.""\n\nDr Mark Harvey from the Western Australian Museum agreed and said he found the case ""bizarre"".\n\n""I must confess I was amazed because I've never heard of a spider being able to survive under the skin of a human, or indeed any mammal,"" he said.\n\n""Spiders need air to breathe, they have spiracles on the sides of their bodies where air comes into their system through a series of what we call book lungs. Being under the skin of somebody, I would have thought they wouldn't have enough air to survive.\n\n""Even if it was a mite, I've never seen anything like this. Even if it was an insect, I've never heard of an insect crawling under the skin like this, so it really is a remarkable case.""\n\nDr Harvey said spiders were widely feared in the community and often were the subject of urban legends.\n""We hear about people going on holidays and having spiders lay eggs under the skin. Then [the baby spiders] burst out when they return from their holiday in the tropics,"" he said.\n\n""None of those are true, they're just made up stories.\n\n""They're not actually able to dig through the skin, that's why this case is so unusual. Some can burrow into soil, but they have to remove soil particles one at a time if they want to do that.""\n\nSomething which is true, according to Dr Harvey, is that certain arachnids do ""live on humans"".\n\n""We all have mites living on our faces. They're follicle mites, but they're absolutely miniscule and you can't see them. We transmit them to our children when we have kids,"" he said.\n\n""They live in the bases of hair follicles on our faces and in some of the pores in our skin. Those mites are so small, you can't see them, and they're not going to cause a blemish on the skin like this lad has on his stomach.""\n\nDr Framenau said that much of the confusion could be eliminated by keeping or catching the creepy crawly offender, dead or alive, and enlisting the help of experts.\n\n""It would be great if they collected it or took a photo of it,"" he said.\n\n""If you have been bitten by something, the best thing you can do is collect it and submit it to a museum for identification before these things go viral.""\n\nDylan Thomas has been contacted for comment.\n\n- WA Today",1923


### Extract word embeddings with GloVe

In [153]:
embeddings_index = dict()
f = open('./glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


## Processing and Text Summarization 

Steps: <br>
<br>
1. Text Preprocessing: remove stopwords, punctuations, numbers, and special characters, and introduce lowercase <br>
2. Vectorize sentences <br>
3. Caluclate sentence similarity <br>
4. Rank sentences with PageRank algorithm <br>
5. Extract top N sentences as the summary <br>

In [154]:
def getArticles(data):
    articles = []
    for s in data['articleBody']:
        articles.append(sent_tokenize(s))
        
    return articles

In [160]:
def ProcessArticle(article):
    ''' This is the main article processing function which returns the summary of an article '''
    
    clean_sentences = getCleanSentences(article)
    sentence_vectors = getSentenceVectors(clean_sentences)
    
    sentence_count = len(article)
    sim_mat_shape = []
    sim_mat_shape = np.zeros([sentence_count, sentence_count])    
    sim_mat = getSimilartyMatrix(sim_mat_shape, sentence_vectors)
    
    sentence_scores = getSentenceScores(sim_mat)
    ranked_sentences = getRankedSentences(article, sentence_scores)
    
    #printRankedSentences(ranked_sentences, 10)

    return getSummary(ranked_sentences)

    
def getCleanSentences(article):
    ''' This helper function strips special characters from string other than 
    lowercase and uppercase letters, then changes all to lowercase'''
    
    sentences = (pd.Series(article).str.replace("[^a-zA-Z]", " ").str.lower()).tolist()
    clean_sentences = []
    for sentence in sentences:
        clean_sentences.append(removeStopwords(sentence))
    return clean_sentences


def removeStopwords(sentence):
    # Convert text to lowercase and split to a list of words
    tokens = word_tokenize(sentence)
    # Remove stop words
    tokens_no_stopwords = [t for t in tokens if t not in stop_words]
    return " ".join(tokens_no_stopwords)
    #https://stackoverflow.com/questions/37605710/tokenize-a-paragraph-into-sentence-and-then-into-words-in-nltk


def getSentenceVectors(clean_sentences):
    ''' This helper function takes in cleaned sentences and for every sentence, 
    summing the word vectors on the sentence level, and then the vector values 
    of the sentence are divided by the number of words in the sentence  '''
    
    sentence_vectors = []
    for s in clean_sentences:   
        if (len(s) != 0) & (s.isspace() == False):
            v = sum([embeddings_index.get(w, np.zeros((50,))) for w in s.split()])/(len(s.split())+0.001)
        else:
            v = np.zeros((50,))
            
        sentence_vectors.append(v)
            
    return sentence_vectors

def getSimilartyMatrix(sim_mat, sentence_vectors):
    ''' This helper function computes how similar a sentence is to every other 
    sentence in the dataset by comparing sentence vectors. Creates an empty 
    similarity matrix and populates it with cosine similarities of the sentences. '''
    
    count = len(sentence_vectors)
    for i in range(count):
        for j in range(count):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,50), sentence_vectors[j].reshape(1,50))[0,0]
                # calculation from An Introduction to Text Summarization using the TextRank Algorithm 
                # https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/ 
    return sim_mat
                
def getSentenceScores(sim_mat):
    ''' This helper function calls the pageRank algorithm which converts the 
    similarity matrix into a graph and generates scores for sentence ranking. '''
    
    return nx.pagerank(nx.from_numpy_array(sim_mat), max_iter=1000000, tol=1e-06)

def getRankedSentences(article, sentence_scores):
    ''' This helper function sorts the sentences by their scores in the descending order. '''
    
    return sorted(((sentence_scores[i],s) for i,s in enumerate(article)), reverse=True)

def getSummary(sentences):
    ''' This helper function receives the ranked sentences and appends the top 5. '''
    
    summary = []
    for i,s in enumerate(sentences):
        if i <= 5:
            summary.append(s[1])
    return summary
    

In [161]:
def SummarizeTexts(data):
    ''' This main function summarizes all individual articles and merges with the original dataset '''
    
    summaries = []
    for article in getArticles(data):
        summary = ProcessArticle(article)
        summaries.append(" ".join(sentence for sentence in summary))
    text_summaries = pd.DataFrame(pd.Series(summaries), columns = ['articleSummary'])
    #todo clean square brackets
    data_including_summaries = data.merge(text_summaries, left_index=True, right_index=True)
    return data_including_summaries
    

In [162]:
test_run = SummarizeTexts(texts)
test_run

Unnamed: 0,index,articleBody,Body ID,articleSummary
0,12050,"When Tim Cook finally announced the long-awaited Apple Watch on September 9, the company promised an “early 2015” release. Since then, it’s really been anybody’s guess as to when, exactly, that might end up being. A few new rumors, however, suggest that we’ll get the Apple Watch sometime in February…but it’s likely that it’ll be later than that.\n\nToday, a post on 9to5Mac points the way back to a report from Chinese site Feng, which itself cites “Taiwanese media” as saying that the Apple Watch is likely to hit sometime in February, though not without a fair bit of challenges to overcome. The reports say that the supply of sapphire crystal – which will compose the displays of the highest-end Apple Watch units – has led to some difficulties in manufacturing.\n\nHowever, it should be noted that if sapphire is the biggest problem facing the Apple Watch, then Apple’s in good shape. The software itself is still in development, and as the 9to5Mac post points out, the company has yet to release the WatchKit SDK for developers to start making all the cool apps that’ll run on the device.\n\nAnother report from Asia this week claims that production on the Apple Watch isn’t set to start at manufacturing partner Quanta until sometime in January – and with only a month of actual production, it seems doubtful that Apple would truly be ready to sell the Watch by February. And last week, an unnamed Apple insider was quoted as saying that the company would be “lucky to ship it by Valentine’s Day.” If that quote is true, then February seems like a tall order.\n\nThat said, March doesn’t seem too out of the question. Nor does April. In fact, considering that Motorola promised the Moto 360 smartwatch by “summer 2014,” and didn’t launch it until early September, it would seem that OEMs are pretty loose about their definitions for launch windows. “Early 2015” is simply anything before the very last day in June. And barring any major disasters, we should start to see Apple Watches on consumers’ wrists long before then.\n\nThe main question, though, is what cool stuff will the competition cook up between then and now…",722,"Another report from Asia this week claims that production on the Apple Watch isn’t set to start at manufacturing partner Quanta until sometime in January – and with only a month of actual production, it seems doubtful that Apple would truly be ready to sell the Watch by February. A few new rumors, however, suggest that we’ll get the Apple Watch sometime in February…but it’s likely that it’ll be later than that. Today, a post on 9to5Mac points the way back to a report from Chinese site Feng, which itself cites “Taiwanese media” as saying that the Apple Watch is likely to hit sometime in February, though not without a fair bit of challenges to overcome. “Early 2015” is simply anything before the very last day in June. And last week, an unnamed Apple insider was quoted as saying that the company would be “lucky to ship it by Valentine’s Day.” If that quote is true, then February seems like a tall order. When Tim Cook finally announced the long-awaited Apple Watch on September 9, the company promised an “early 2015” release."
1,35850,"Fear not arachnophobes, the story of Bunbury's ""spiderman"" might not be all it seemed.\n\nPerth scientists have cast doubt over claims that a spider burrowed into a man's body during his first trip to Bali. The story went global on Thursday, generating hundreds of stories online.\n\nEarlier this month, Dylan Thomas headed to the holiday island and sought medical help after experiencing ""a really burning sensation like a searing feeling"" in his abdomen.\n\nDylan Thomas says he had a spider crawl underneath his skin.\n\nThomas said a specialist dermatologist was called in and later used tweezers to remove what was believed to be a ""tropical spider"".\n\nBut it seems we may have all been caught in a web... of misinformation.\n\nArachnologist Dr Volker Framenau said whatever the creature was, it was ""almost impossible"" for the culprit to have been a spider.\n\n""If you look at a spider, the fangs, the mouth parts they have, they are not able to burrow. They can't get through skin,"" he said.\n\n""We thought it may have been something like a mite, there are a few different parasitic mites out there, which can sometimes look a bit like a spider. I can't think of any spider which could do this to a person.""\n\nDr Mark Harvey from the Western Australian Museum agreed and said he found the case ""bizarre"".\n\n""I must confess I was amazed because I've never heard of a spider being able to survive under the skin of a human, or indeed any mammal,"" he said.\n\n""Spiders need air to breathe, they have spiracles on the sides of their bodies where air comes into their system through a series of what we call book lungs. Being under the skin of somebody, I would have thought they wouldn't have enough air to survive.\n\n""Even if it was a mite, I've never seen anything like this. Even if it was an insect, I've never heard of an insect crawling under the skin like this, so it really is a remarkable case.""\n\nDr Harvey said spiders were widely feared in the community and often were the subject of urban legends.\n""We hear about people going on holidays and having spiders lay eggs under the skin. Then [the baby spiders] burst out when they return from their holiday in the tropics,"" he said.\n\n""None of those are true, they're just made up stories.\n\n""They're not actually able to dig through the skin, that's why this case is so unusual. Some can burrow into soil, but they have to remove soil particles one at a time if they want to do that.""\n\nSomething which is true, according to Dr Harvey, is that certain arachnids do ""live on humans"".\n\n""We all have mites living on our faces. They're follicle mites, but they're absolutely miniscule and you can't see them. We transmit them to our children when we have kids,"" he said.\n\n""They live in the bases of hair follicles on our faces and in some of the pores in our skin. Those mites are so small, you can't see them, and they're not going to cause a blemish on the skin like this lad has on his stomach.""\n\nDr Framenau said that much of the confusion could be eliminated by keeping or catching the creepy crawly offender, dead or alive, and enlisting the help of experts.\n\n""It would be great if they collected it or took a photo of it,"" he said.\n\n""If you have been bitten by something, the best thing you can do is collect it and submit it to a museum for identification before these things go viral.""\n\nDylan Thomas has been contacted for comment.\n\n- WA Today",1923,"Even if it was an insect, I've never heard of an insect crawling under the skin like this, so it really is a remarkable case."" ""I must confess I was amazed because I've never heard of a spider being able to survive under the skin of a human, or indeed any mammal,"" he said. ""They're not actually able to dig through the skin, that's why this case is so unusual. Being under the skin of somebody, I would have thought they wouldn't have enough air to survive. ""Even if it was a mite, I've never seen anything like this. I can't think of any spider which could do this to a person."""


## Prepare Datasets to be used in Part 2. LSTM

#### Generate summaries for training bodies

In [177]:
def storeSummaries(summarize_articles, merge_original):
    summaries = SummarizeTexts(summarize_articles)
    #merge the training datasets for bodies summarized and default dataset
    data_incl_summaries = pd.merge(merge_original,
                        summaries[['Body ID','articleSummary']],
                        on='Body ID', 
                        how='left')
    return data_incl_summaries

In [178]:
training_dataset_summaries = storeSummaries(train_bodies, train_stances_bodies)
training_dataset_summaries.head()

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  err = np.absolute(x - xlast).sum()


PowerIterationFailedConvergence: (PowerIterationFailedConvergence(...), 'power iteration failed to converge within 1000000 iterations')

In [None]:
testing_dataset_summaries = storeSummaries(test_bodies, test_stances_bodies)
testing_dataset_summaries.head()

In [None]:
training_dataset_summaries = SummarizeTexts(train_bodies)

In [None]:
#merge the training datasets for bodies summarized and default dataset
train_stances_bodies_summaries = pd.merge(train_stances_bodies,
                                          training_dataset_summaries[['Body ID','articleSummary']],
                                          on='Body ID', 
                                          how='left')