# Summarization Comparison

### Imports

In [1]:
!pip install -U transformers kaggle

Requirement already up-to-date: transformers in /usr/local/lib/python3.6/dist-packages (3.4.0)
Requirement already up-to-date: kaggle in /usr/local/lib/python3.6/dist-packages (1.5.9)


In [2]:
import numpy as np

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelWithLMHead
import torch
import pandas as pd
import time
import gc

In [3]:
#from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))

### Read in Data
Reads in the 3 csv files, drops unneeded columns, then concatenates them together. 

In [4]:
from google.colab import files

# Uncomment this line to upload data file
uploaded = files.upload()

In [5]:
df = pd.read_csv('data.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...


In [7]:
df.describe

<bound method NDFrame.describe of    Unnamed: 0     id  ... url                                            content
0           0  17283  ... NaN  WASHINGTON  —   Congressional Republicans have...

[1 rows x 10 columns]>

## Summarizing Using Different Models

Used different models through huggingface's transformers

### T5

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
tokenizer = AutoTokenizer.from_pretrained("t5-base")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1199.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891691430.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




In [9]:
start = time.time()
inputs = tokenizer.encode("summarize: " + df.iloc[0]['content'], return_tensors="pt", max_length=512, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_T5 = tokenizer.decode(outputs[0])
end = time.time()
t5_time = (end - start)
print(t5_time)

18.0212619304657


In [10]:
del tokenizer
del model
gc.collect()

551

### BERT

In [11]:
model = AutoModelWithLMHead.from_pretrained('bert-base-cased')
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




In [12]:
start = time.time()
inputs = tokenizer.encode("summarize: " + df.iloc[0]['content'], return_tensors="pt", max_length=100, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_BERT = tokenizer.decode(outputs[0])
end = time.time()
BERT_time = (end-start)
print(BERT_time)

104.34373188018799


In [13]:
del tokenizer
del model
gc.collect()

4

### GPT 

In [14]:
model = AutoModelWithLMHead.from_pretrained('openai-gpt')
tokenizer = AutoTokenizer.from_pretrained("openai-gpt")



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=656.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=478750579.0, style=ProgressStyle(descri…




Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=815973.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=458495.0, style=ProgressStyle(descripti…

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.





In [15]:
start = time.time()
inputs = tokenizer.encode("summarize: " + df.iloc[0]['content'], return_tensors="pt", max_length=149, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_GPT = tokenizer.decode(outputs[0])
end = time.time()
GPT_time = (end - start)
print(GPT_time)

2.7705235481262207


In [16]:
del tokenizer
del model
gc.collect()

4

### DistilBERT

In [17]:
model = AutoModelWithLMHead.from_pretrained('distilgpt2')
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=762.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=352833716.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [18]:
start = time.time()
inputs = tokenizer.encode("summarize: " + df.iloc[0]['content'], return_tensors="pt", max_length=149, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_DistilBERT = tokenizer.decode(outputs[0])
end = time.time()
DistilBERT_time = (end - start)
print(DistilBERT_time)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


1.8445229530334473


In [19]:
del tokenizer
del model
gc.collect()

4

### GPT2

In [20]:
model = AutoModelWithLMHead.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained("gpt2")



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




In [21]:
start = time.time()
inputs = tokenizer.encode("summarize: " + df.iloc[0]['content'], return_tensors="pt", max_length=149, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_GPT2 = tokenizer.decode(outputs[0])
end = time.time()
GPT2_time = (end - start)
print(GPT2_time)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


2.9879937171936035


In [22]:
del tokenizer
del model
gc.collect()

4

### T5 Trained on Wikihow

In [23]:
model = AutoModelForSeq2SeqLM.from_pretrained("deep-learning-analytics/wikihow-t5-small")
tokenizer = AutoTokenizer.from_pretrained("deep-learning-analytics/wikihow-t5-small")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=736.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242068027.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




In [24]:
start = time.time()
inputs = tokenizer.encode("summarize: " + df.iloc[0]['content'], return_tensors="pt", max_length=1024, truncation=True)
outputs = model.generate(inputs, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_T5_wikihow = tokenizer.decode(outputs[0])
end = time.time()
t5_wikihow_time = (end - start)
print(t5_wikihow_time)

5.159658193588257


In [25]:
del tokenizer
del model
gc.collect()

327

### Pegasus NewsRoom

In [26]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-newsroom")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-newsroom")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1119.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275329241.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=87.0, style=ProgressStyle(description_w…




In [27]:
start = time.time()
batch = tokenizer.prepare_seq2seq_batch(df['content'].tolist()[:1], max_target_length=200, padding='longest')
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
end = time.time()
decoded_pegasus_newsroom = tgt_text[0]
pegasus_newsroom_time = (end - start)
print(pegasus_newsroom_time)

42.53254175186157


In [28]:
del tokenizer
del model
gc.collect()

1375



```
# This is formatted as code
```

### Pegasus CNN Daily Mail

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1120.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1912529.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=65.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=88.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2275327883.0, style=ProgressStyle(descr…




In [None]:
start = time.time()
batch = tokenizer.prepare_seq2seq_batch(df['content'].tolist()[:1], max_target_length=200, padding='longest')
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
end = time.time()
decoded_pegasus_cnn = tgt_text[0]
pegasus_cnn_time = (end - start)
print(pegasus_cnn_time)

In [None]:
del tokenizer
del model
gc.collect()

### Pegasus Multi News

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-multi_news")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-multi_news")

In [None]:
start = time.time()
batch = tokenizer.prepare_seq2seq_batch(df['content'].tolist()[:1], max_target_length=100, padding='longest')
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
end = time.time()
decoded_pegasus_multi = tgt_text[0]
pegasus_multi_time = (end - start)
print(pegasus_multi_time)

In [None]:
del tokenizer
del model
gc.collect()

### Pegasus Billsum

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-billsum")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-billsum")

In [None]:
start = time.time()
batch = tokenizer.prepare_seq2seq_batch(df['content'].tolist()[:1], max_target_length=100, padding='longest')
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
end = time.time()
decoded_pegasus_billsum = tgt_text[0]
pegasus_billsum_time = (end - start)
print(pegasus_billsum_time)

In [None]:
del tokenizer
del model
gc.collect()

## Remove Stop Words from Text And Run Some Models

Testing to see if removing stop words improve performance without reducing any accuracy 

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stoplist = stopwords.words('english')
src_text = df.iloc[0]['content']
clean_word_list = [word for word in src_text.split() if word not in stoplist and word is not '-' and word is not '"' and word is not "'s"]
clean_src_text = ""
print(clean_word_list)
for x in clean_word_list:
  clean_src_text = clean_src_text + x + " "

clean_src_text = clean_src_text[:-1]
print(src_text)
print(clean_src_text)

In [None]:
clean_src_text

### T5

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained('t5-base')
tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [None]:
start = time.time()
inputs = tokenizer.encode("summarize: " + clean_src_text, return_tensors="pt", max_length=512, truncation=True)
outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
decoded_T5_clean = tokenizer.decode(outputs[0])
end = time.time()
t5_time_clean = (end - start)
print(t5_time_clean)

In [None]:
del tokenizer
del model
gc.collect()

### Pegasus CNN Daily Mail

In [None]:
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")

In [None]:
start = time.time()
batch = tokenizer.prepare_seq2seq_batch([clean_src_text], max_target_length=200, padding='longest')
translated = model.generate(**batch)
tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
end = time.time()
decoded_pegasus_cnn_clean = tgt_text[0]
pegasus_cnn_time_clean = (end - start)
print(pegasus_cnn_time_clean)

In [None]:
del tokenizer
del model
gc.collect()

## Results Analysis

### Print All Decoded Summaries

In [None]:
decoded_BERT

In [None]:
decoded_T5

In [None]:
decoded_DistilBERT

In [None]:
decoded_GPT

In [None]:
decoded_DistilBERT

In [None]:
decoded_T5_wikihow

In [None]:
decoded_GPT2

In [None]:
decoded_pegasus_newsroom

In [None]:
decoded_pegasus_cnn

In [None]:
decoded_pegasus_multi

In [None]:
decoded_pegasus_billsum

In [None]:
decoded_pegasus_cnn_clean

In [None]:
decoded_T5_clean

In [None]:
src_text

### Charts

In [None]:
import matplotlib.pyplot as plot
time_array = {
    'Model': ["BERT", "T5", "DistilBERT", "GPT", "GPT2", "T5 wikihow", "Pegasus News Room", "Pegasus CNN DailyMail", "Pegasus Billsum", "Pegasus CNN Clean", "T5 Clean"], 
    'Times': [BERT_time, t5_time, DistilBERT_time, GPT_time, GPT2_time, t5_wikihow_time, pegasus_newsroom_time, pegasus_cnn_time, pegasus_billsum_time, pegasus_cnn_time_clean, t5_time_clean]
    }
df_time = pd.DataFrame(data=time_array)
df_time.sort_values('Times').plot.bar(x="Model", y="Times", title="Time to Summarize for each model")
plot.show()

In [None]:
length_dict = {
    "Model": ["Normal", "BERT", "T5", "DistilBERT", "GPT", "GPT2", "T5 Wikihow", "Pegasus News Room", "Pegasus CNN DailyMail", "Pegasus Multi News", "Pegasus Billsum", "Pegasus CNN Clean", "T5 Clean"], 
    "Length": [
               (len(df.iloc[0]['content'])),
               (len(decoded_BERT)), (len(decoded_T5)),
               (len(decoded_DistilBERT)),
               (len(decoded_GPT)),
               (len(decoded_GPT2)),
               (len(decoded_T5_wikihow)),
               (len(decoded_pegasus_newsroom)),
               (len(decoded_pegasus_cnn)),
               (len(decoded_pegasus_multi)),
               (len(decoded_pegasus_billsum)),
               (len(decoded_pegasus_cnn_clean)),
               (len(decoded_T5_clean)),
               ]
  }
df_length = pd.DataFrame(data=length_dict)
df_length.sort_values('Length').plot.bar(x="Model", y="Length", title="Length of Model Summary")
plot.show()

### ROGUE Metrics

In [None]:
def print_score_results(dictionary):
  print("ROGUE-1")
  for ch in dictionary['rouge-1']:
    print(f"   {ch}: {dictionary['rouge-1'][ch]}")
  print("ROUGE-2")
  for ch in dictionary['rouge-2']:
    print(f"   {ch}: {dictionary['rouge-2'][ch]}")
  print("ROUGE-L")
  for ch in dictionary['rouge-l']:
    print(f"   {ch}: {dictionary['rouge-l'][ch]}")

In [None]:
human_summarized = "WASHINGTON — Congressional Republicans have a new fear when it comes to their health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration’s authority to spend billions of dollars on health insurance subsidies for and Americans, handing House Republicans a big victory on issues. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angeg conservative voters who have been demanding an end to the law for years. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacemen It is a complicated set of dynamics illustrating how a quick legal victory for the House in the Trump era might come with costs that Republicans never anticipated when they took on the Obama White House. Just as important to House Republicans, Judge Collyer found that Congress had the standing to sue the White House on this issue — a ruling that many legal experts said was flawed — and they want that precedent to be set to restore congressional leverage over the executive branch. But on spending power and standing, the Trump administration may come under pressure from advocates of presidential authority to fight the House no matter their shared views on health care, since those precedents could have broad repercussions. Anticipating that the Trump administration might not be inclined to mount a vigorous fight against the House Republicans given the ’s dim view of the health care law, a team of lawyers this month sought to intervene in the case on behalf of two participants in the health care program. In another twist, Donald J. Trump’s administration, worried about preserving executive branch prerogatives, could choose to fight its Republican allies in the House on some central questions in the dispute. “Upon taking office, the Trump administration will evaluate this case and all related aspects of the Affordable Care Act. Eager to avoid an ugly political pileup, Republicans on Capitol Hill and the Trump transition team are gaming out how to handle the lawsuit, which, after the election, has been put in limbo until at least late February by the United States Court of Appeals for the District of Columbia Circuit. “Given that this pending litigation involves the Obama administration and Congress, it would be inappropriate to comment,” said Phillip J. Blando, a spokesman for the Trump transition effort. House Republicans contend that Congress never appropriated the money for the subsidies, as required by the Constitution. The White House said that the spending was a permanent part of the law passed in 2010, and that no annual appropriation was required — even though the administration initially sought one. ” No matter what happens, House Republicans say, they want to prevail on two overarching concepts: the congressional power of the purse, and the right of Congress to sue the executive branch if it violates the Constitution regarding that spending power."""
print(human_summarized)

In [None]:
!pip install -U rouge

In [None]:
from rouge import Rouge 
rouge = Rouge()

In [None]:
scores_t5 = rouge.get_scores(decoded_T5, human_summarized)
print_score_results(scores_t5[0])

In [None]:
scores_BERT = rouge.get_scores(decoded_BERT, human_summarized)
print_score_results(scores_BERT[0])

In [None]:
scores_GPT2 = rouge.get_scores(decoded_GPT2, human_summarized)
print_score_results(scores_GPT2[0])

In [None]:
scores_pegasus_cnn = rouge.get_scores(decoded_pegasus_cnn, human_summarized)
print_score_results(scores_pegasus_cnn[0])

In [None]:
scores_pegasus_cnn_clean = rouge.get_scores(decoded_pegasus_cnn_clean, human_summarized)
print_score_results(scores_pegasus_cnn_clean[0])

In [None]:
scores_t5_clean = rouge.get_scores(decoded_T5_clean, human_summarized)
print_score_results(scores_t5_clean[0])

# Web Scraper PySpark Implementation

In [1]:
!pip install -U pyspark newspaper3k py4j

Requirement already up-to-date: pyspark in /usr/local/lib/python3.6/dist-packages (3.0.1)
Requirement already up-to-date: newspaper3k in /usr/local/lib/python3.6/dist-packages (0.2.8)
Requirement already up-to-date: py4j in /usr/local/lib/python3.6/dist-packages (0.10.9.1)


In [2]:
import requests
import time

from bs4 import BeautifulSoup
from newspaper import Article
from py4j.java_gateway import JavaGateway
from pyspark import SparkConf, SparkContext
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

conf = SparkConf().setAppName("TestApp")
sc = SparkContext('local', conf=conf)
gateway = JavaGateway()

class Scraper:
    """
    This is the class to handle web scraping. It is able to read in and summarize news articles from preset rss feeds.
    """

    # link_dict is a dictionary of links where the key is a given news site's slug and the value is the rss feed's url
    link_dict = {
        'verge': 'https://www.theverge.com/rss/index.xml',
        # 'nyTimes_US': 'https://rss.nytimes.com/services/xml/rss/nyt/US.xml',
        # 'wired_main': 'https://www.wired.com/feed/rss',
        # 'cnet': 'https://www.cnet.com/rss/news/',
    }

    # individual dictionaries to store a given news site's articles, headlines, and links
    verge_dict = {}
    nyTime_dict = {}
    wired_dict = {}
    cnet_dict = {}

    # linking dictionary from slug to article dictionary
    article_to_dict = {
        'verge': verge_dict,
        # 'nyTimes_US': nyTime_dict,
        # 'wired_main': wired_dict,
        # 'cnet': cnet_dict,
    }

    # list of all articles
    articles = []

    # list of links that are frequent in rss feed but that don't need to be scraped
    not_allowed_urls = ['https://www.nytimes.com', 'https://www.nytimes.com/section/us', 'https://www.wired.com',
                        'https://www.cnet.com/#ftag=CAD590a51e']

    # tokenizer for text summarization
    tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail", use_fast=True)
    # Make sure the file is unzipped
    # model for text summarization
    model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")

        
    
    def scrape_all_articles(self):
        """
        Scrapes in all articles from rss feeds in link_dict
        """
        i = 0
        for site in self.link_dict:
            print(f'Started Scraping {site}')
            link = self.link_dict[site]
            res = requests.get(link)
            if res.status_code == 404:
                self.article_to_dict[site]["ERROR"] = "RSS feed responded with 404"

            soup = BeautifulSoup(res.text, 'xml')
            articles = []
            for art in soup.findAll('link')[1:]:

                if site != 'verge':
                    for x in art:
                        art_link = x
                else:
                    art_link = art['href']
                if art_link != link and art_link not in self.not_allowed_urls:
                    try:
                        article = Article(art_link)
                        article.download()
                        article.parse()
                        articles.append(article.text)
                        self.article_to_dict[site][article.title] = {'link': art_link, 'article_loc': i}
                        i += 1
                    except:
                        print(f'ERROR: {art_link}')

            # Creates PySpark RDD and saves it in cache. Then maps the summarize function
            arts_text = sc.parallelize(articles)
            arts_text.cache()
            arts_map = arts_text.map(lambda z: self.summarize(z))
            print("Pre Delete")
            del arts_text
            gc.collect()
            print("Post Delete")
            print("Post Cache")
            self.articles.append(arts_map.collect())
            print(f'Finished Scraping {site}')
        # Replaces the index value in article_to_dict[site][article.title][article_loc] to the summarized article string
        self.update_articles_in_dict()

    def update_articles_in_dict(self):
        """
        Replaces index with article text in dictionary
        """
        for site, headline_dict in self.article_to_dict.items():
            for headline, link_dict in headline_dict.items():
                if isinstance(link_dict['article_loc'], int):
                    link_dict.update({
                        'article_loc': self.articles[link_dict['article_loc']]
                    })

    def summarize(self, art):
        """
        Summarizes the passed in article text
        """
        print("Start Summarizing")
        start_time = time.time()
        batch = self.tokenizer.prepare_seq2seq_batch([art], max_target_length=100)
        translated = self.model.generate(**batch)
        tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
        end_time = time.time()
        time_diff = end_time - start_time
        print(f'TIME: {time_diff}')
        return tgt_text

In [3]:
import gc
gc.collect()

1399

In [4]:
s = Scraper()

In [5]:
s.scrape_all_articles()

Started Scraping verge
Pre Delete
Post Delete
Post Cache


Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/pyspark/serializers.py", line 468, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/usr/local/lib/python3.6/dist-packages/pyspark/cloudpickle.py", line 1097, in dumps
    cp.dump(obj)
  File "/usr/local/lib/python3.6/dist-packages/pyspark/cloudpickle.py", line 357, in dump
    return Pickler.dump(self, obj)
  File "/usr/lib/python3.6/pickle.py", line 409, in dump
    self.save(obj)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 751, in save_tuple
    save(element)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/local/lib/python3.6/dist-packages/pyspark/cloudpickle.py", line 501, in save_function
    self.save_function_tuple(obj)
  File "/usr/local/lib/python3.6/dist-packages/pyspark/cloudpickle.py

PicklingError: ignored

In [None]:
sc.stop()

# Threading?

In [21]:
import requests
import time
import os

from bs4 import BeautifulSoup
from newspaper import Article
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import concurrent.futures

class Scraper:
    """
    This is the class to handle web scraping. It is able to read in and summarize news articles from preset rss feeds.
    """

    # link_dict is a dictionary of links where the key is a given news site's slug and the value is the rss feed's url
    link_dict = {
        'verge': 'https://www.theverge.com/rss/index.xml',
        # 'nyTimes_US': 'https://rss.nytimes.com/services/xml/rss/nyt/US.xml',
        # 'wired_main': 'https://www.wired.com/feed/rss',
        # 'cnet': 'https://www.cnet.com/rss/news/',
    }

    # individual dictionaries to store a given news site's articles, headlines, and links
    verge_dict = {}
    nyTime_dict = {}
    wired_dict = {}
    cnet_dict = {}

    # linking dictionary from slug to article dictionary
    article_to_dict = {
        'verge': verge_dict,
        # 'nyTimes_US': nyTime_dict,
        # 'wired_main': wired_dict,
        # 'cnet': cnet_dict,
    }

    # list of all articles
    articles = []

    # list of links that are frequent in rss feed but that don't need to be scraped
    not_allowed_urls = ['https://www.nytimes.com', 'https://www.nytimes.com/section/us', 'https://www.wired.com',
                        'https://www.cnet.com/#ftag=CAD590a51e']

    # tokenizer for text summarization
    tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail", use_fast=True)
    # Make sure the file is unzipped
    # model for text summarization
    model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")

    def scrape_all_articles(self):
        """
        Scrapes in all articles from rss feeds in link_dict
        """
        i = 0
        for site in self.link_dict:
            print(f'Started Scraping {site}')
            link = self.link_dict[site]
            res = requests.get(link)
            if res.status_code == 404:
                self.article_to_dict[site]["ERROR"] = "RSS feed responded with 404"

            soup = BeautifulSoup(res.text, 'xml')
            articles2 = []
            for art in soup.findAll('link')[1:]:
              if site != 'verge':
                for x in art:
                  art_link = x
              else:
                art_link = art['href']
                if art_link != link and art_link not in self.not_allowed_urls:
                  try:
                    article = Article(art_link)
                    article.download()
                    article.parse()
                    articles2.append(article.text)
                    self.article_to_dict[site][article.title] = {'link': art_link, 'article_loc': i}
                    i += 1
                  except:
                    print(f'ERROR: {art_link}')

            start_time = time.time()
            with concurrent.futures.ProcessPoolExecutor() as executor:
                results = executor.map(self.summarize, articles2)
                # test = self.tokenizer.batch_decode(results, skip_special_tokens=True)
                # print(test)
            end_time = time.time()
            time_diff = end_time - start_time
            print(f'Time Delta: {time_diff}')
            print(f'Finished Scraping {site}')
        # Replaces the index value in article_to_dict[site][article.title][article_loc] to the summarized article string
        # self.update_articles_in_dict()

    def update_articles_in_dict(self):
        """
        Replaces index with article text in dictionary
        """
        for site, headline_dict in self.article_to_dict.items():
            for headline, link_dict in headline_dict.items():
                if isinstance(link_dict['article_loc'], int):
                    link_dict.update({
                        'article_loc': self.articles[link_dict['article_loc']]
                    })

    def summarize(self, art):
        """
        Summarizes the passed in article text
        """
        print("Start Summarizing")
        batch = self.tokenizer.prepare_seq2seq_batch([art])
        translated = self.model.generate(**batch)
        return translated

In [22]:
import gc
gc.collect()

13483

In [23]:
s = Scraper()

In [24]:
s.scrape_all_articles()

Started Scraping verge
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Start Summarizing
Time Delta: 493.7967367172241
Finished Scraping verge


In [25]:
s.verge_dict

{'Apple TV is coming to Xbox consoles on November 10th': {'article_loc': 9,
  'link': 'https://www.theverge.com/2020/11/2/21545980/apple-tv-xbox-app-november-10th-release-date-features'},
 'Apple announces ‘One More Thing’ event for November 10th': {'article_loc': 7,
  'link': 'https://www.theverge.com/2020/11/2/21546136/apple-event-date-time-november-10th-one-more-thing-arm-mac-silicon'},
 'Baby Shark is the most-viewed YouTube video of all time, and there’s no end in sight': {'article_loc': 0,
  'link': 'https://www.theverge.com/21546350/baby-shark-youtube-video-most-popular-despacito-masha'},
 'Moment announces MagSafe-compatible iPhone 12 cases and mounts': {'article_loc': 8,
  'link': 'https://www.theverge.com/2020/11/2/21545486/moment-magsafe-iphone-12-case-mount-announced'},
 'Now you can determine your level of Fauci with this handy meme': {'article_loc': 3,
  'link': 'https://www.theverge.com/2020/11/2/21545725/fauci-tufts-chart-mood-coronavirus-medicine'},
 'OnePlus’ Cyberpun

In [26]:
s.articles

[]