In [1]:
import re
import warnings
import logging
import joblib
import requests
import pysentiment as ps
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from summarizer import Summarizer

# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed for the whole kernel session.
warnings.filterwarnings("ignore")
# Configure the root logger at NOTSET (the lowest level), so records of every
# severity are emitted -- including the library INFO lines that show up in the
# cell outputs of this notebook.
logging.basicConfig(level=logging.NOTSET)

I0728 16:58:49.606134  8320 file_utils.py:39] PyTorch version 1.2.0 available.


In [2]:
# Heavy NLP objects are built lazily, once, and then shared by every
# add_content() call instead of being re-created for each URL.
_ADD_CONTENT_MODELS = {}


def _get_hiv4():
    """Return the shared HIV4 sentiment scorer, creating it on first use."""
    if "hiv4" not in _ADD_CONTENT_MODELS:
        _ADD_CONTENT_MODELS["hiv4"] = ps.hiv4.HIV4()
    return _ADD_CONTENT_MODELS["hiv4"]


def _get_summarizer():
    """Return the shared BertSum summarizer, creating it on first use."""
    if "summarizer" not in _ADD_CONTENT_MODELS:
        _ADD_CONTENT_MODELS["summarizer"] = Summarizer()
    return _ADD_CONTENT_MODELS["summarizer"]


def add_content(url, ratio=0.8, polarity_threshold=0.85, timeout=30):
    """
    Download a page and derive three text variants from its <p> elements.

    Args:
        url: address of the page to download.
        ratio: forwarded to the BertSum summarizer as its `ratio` argument.
        polarity_threshold: a paragraph is kept in `res_ps` when the absolute
            value of its HIV4 'Polarity' score is at least this value.
        timeout: seconds passed to `requests.get` so a stalled connection
            cannot block the caller forever.

    Return:
        res_origin: complete paragraph string
        res_ps: important sentence string
        res_bertsum: filtered string by BertSum

    Raises:
        requests.RequestException: on connection failure, timeout, or an HTTP
            error status (an error page is not parsed as article text).
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # The first and the last <p> element are discarded
    # (presumably page boilerplate around the article body -- TODO confirm).
    paragraphs = [p.text for p in soup.find_all('p')][1:-1]
    # NOTE(review): paragraphs are concatenated without any separator, exactly
    # as before; confirm that downstream sentence splitting copes with that.
    res_origin = "".join(paragraphs)

    # Keep only the paragraphs whose polarity is strongly positive or negative.
    hiv4 = _get_hiv4()
    strong_paragraphs = []
    for text in paragraphs:
        score = hiv4.get_score(hiv4.tokenize(text))
        if abs(float(score['Polarity'])) >= polarity_threshold:
            strong_paragraphs.append(text)
    res_ps = "".join(strong_paragraphs)

    # Nothing to summarize: skip the model entirely and return an empty summary.
    if res_origin.strip():
        result = _get_summarizer()(res_origin, ratio=ratio)
        res_bertsum = ''.join(result)
    else:
        res_bertsum = ""
    return (res_origin, res_ps, res_bertsum)

In [3]:
# joblib.load unpickles the file, and unpickling can execute arbitrary code:
# only point this at data files you trust.
df = joblib.load("../data/sp500_top100_v1.bin")
# Keep the first 10 rows only. The explicit .copy() detaches the subset from
# the loaded frame, so adding columns below is not a chained assignment on a
# slice (the pattern behind pandas' SettingWithCopyWarning).
df = df.iloc[0:10, :].copy()

# Register the progress_apply method on pandas objects.
tqdm.pandas()
# add_content returns a 3-tuple per row; wrapping it in pd.Series spreads the
# tuple over three columns, assigned in the order listed on the left.
df[["content", "ps_content", "bs_content"]] = df.progress_apply(lambda row: pd.Series(add_content(row["url"])), axis=1)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

I0728 16:59:02.322458  8320 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\YangWang/.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
I0728 16:59:02.323458  8320 configuration_utils.py:321] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

I0728 16:59:02.774490  8320 modeling_utils.py:650] loading weights file https://cdn.huggingface.co/

I0728 17:00:10.688786  8320 modeling_utils.py:650] loading weights file https://cdn.huggingface.co/bert-large-uncased-pytorch_model.bin from cache at C:\Users\YangWang/.cache\torch\transformers\73e65a4648c1a5eab31ecea94e04a92a7168cd7089d588b68e5bc057aff40421.4d5343a4b979c4beeaadef17a0453d1bb183dd9b084f58b84c7cc781df343ae6
I0728 17:00:19.303278  8320 tokenization_utils.py:1015] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt from cache at C:\Users\YangWang/.cache\torch\transformers\9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
I0728 17:00:24.517315  8320 configuration_utils.py:285] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at C:\Users\YangWang/.cache\torch\transformers\6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce




In [4]:
# Display the frame, including the content / ps_content / bs_content columns
# added by the previous cell.
df

Unnamed: 0,title,date,query,url,ticker,content,ps_content,bs_content
0,BRIEF-Apple Inc Says Not Allowing Entertainmen...,2020-03-15,Apple Inc.,https://www.reuters.com/article/idUSFWN2B61K2,AAPL,March 14 (Reuters) - Apple Inc: * APPLE INC SA...,* APPLE INC SAYS NOT ALLOWING ENTERTAINMENT OR...,March 14 (Reuters) - Apple Inc: * APPLE INC SA...
1,Apple signs multi-year deals with major music ...,2020-03-12,Apple Inc.,https://www.reuters.com/article/idUSKBN20Z33J,AAPL,(Reuters) - Apple Inc has sealed multi-year li...,(Reuters) - Apple Inc has sealed multi-year li...,(Reuters) - Apple Inc has sealed multi-year li...
2,Apple signs multi-year deals with major music ...,2020-03-12,Apple Inc.,https://www.reuters.com/article/idUSL4N2B54T2,AAPL,(Reuters) - Apple Inc has sealed multi-year li...,(Reuters) - Apple Inc has sealed multi-year li...,(Reuters) - Apple Inc has sealed multi-year li...
3,Chinese regulators remove 'Plague Inc' game fr...,2020-02-28,Apple Inc.,https://www.reuters.com/article/idUSKCN20M043,AAPL,BEIJING/SHANGHAI (Reuters) - The video game “P...,The regulator did not respond to Reuters phone...,BEIJING/SHANGHAI (Reuters) - The video game “P...
4,UPDATE 1-Chinese regulators remove 'Plague Inc...,2020-02-28,Apple Inc.,https://www.reuters.com/article/idUSL3N2AS0OO,AAPL,BEIJING/SHANGHAI (Reuters) - The video game “P...,The regulator did not respond to Reuters phone...,BEIJING/SHANGHAI (Reuters) - The video game “P...
5,Apple launches new MacBook Pro,2020-05-04,Apple Inc.,https://www.reuters.com/article/idUSL4N2CM29Z,AAPL,May 4 (Reuters) - Apple Inc on Monday launched...,Apple said here its new lineup of MacBook Pro ...,May 4 (Reuters) - Apple Inc on Monday launched...
6,Broadcom to supply wireless components to Apple,2020-01-23,Apple Inc.,https://www.reuters.com/article/idUSKBN1ZM32H,AAPL,(Reuters) - Chipmaker Broadcom Inc (AVGO.O) sa...,(Reuters) - Chipmaker Broadcom Inc (AVGO.O) sa...,(Reuters) - Chipmaker Broadcom Inc (AVGO.O) sa...
7,Apple expands services business to markets in ...,2020-04-21,Apple Inc.,https://www.reuters.com/article/idUSL1N2C900N,AAPL,(Reuters) - Apple Inc (AAPL.O) on Tuesday said...,(Reuters) - Apple Inc (AAPL.O) on Tuesday said...,(Reuters) - Apple Inc (AAPL.O) on Tuesday said...
8,"Factbox: How to watch Apple TV+, Apple's entry...",2019-11-01,Apple Inc.,https://www.reuters.com/article/idUSKBN1XB3TM,AAPL,LOS ANGELES (Reuters) - Apple Inc (AAPL.O) unv...,Here are details on how to subscribe and watch...,LOS ANGELES (Reuters) - Apple Inc (AAPL.O) unv...
9,Apple to expand operations in India - IT minister,2019-11-25,Apple Inc.,https://www.reuters.com/article/idUSL4N2852IX,AAPL,"NEW DELHI, Nov 25 (Reuters) - India’s informat...","NEW DELHI, Nov 25 (Reuters) - India’s informat...","NEW DELHI, Nov 25 (Reuters) - India’s informat..."
