In [1]:
import re
import sys
import time
import config
import warnings
import requests
import joblib
import pysentiment as ps
import pandas as pd
import numpy as np
import yfinance as yf
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from contractions import contractions_dict
from zero_shot_learner import extend_df_with_cos_sim
from summarizer import Summarizer
warnings.filterwarnings("ignore")

In [2]:
def add_content(url, ratio=0.8):
    """
    Return:
        res_origin: complete paragraph string
        res_ps: important sentence string
        res_bertsum: filtered string by BertSum
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html.parser')
    paragraph = soup.find_all('p')
    paragraph = [p.text for p in paragraph]
    paragraph = paragraph[1:-1]
    res_origin = "".join(paragraph)
    hiv4_function = ps.hiv4.HIV4()
    po = []
    for p in paragraph:
        tokens = hiv4_function.tokenize(p)
        s = hiv4_function.get_score(tokens)
        po.append(s['Polarity'])
    res = []
    for i, p in enumerate(po):
        if(float(p) >= 0.85 or float(p) <= -0.85):
            res.append(paragraph[i])
    res_ps = "".join(res)
    bert_summarizer = Summarizer()
    result = bert_summarizer(res_origin, ratio=ratio)
    res_bertsum = ''.join(result)
    return (res_origin, res_ps, res_bertsum)


class NewsPreprocessor:
    """
    Data preprocessing class.
    """
    def __init__(self, contractions_dict, lower=True, rm_stopwords=False):
        """
        :param contractions_dict: dict
        :param lower: bool
        :param rm_stopwords: bool
        """
        self.contractions_dict = contractions_dict
        self.lower = lower
        self.rm_stopwords = rm_stopwords

    def remove_unicode(self, text):
        """
        Removes unicode strings like "\u002c" and "x96"
        """
        text = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
        text = re.sub(r'[^\x00-\x7f]', r'', text)
        return text

    # Function for expanding contractions
    def expand_contractions(self, text, contractions_dict):
        """
        Finding contractions. (e.g. you've -> you have)
        """
        # Regular expression for finding contractions
        contractions_re = re.compile('(%s)' % '|'.join(self.contractions_dict.keys()))

        def replace(match):
            return contractions_dict[match.group(0)]

        return contractions_re.sub(replace, text)

    def remove_stopwords(self, text):
        pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        text = pattern.sub('', text)
        return text

    def remove_digits(self, text):
        nums = set(map(int, range(10)))
        text = ''.join(i for i in text if i not in nums)
        return text

    def ultimate_clean(self, text):
        if self.lower:
            text = text.lower()
        if self.rm_stopwords:
            text = self.remove_stopwords(text)
        text = self.remove_unicode(text)
        text = self.expand_contractions(text, self.contractions_dict)
        text = self.remove_digits(text)
        return text

In [4]:
# Load data
df = joblib.load("../data/sp500_top100_v1.bin")
df.drop_duplicates(subset="title", inplace=True)
print("Get content...")
tqdm.pandas()
df[["content", "ps_content", "bs_content"]] = df.progress_apply(lambda row: pd.Series(add_content(row["url"])), axis=1)
print("Done!")

# Clean data
preprocessor = NewsPreprocessor(contractions_dict=contractions_dict)
print("Start cleaning title.")
df["clean_title"] = df["title"].progress_apply(lambda x: preprocessor.ultimate_clean(x))
print("Start cleaning pysentiment content.")
df["clean_ps_content"] = df["ps_content"].progress_apply(lambda x: preprocessor.ultimate_clean(x))
print("Start cleaning BertSum content.")
df["clean_bs_content"] = df["bs_content"].progress_apply(lambda x: preprocessor.ultimate_clean(x))
df = extend_df_with_cos_sim(df=df, col="clean_ps_content", labels=["stock", "finance"], sort_by="stock")
print(df)

joblib.dump(df, "../data/sp500_top100_v2.bin", compress=5)

Get content...


HBox(children=(FloatProgress(value=0.0, max=97326.0), HTML(value='')))




ConnectionError: HTTPSConnectionPool(host='www.reuters.com', port=443): Max retries exceeded with url: /article/idUSASB0AYF4 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000027089DA3FD0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [10]:
# Load data
import swifter
start = time.time()
df = joblib.load("../data/sp500_top100_v1.bin")
df.drop_duplicates(subset="title", inplace=True)
df.drop_duplicates(subset="url", inplace=True)
print("Get content...")
# tqdm.pandas()
# df[["content"]] = df.progress_apply(lambda row: pd.Series(add_content(row["url"])), axis=1)
df_temp2 = pd.DataFrame()
for ticker in df.ticker.unique().tolist():
    print("Start", ticker)
    df_temp = df[df["ticker"] == ticker]
    df_temp["content"] = df_temp["url"].swifter.apply(add_content, axis=1)
    df_temp2 = pd.concat([df_temp2, df_temp], axis=0)
    print(len(df_temp2))
    time.sleep(20)
print("Done!")
print(df)
print(time.time()-start)

joblib.dump(df, "../data/sp500_top100_content_v1.bin", compress=5)

Get content...
Start AAPL


OSError: Unable to load weights from pytorch checkpoint file. If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. 