In [33]:
import re
import sys
import time
import config
import warnings
import requests
import joblib
import pysentiment as ps
import pandas as pd
import numpy as np
import yfinance as yf
from collections import defaultdict
from torch.nn import functional as F
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from contractions import contractions_dict
from zero_shot_learner import extend_df_with_cos_sim
from summarizer import Summarizer
warnings.filterwarnings("ignore")

In [2]:
def add_content(url, ratio=0.8, timeout=30):
    """
    Download a page and derive three text variants from its <p> elements.

    The first and the last <p> element are discarded (presumably page
    boilerplate - TODO confirm against the scraped site); the remaining
    paragraph texts are concatenated without any separator.

    :param url: str, address of the page to fetch
    :param ratio: float, forwarded to the BERT summarizer as ``ratio``
    :param timeout: seconds handed to ``requests.get``; without it a stalled
        server would block the call indefinitely
    Return:
        res_origin: complete paragraph string
        res_ps: concatenation of the paragraphs whose HIV4 'Polarity' score
            has an absolute value of at least 0.85
        res_bertsum: filtered string by BertSum
    Raises:
        requests.exceptions.RequestException: when the request fails or
            exceeds ``timeout``
    """
    resp = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Drop the first and last <p> before any further processing.
    paragraph = [p.text for p in soup.find_all('p')][1:-1]
    res_origin = "".join(paragraph)

    # Keep only strongly polarised paragraphs (|Polarity| >= 0.85).
    hiv4_function = ps.hiv4.HIV4()
    res = []
    for p in paragraph:
        tokens = hiv4_function.tokenize(p)
        polarity = hiv4_function.get_score(tokens)['Polarity']
        if abs(float(polarity)) >= 0.85:
            res.append(p)
    res_ps = "".join(res)

    # NOTE(review): a new Summarizer is built on every call; if that proves
    # expensive, hoist it out of the function.
    bert_summarizer = Summarizer()
    result = bert_summarizer(res_origin, ratio=ratio)
    res_bertsum = ''.join(result)
    return (res_origin, res_ps, res_bertsum)


class NewsPreprocessor:
    """
    Data preprocessing class.

    Bundles the text-cleaning steps applied to news strings: optional
    lower-casing, optional English stop-word removal, stripping of unicode
    residue, contraction expansion and removal of ASCII digits.
    """
    def __init__(self, contractions_dict, lower=True, rm_stopwords=False):
        """
        :param contractions_dict: dict mapping a contraction to its expansion
        :param lower: bool, lower-case the text first
        :param rm_stopwords: bool, remove NLTK English stop words
        """
        self.contractions_dict = contractions_dict
        self.lower = lower
        self.rm_stopwords = rm_stopwords

    def remove_unicode(self, text):
        """
        Removes unicode strings like "\\u002c" and "x96"

        First deletes literal backslash-u escape sequences, then every
        character outside the ASCII range.
        """
        text = re.sub(r'(\\u[0-9A-Fa-f]+)', r'', text)
        text = re.sub(r'[^\x00-\x7f]', r'', text)
        return text

    # Function for expanding contractions
    def expand_contractions(self, text, contractions_dict=None):
        """
        Finding contractions. (e.g. you've -> you have)

        :param text: str
        :param contractions_dict: dict used for both matching and replacing;
            defaults to the dictionary given to the constructor
        :return: str with every known contraction replaced by its expansion
        """
        if contractions_dict is None:
            contractions_dict = self.contractions_dict
        # An empty alternation would match the empty string everywhere.
        if not contractions_dict:
            return text

        # Longest keys first so that e.g. "he'd've" wins over "he'd"; keys are
        # escaped because they are literal text, not regular expressions.
        keys = sorted(contractions_dict, key=len, reverse=True)
        contractions_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in keys))

        def replace(match):
            return contractions_dict[match.group(0)]

        return contractions_re.sub(replace, text)

    def remove_stopwords(self, text):
        """
        Delete every NLTK English stop word (whole-word match) together with
        the whitespace that follows it.
        """
        pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        text = pattern.sub('', text)
        return text

    def remove_digits(self, text):
        """
        Remove the ASCII digits 0-9 from the text.
        """
        # Compare characters against characters; a set of ints never matches.
        digits = set("0123456789")
        text = ''.join(ch for ch in text if ch not in digits)
        return text

    def ultimate_clean(self, text):
        """
        Run the full cleaning pipeline in a fixed order: lower-case (optional),
        stop-word removal (optional), unicode stripping, contraction expansion,
        digit removal.
        """
        if self.lower:
            text = text.lower()
        if self.rm_stopwords:
            text = self.remove_stopwords(text)
        text = self.remove_unicode(text)
        text = self.expand_contractions(text, self.contractions_dict)
        text = self.remove_digits(text)
        return text

In [12]:
# Load the serialised news frame and preview its first rows.
# NOTE(review): absolute, machine-specific path - consider a configurable
# data directory so the notebook runs on other machines.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load files from a trusted source.
file_path = r"C:\Users\YangWang\Downloads\file_name.bin"
df = joblib.load(file_path)
df.head()

Unnamed: 0,title,date,query,url,ticker,content,ps_content
0,BRIEF-Apple Inc Says Not Allowing Entertainmen...,2020-03-15,Apple Inc.,https://www.reuters.com/article/idUSFWN2B61K2,AAPL,March 14 (Reuters) - Apple Inc: * APPLE INC SA...,
1,Apple signs multi-year deals with major music ...,2020-03-12,Apple Inc.,https://www.reuters.com/article/idUSKBN20Z33J,AAPL,(Reuters) - Apple Inc has sealed multi-year li...,(Reuters) - Apple Inc has sealed multi-year li...
2,Apple signs multi-year deals with major music ...,2020-03-12,Apple Inc.,https://www.reuters.com/article/idUSL4N2B54T2,AAPL,(Reuters) - Apple Inc has sealed multi-year li...,
3,Chinese regulators remove 'Plague Inc' game fr...,2020-02-28,Apple Inc.,https://www.reuters.com/article/idUSKCN20M043,AAPL,BEIJING/SHANGHAI (Reuters) - The video game “P...,The regulator did not respond to Reuters phone...
4,UPDATE 1-Chinese regulators remove 'Plague Inc...,2020-02-28,Apple Inc.,https://www.reuters.com/article/idUSL3N2AS0OO,AAPL,BEIJING/SHANGHAI (Reuters) - The video game “P...,


In [14]:
# Drop every row that has a missing value in any column. This mutates df in
# place, so running the cell is not reversible without reloading the file.
df.dropna(inplace=True)

In [15]:
# Display the frame after the rows with missing values were dropped.
df

Unnamed: 0,title,date,query,url,ticker,content,ps_content
0,BRIEF-Apple Inc Says Not Allowing Entertainmen...,2020-03-15,Apple Inc.,https://www.reuters.com/article/idUSFWN2B61K2,AAPL,March 14 (Reuters) - Apple Inc: * APPLE INC SA...,
1,Apple signs multi-year deals with major music ...,2020-03-12,Apple Inc.,https://www.reuters.com/article/idUSKBN20Z33J,AAPL,(Reuters) - Apple Inc has sealed multi-year li...,(Reuters) - Apple Inc has sealed multi-year li...
3,Chinese regulators remove 'Plague Inc' game fr...,2020-02-28,Apple Inc.,https://www.reuters.com/article/idUSKCN20M043,AAPL,BEIJING/SHANGHAI (Reuters) - The video game “P...,The regulator did not respond to Reuters phone...
5,Apple launches new MacBook Pro,2020-05-04,Apple Inc.,https://www.reuters.com/article/idUSL4N2CM29Z,AAPL,May 4 (Reuters) - Apple Inc on Monday launched...,Apple said here its new lineup of MacBook Pro ...
6,Broadcom to supply wireless components to Apple,2020-01-23,Apple Inc.,https://www.reuters.com/article/idUSKBN1ZM32H,AAPL,(Reuters) - Chipmaker Broadcom Inc (AVGO.O) sa...,(Reuters) - Chipmaker Broadcom Inc (AVGO.O) sa...
...,...,...,...,...,...,...,...
126652,SAP terminates advisory contract with former C...,2019-11-18,ServiceNow,https://www.reuters.com/article/idUSKBN1XS1Y0,NOW,BERLIN (Reuters) - SAP and its former Chief Ex...,Reporting by Tassilo HummelAll quotes delayed ...
126653,BRIEF-Lincoln Financial Announces Steps to Sup...,2020-04-21,ServiceNow,https://www.reuters.com/article/idUSFWN2C90NS,NOW,April 21 (Reuters) - Lincoln National Corp: * ...,
126654,SAP to streamline hardware infrastructure in d...,2019-11-12,ServiceNow,https://www.reuters.com/article/idUSKBN1XM2IA,NOW,FRANKFURT (Reuters) - German business software...,"Luka Mucic, finance chief, said SAP will reduc..."
126659,SAP to return 1.5 bln euros to shareholders in...,2019-11-04,ServiceNow,https://www.reuters.com/article/idUSL3N27K40Z,NOW,FRANKFURT (Reuters) - German business software...,The capital markets day will be the first majo...


In [16]:
# (rows, columns) of the frame after dropping rows with missing values.
df.shape

(93723, 7)

In [40]:
import torch

def transform_df(df, sort_by, k=25):
    """
    Transform dataframe into another dataframe with top k news using zero-shot learner.

    The input must provide the columns ``date``, ``ps_content`` and
    ``sort_by``. Rows are grouped per calendar date (UTC); within each date the
    ``ps_content`` values are ordered by ``sort_by`` descending and the first
    ``k`` of them (never more than 100) are spread over the columns
    "Top 1 News" ... "Top k News". Dates with fewer than ``k`` entries are
    padded with NaN. The input frame is not modified.

    :param df: pandas dataframe
    :param sort_by: str, name of the score column used for ranking
    :param k: int, number of "Top i News" columns to create
    :return: pandas dataframe with one row per date, columns ``date`` and
        "Top 1 News" ... "Top k News", ordered by date ascending
    :raises ValueError: if ``k`` is negative
    """
    if k < 0:
        raise ValueError("k must be non-negative, got {}".format(k))

    # Group news by date and aggregate into a list
    df_temp = df.copy()
    df_temp["date"] = pd.to_datetime(df_temp["date"], utc=True).dt.date
    # Descending sort puts the best-scored rows first inside each date;
    # groupby keeps that within-group order when collecting the lists.
    ranked = df_temp.sort_values(['date', sort_by], ascending=False).groupby('date').head(100)
    grouped = ranked.groupby("date")['ps_content'].agg(list).reset_index(drop=False)
    grouped.columns = ["date", "agg_news"]

    # Create top k news columns
    new_cols = ["Top {} News".format(i + 1) for i in range(k)]

    print("Start transforming dataframe...")
    # Build every row once and create the columns in a single step instead of
    # writing individual cells into pre-allocated float columns.
    rows = []
    for news_list in grouped["agg_news"]:
        top = list(news_list[:k])
        rows.append(top + [np.nan] * (k - len(top)))
    columns = {name: [row[pos] for row in rows] for pos, name in enumerate(new_cols)}
    top_k = pd.DataFrame(columns, index=grouped.index)
    df = pd.concat([grouped[["date"]], top_k], axis=1)
    print("Done!")

    return df

class SentenceBert():
    """
    Zero-shot scoring of a sentence against candidate labels with the
    'deepset/sentence_bert' checkpoint.
    Reference from https://joeddav.github.io/blog/2020/05/29/ZSL.html

    NOTE(review): AutoTokenizer and AutoModel are not imported in the visible
    cells - presumably `from transformers import AutoModel, AutoTokenizer`;
    confirm and add it to the import cell.
    """
    def __init__(self):
        # Everything is placed on the CPU.
        cpu = torch.device("cpu")
        self.device = cpu
        self.tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
        self.model = AutoModel.from_pretrained('deepset/sentence_bert').to(cpu)

    def get_similarity(self, sentence, labels):
        """
        Score every label by cosine similarity to the sentence.

        Parameters:
            sentence: str
            labels: list of str
        Returns:
            dict-like mapping label -> similarity (float), inserted in order
            of decreasing similarity
        """
        # Encode the sentence together with all labels in one padded batch;
        # row 0 is the sentence, the remaining rows are the labels.
        encoded = self.tokenizer.batch_encode_plus(
            [sentence] + labels,
            return_tensors='pt',
            pad_to_max_length=True,
            max_length=512)
        ids = encoded['input_ids'].to(self.device)
        mask = encoded['attention_mask'].to(self.device)

        with torch.no_grad():
            hidden = self.model(ids, attention_mask=mask)[0]

        # Mean-pool over dim 1 to get one vector per input.
        # NOTE(review): the mean runs over every position; the attention mask
        # is not applied to the pooling itself.
        sentence_rep = hidden[:1].mean(dim=1)
        label_reps = hidden[1:].mean(dim=1)

        # Rank the labels from most to least similar.
        scores = F.cosine_similarity(sentence_rep, label_reps)
        ranking = scores.argsort(descending=True)

        sim_dict = defaultdict()
        for position in ranking.tolist():
            sim_dict[labels[position]] = scores[position].item()

        return sim_dict

def extend_cos_sim(df, col, labels, sort_by):
    """
    Add one similarity column per label and sort the frame by one of them.

    Every value of ``df[col]`` is scored against ``labels`` with SentenceBert.
    The scores are attached positionally, so repeated index labels are handled
    correctly, and the caller's frame is left untouched (a new frame is
    returned).

    :param df: pandas dataframe
    :param col: str column name
    :param labels: list of string
    :param sort_by: str sort by which column
    :return: df: pandas dataframe sorted by ``sort_by`` descending with a
        fresh 0..n-1 index
    """
    SB = SentenceBert()
    print("Start zero-shot learner...")
    scores = {label: [] for label in labels}
    # total= lets the progress bar show real progress for the row count.
    for text in tqdm(df[col], total=len(df)):
        sim_dict = SB.get_similarity(text, labels)
        for label in scores:
            scores[label].append(sim_dict[label])
    # Attach all label columns at once on a copy of the input.
    df = df.assign(**scores)
    df = df.sort_values(by=sort_by, axis=0, ascending=False)
    df = df.reset_index(drop=True)
    print("Done!")
    return df

In [38]:
# Score every ps_content entry against the single label "finance" and sort by
# that score. Rebinding `df` makes this cell non-idempotent on re-run.
df = extend_cos_sim(df, col="ps_content", labels=["finance"], sort_by="finance")

Start zero-shot learner...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Done!


In [41]:
# Collapse to one row per date holding the 25 best-ranked news texts.
# NOTE(review): after this cell `df` only contains `date` and the
# "Top N News" columns; later cells that need other columns (e.g. `ticker`)
# cannot use this `df`.
df = transform_df(df, sort_by="finance", k=25)

Start transforming dataframe...
Done!


In [None]:
def load_stock(ticker_name, start_date="2012-01-01"):
    """
    Fetch the price history of one ticker via yfinance and label each day.

    :param ticker_name: str, symbol handed to ``yf.Ticker``
    :param start_date: str, forwarded to ``history`` as ``start``
    :return: pandas dataframe sorted by ``date`` (calendar dates, UTC) with the
        extra columns ``ticker`` and ``label``; ``label`` is 1 when ``Close``
        is greater than or equal to the previous row's ``Close`` and 0
        otherwise. Rows containing any missing value (including the first
        row, which has no previous close) are dropped.
    """
    history = yf.Ticker(ticker_name).history(period="max", start=start_date)

    # Turn the index into a regular "date" column of plain date objects.
    history = history.rename_axis("date").reset_index()
    history["date"] = pd.to_datetime(history["date"], utc=True).dt.date
    history = history.sort_values(by="date").reset_index(drop=True)

    history["ticker"] = ticker_name
    # Day-over-day change of the close; NaN for the first row.
    history["label"] = history["Close"].diff(periods=1)

    # Drop incomplete rows first, then binarise the change.
    return history.dropna().assign(
        label=lambda frame: frame["label"].map(lambda delta: 1 if float(delta) >= 0 else 0)
    )

def load_news(df, labels, sort_by, k):
    """
    Clean the news titles, score them with the zero-shot learner and reshape
    the result into one row per date with the top k news.

    The caller's frame is not modified: duplicates are dropped and the
    ``clean_title`` column is added on new frames, which also avoids
    chained-assignment problems when ``df`` is a slice of a larger frame.

    :param df: pandas dataframe with at least a ``title`` column, plus
        whatever columns ``transform_df`` reads
    :param labels: list of str (for zero-shot learner)
    :param sort_by: str (str in labels)
    :param k: int (top k news)
    :return: pandas dataframe
    """
    preprocessor = NewsPreprocessor(contractions_dict=contractions_dict)
    # Keep the first row of every distinct title.
    df = df.drop_duplicates(subset="title")
    df = df.assign(clean_title=df["title"].apply(preprocessor.ultimate_clean))
    df = extend_df_with_cos_sim(df=df, col="clean_title", labels=labels, sort_by=sort_by)
    df = transform_df(df=df, sort_by=sort_by, k=k)
    df = df.reset_index(drop=True)
    return df

In [42]:
# NOTE(review): this import rebinds `load_stock`, shadowing the function of the
# same name defined in an earlier cell - confirm which implementation is meant.
from preprocessor import load_stock

TRAIN_START_DATE = "2012-01-01"
TRAIN_END_DATE = "2015-12-31"
VALID_START_DATE = "2016-01-01"
VALID_END_DATE = "2016-12-31"
TEST_START_DATE = "2017-01-01"
TEST_END_DATE = "2020-07-01"

# Parse the split boundaries once; the merged frame is indexed by plain dates.
train_span = (pd.to_datetime(TRAIN_START_DATE).date(), pd.to_datetime(TRAIN_END_DATE).date())
valid_span = (pd.to_datetime(VALID_START_DATE).date(), pd.to_datetime(VALID_END_DATE).date())
test_span = (pd.to_datetime(TEST_START_DATE).date(), pd.to_datetime(TEST_END_DATE).date())

# Collect the per-ticker pieces and concatenate once at the end instead of
# growing three frames inside the loop.
train_parts = []
valid_parts = []
test_parts = []

# NOTE(review): this cell needs a `df` that still has the `ticker` column; the
# `df` produced by the earlier transform_df cell only has `date` and
# "Top N News" columns - confirm which frame is intended here.
df_merge = df.copy()

for ticker in tqdm(df_merge["ticker"].unique()):
    # Copy the slice so downstream processing never touches df_merge.
    news = df_merge[df_merge["ticker"] == str(ticker)].copy()
    news = load_news(news, labels=["finance"], sort_by="finance", k=10)
    stock = load_stock(str(ticker), start_date="2012-01-01")
    # Inner join: keep only dates that have both news and a price row.
    news_and_stock = pd.merge(news, stock, on=["date"]).set_index('date')

    train_parts.append(news_and_stock.loc[train_span[0]:train_span[1]])
    valid_parts.append(news_and_stock.loc[valid_span[0]:valid_span[1]])
    test_parts.append(news_and_stock.loc[test_span[0]:test_span[1]])

# pd.concat rejects an empty list, so fall back to an empty frame.
train = pd.concat(train_parts, axis=0) if train_parts else pd.DataFrame()
valid = pd.concat(valid_parts, axis=0) if valid_parts else pd.DataFrame()
test = pd.concat(test_parts, axis=0) if test_parts else pd.DataFrame()

Unnamed: 0,date,Top 1 News,Top 2 News,Top 3 News,Top 4 News,Top 5 News,Top 6 News,Top 7 News,Top 8 News,Top 9 News,...,Top 16 News,Top 17 News,Top 18 News,Top 19 News,Top 20 News,Top 21 News,Top 22 News,Top 23 News,Top 24 News,Top 25 News
0,2011-08-23,Romney says he needs a bigger place to have ro...,,,,,,,,,...,,,,,,,,,,
1,2011-08-29,Bank of America Corp is selling about half its...,,,,,,,,,...,,,,,,,,,,
2,2011-09-02,"Reuters provides trusted business, financial, ...",,,,,,,,,...,,,,,,,,,,
3,2011-09-06,"Reuters provides trusted business, financial, ...",,,,,,,,,...,,,,,,,,,,
4,2011-09-08,"Reuters provides trusted business, financial, ...",,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3085,2020-07-16,Gorman said the decision to keep the bank’s co...,Gorman said the decision to keep the bank’s co...,,,Shares in the bank fell about 4% in response t...,Shares in the bank fell about 4% in response t...,July 16 (Reuters) - Morgan Stanley posted a 45...,The S&P 500 is about 5.8% away from its record...,Net income applicable to common shareholders f...,...,(Reuters) - Johnson & Johnson said on Thursday...,(Reporting by Sarah N. Lynch and David Brunnst...,"Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ..."
3086,2020-07-17,"But with some of those benefits going away, an...","But with some of those benefits going away, an...",Netflix said on Thursday it added 10.09 millio...,(Reuters) - Retail investing is having a momen...,Small-cap stocks are often considered a barome...,As the second-quarter earnings season gets und...,The New York-based company's net income rose t...,"ET (13:37 GMT), the Toronto Stock Exchange’s S...","The S&P 500 utilities, real estate and healthc...",...,"Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ..."
3087,2020-07-18,BlackRock ended the quarter with $7.32 trillio...,,Small-cap stocks are often considered a barome...,(Reuters) - The S&P 500 ended higher on Friday...,"Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...",...,,,,,,,,,,
3088,2020-07-19,Small-cap stocks are often considered a barome...,"Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...","Reuters provides trusted business, financial, ...",,,,,,...,,,,,,,,,,
