In [1]:
import pandas as pd
import sqlite3
import re
import matplotlib.pyplot as plt

import stanza

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Load every row of `article` whose text does not mention "Automated Insights".
query = """SELECT * FROM article
            WHERE text NOT LIKE '%Automated Insights%';
        """

con = sqlite3.connect('data.db')
try:
    cur = con.cursor()
    # Materialise all rows now; the cursor is unusable once the connection closes.
    articles = cur.execute(query).fetchall()
finally:
    # Close even when the query raises so the database handle is never leaked.
    con.close()

In [20]:
# Sanity check: number of rows returned by the query.
len(articles)

210378

In [50]:
# Column names are assigned positionally to the tuples returned by `SELECT *`,
# so they assume the table's column order is (id, title, body, href, date).
df = pd.DataFrame(articles, columns = ['id', 'title', 'body', 'href', 'date']).set_index('id')

# Body length in whitespace-separated words.
df['size'] = df['body'].apply(lambda x: len(x.split()))

# `infer_datetime_format` is dropped: it is deprecated in pandas 2.x and the
# parsed values do not depend on it.
df['date'] = pd.to_datetime(df['date'])

# Keep bodies strictly between 50 and 750 words.
mask = (df['size'] > 50) & (df['size'] < 750)
df = df[mask]

# Keep 2018-01-01 <= date < 2021-01-01, Monday-Friday only (weekday 0-4).
df = df[(df['date'] >= '2018-01-01') & (df['date'] < '2021-01-01') & (df['date'].dt.weekday < 5)]

# df now represents the non-automated articles dated 2018-01-01 through 2020-12-31,
# published on weekdays, with more than 50 and fewer than 750 words

In [4]:
# Stanza English pipeline with the tokenize and sentiment processors.
# NOTE: later cells rebind the name `nlp` (transformers pipeline, then a spaCy
# sentencizer); `extract_sentiment` reads `nlp` at call time, so it must be
# run while this binding is the current one.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

2022-07-21 17:03:17 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2022-07-21 17:03:17 INFO: Use device: cpu
2022-07-21 17:03:17 INFO: Loading: tokenize
2022-07-21 17:03:17 INFO: Loading: sentiment
2022-07-21 17:03:17 INFO: Done loading processors!


In [3]:
# this extracts sentiment on a sentence level and returns the average over all of them

def extract_sentiment(text, stanza_pipeline=None):
    """Return the mean sentence-level sentiment of `text`, shifted down by 1.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    stanza_pipeline : callable, optional
        Callable returning an object with a `.sentences` list whose items
        expose a numeric `.sentiment`. Defaults to the global `nlp`, looked
        up at call time; pass the pipeline explicitly if `nlp` may have been
        rebound to something else.

    Returns
    -------
    float
        Average of `sentence.sentiment` over all sentences, minus 1.
        Presumably Stanza scores sentences 0/1/2, so the result lies in
        [-1, 1] -- confirm against the Stanza docs. NaN is returned when the
        pipeline finds no sentences (e.g. empty text), where the original
        raised ZeroDivisionError.
    """
    if stanza_pipeline is None:
        stanza_pipeline = nlp
    sentences = stanza_pipeline(text).sentences
    if not sentences:
        # Nothing to average: report "missing" rather than dividing by zero.
        return float('nan')
    # One formula covers the single-sentence case too, and always yields a float.
    return sum(sentence.sentiment for sentence in sentences) / len(sentences) - 1

In [None]:
# Score every title with `extract_sentiment`; `progress_apply` (enabled by
# `tqdm.pandas()`) behaves like `apply` but renders a progress bar.
df['stanza_title'] = df['title'].progress_apply(extract_sentiment)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=37332.0), HTML(value='')))

In [2]:
#df = pd.read_json('temp_stanza.json')
#df.head()

Unnamed: 0,title,body,href,date,size,stanza_title
118,Barron’s Next 50: The Latest Changes to the Index,"We review the Barron’s Next 50 periodically, m...",https://www.marketwatch.com/articles/barrons-n...,2018-07-02,128,0.0
158,The 3 Best Tech Stocks So Far This Year,"August is over, Labor Day is here, and we’re c...",https://www.marketwatch.com/articles/best-tech...,2019-09-02,248,1.0
436,Google Stadia Doesn’t Look Like the Netflix of...,Gamers and investors got the clearest glimpse ...,https://www.marketwatch.com/articles/google-st...,2019-06-06,580,-1.0
437,Is Amazon Investing Enough in Autonomous Driving?,It’s time for (ticker: AMZN) to become a car c...,https://www.marketwatch.com/articles/is-amazon...,2018-09-25,525,0.0
866,Why Netflix Won’t Be Disrupted by New Competit...,As (ticker: AAPL) gets set to announce new str...,https://www.marketwatch.com/articles/netflix-w...,2019-03-25,496,-1.0


In [5]:
# Score every article body with `extract_sentiment` (same scorer as the titles).
df['stanza_body'] = df['body'].progress_apply(extract_sentiment)

  0%|          | 0/37332 [00:00<?, ?it/s]

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [6]:
# Checkpoint the frame to disk; later cells overwrite this same file as more
# score columns are added.
df.to_json('temp_stanza.json')

In [8]:
# Correlation matrix between the title-level and body-level Stanza scores.
df[['stanza_title', 'stanza_body']].corr()

Unnamed: 0,stanza_title,stanza_body
stanza_title,1.0,0.27737
stanza_body,0.27737,1.0


In [12]:
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import pipeline

# Model id shared by the classifier weights and the matching tokenizer.
FINBERT_MODEL = 'yiyanghkust/finbert-tone'

tokenizer = BertTokenizer.from_pretrained(FINBERT_MODEL)
finbert = BertForSequenceClassification.from_pretrained(FINBERT_MODEL, num_labels=3)

# Rebinds `nlp`: from here on it is the transformers sentiment pipeline that
# `bert_sentiment` calls.
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

In [20]:
# Smoke test: show the label string the pipeline assigns to a trivial input.
nlp('hello')[0]['label']

'Positive'

In [63]:
def bert_sentiment(text, max_words=250, classifier=None):
    """Return a signed confidence score for `text` from a label/score classifier.

    Parameters
    ----------
    text : str
        Text to classify. If it has more than `max_words` whitespace-separated
        words, only the first `max_words` (re-joined with single spaces) are
        classified; shorter text is passed through unchanged.
    max_words : int, optional
        Word cap applied before classification (default 250, as before).
    classifier : callable, optional
        Callable returning a list whose first item is a mapping with 'label'
        and 'score' keys. Defaults to the global `nlp`, looked up at call
        time; pass the classifier explicitly if `nlp` may have been rebound.

    Returns
    -------
    float
        `+score` for a 'Positive' label, `-score` for 'Negative', and 0.0 for
        any other label.
    """
    if classifier is None:
        classifier = nlp
    words = text.split()  # split once; the original tokenised the text twice
    if len(words) > max_words:
        text = ' '.join(words[:max_words])
    top = classifier(text)[0]
    if top['label'] == 'Positive':
        return top['score']
    if top['label'] == 'Negative':
        return -top['score']
    # Any other label counts as neutral; a float keeps the return type uniform.
    return 0.0

In [27]:
# Score every title with `bert_sentiment` (signed classifier confidence).
df['bert_title'] = df['title'].progress_apply(bert_sentiment)

  0%|          | 0/37332 [00:00<?, ?it/s]

In [66]:
import torch

def bert_body(text):
    """Return a scalar sentiment score for `text` from the `finbert` model.

    The text is encoded with `tokenizer` (truncated to 512 tokens), passed
    through `finbert`, and the softmax over the first element of the model
    output is combined with the weights [0, 1, -1].

    NOTE(review): the weights presumably correspond to a
    (neutral, positive, negative) class order -- confirm against the model's
    `id2label` mapping.

    Returns
    -------
    float
        Dot product of the class probabilities with [0, 1, -1].
    """
    token = tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True)
    # Inference only: without no_grad every call records an autograd graph,
    # costing memory and time over tens of thousands of articles.
    with torch.no_grad():
        result = finbert(token)
        probabilities = torch.nn.functional.softmax(result[0], dim=-1)
    class_weights = torch.tensor([0, 1, -1]).float()
    return (probabilities @ class_weights).item()

In [77]:
# Quick check of `bert_body` on a short string.
bert_body('just an experience')

0.04029202461242676

In [78]:
# Score every article body with `bert_body` (probability-weighted score).
df['bert_body'] = df['body'].progress_apply(bert_body)

  0%|          | 0/37332 [00:00<?, ?it/s]

In [79]:
# Overwrite the checkpoint file with the frame as it now stands.
df.to_json('temp_stanza.json')

In [80]:
# Preview the first rows, including the score columns added so far.
df.head()

Unnamed: 0,title,body,href,date,size,stanza_title,stanza_body,bert_title,bert_body
118,Barron’s Next 50: The Latest Changes to the Index,"We review the Barron’s Next 50 periodically, m...",https://www.marketwatch.com/articles/barrons-n...,2018-07-02,128,0.0,0.117647,0.0,1.7e-05
158,The 3 Best Tech Stocks So Far This Year,"August is over, Labor Day is here, and we’re c...",https://www.marketwatch.com/articles/best-tech...,2019-09-02,248,1.0,-0.066667,0.999997,0.999988
436,Google Stadia Doesn’t Look Like the Netflix of...,Gamers and investors got the clearest glimpse ...,https://www.marketwatch.com/articles/google-st...,2019-06-06,580,-1.0,-0.222222,0.0,-1.5e-05
437,Is Amazon Investing Enough in Autonomous Driving?,It’s time for (ticker: AMZN) to become a car c...,https://www.marketwatch.com/articles/is-amazon...,2018-09-25,525,0.0,-0.071429,0.0,0.006267
866,Why Netflix Won’t Be Disrupted by New Competit...,As (ticker: AAPL) gets set to announce new str...,https://www.marketwatch.com/articles/netflix-w...,2019-03-25,496,-1.0,0.0,-0.979412,-0.000136


In [115]:
from spacy.lang.en import English 

# spaCy English pipeline containing only the 'sentencizer' component.
# It gets its own name because this notebook rebinds `nlp` in several cells;
# a function that reads `nlp` at call time breaks if another cell is re-run.
sentence_splitter = English()
sentence_splitter.add_pipe('sentencizer')
# Keep the existing `nlp` binding for any code that still refers to it.
nlp = sentence_splitter

def split_in_sentences(text):
    """Split `text` into sentences.

    Returns a list with one whitespace-stripped string per sentence found by
    `sentence_splitter`; the list is empty when no sentence is found.
    """
    doc = sentence_splitter(text)
    return [str(sent).strip() for sent in doc.sents]

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single analyzer instance, reused by `extract_vader` for every sentence.
senti = SentimentIntensityAnalyzer()

def extract_vader(doc):
    """Return the mean VADER 'compound' score over the sentences of `doc`.

    `doc` is split with `split_in_sentences`, each sentence is scored with
    `senti.polarity_scores`, and the 'compound' values are averaged.

    Returns
    -------
    float
        The average compound score, or NaN when `doc` yields no sentences
        (e.g. an empty string), where the original raised ZeroDivisionError.
    """
    sentences = split_in_sentences(doc)
    if not sentences:
        # Nothing to average: report "missing" rather than dividing by zero.
        return float('nan')
    total = sum(senti.polarity_scores(sent)['compound'] for sent in sentences)
    return total / len(sentences)

In [120]:
# Quick check of `extract_vader` on a clearly negative example string.
extract_vader('hello this is just 1 sentence and its an aweful one it really sucks its the worse')

-0.6801

In [122]:
# Score every title with `extract_vader` (mean compound score per sentence).
df['vader_title'] = df['title'].progress_apply(extract_vader)

  0%|          | 0/37332 [00:00<?, ?it/s]

In [123]:
# Score every article body with `extract_vader`.
df['vader_body'] = df['body'].progress_apply(extract_vader)

  0%|          | 0/37332 [00:00<?, ?it/s]

In [124]:
# Overwrite the checkpoint file with the frame as it now stands.
df.to_json('temp_stanza.json')

In [131]:
from textblob import TextBlob

# Sentence-averaged TextBlob polarity.
def extract_textblob(doc):
    """Return the mean TextBlob polarity over the sentences of `doc`.

    `doc` is split with `split_in_sentences` and the
    `TextBlob(sentence).sentiment.polarity` values are averaged.

    Returns
    -------
    float
        The average polarity, or NaN when `doc` yields no sentences
        (e.g. an empty string), where the original raised ZeroDivisionError.
    """
    sentences = split_in_sentences(doc)
    if not sentences:
        # Nothing to average: report "missing" rather than dividing by zero.
        return float('nan')
    total = sum(TextBlob(sent).sentiment.polarity for sent in sentences)
    return total / len(sentences)

In [133]:
# Score every title with `extract_textblob` (mean polarity per sentence).
df['textblob_title'] = df['title'].progress_apply(extract_textblob)

  0%|          | 0/37332 [00:00<?, ?it/s]

In [134]:
# Score every article body with `extract_textblob`.
df['textblob_body'] = df['body'].progress_apply(extract_textblob)

  0%|          | 0/37332 [00:00<?, ?it/s]

In [135]:
# Overwrite the checkpoint file with the frame as it now stands.
df.to_json('temp_stanza.json')