<a href="https://colab.research.google.com/github/kenwkliu/ideas/blob/master/colab/NewsSentimentSmallLanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests, datetime
import pandas as pd
import numpy as np
import yfinance as yf
from bs4 import BeautifulSoup as bs
from textblob import TextBlob
from transformers import pipeline
from bs4 import BeautifulSoup as bs

import matplotlib.pyplot as plt
%matplotlib inline

# Enable Google interactive table
from google.colab import data_table
data_table.enable_dataframe_formatter()

# HTTP headers passed to requests.get() when downloading the finviz quote pages.
# The User-Agent value imitates a desktop Firefox browser.
# NOTE(review): presumably needed because finviz rejects the default
# python-requests User-Agent - confirm.
agent_info = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}

def convertSentimentLabel(label):
  """Map a dataset sentiment label onto the -1/0/1 polarity scale.

  from: { 0:Negative, 2:Neutral, 1:Positive}
    to: {-1:Negative, 0:Neutral, 1:Positive}

  Any value other than 0 or 2 (including 1) is returned unchanged.
  """
  if label == 0:
    return -1
  if label == 2:
    return 0
  return label


def combineWord(words):
  """Combine multiple words into one single underscore-separated word.

  e.g. ["very", "good"] -> "very_good"

  Args:
    words: iterable of strings to combine.

  Returns:
    The words joined with "_" (no trailing underscore); an empty string
    when `words` is empty.

  Raises:
    TypeError: if any element is not a string.
  """
  # str.join builds the result in one pass and needs no trailing-separator
  # clean-up, unlike repeated "+=" followed by slicing.
  return "_".join(words)


def convertTextBlobSentimentPolarity(polarity, positiveThreshod=0.1, negativeThreshod=-0.1):
  """Turn a numeric polarity score into a -1/0/1 sentiment label.

  Args:
    polarity: numeric polarity score to classify.
    positiveThreshod: scores strictly above this value are positive.
    negativeThreshod: scores strictly below this value are negative.

  Returns:
    -1 when polarity < negativeThreshod, 1 when polarity > positiveThreshod,
    otherwise 0. The negative test is evaluated first.
  """
  if polarity < negativeThreshod:
    return -1
  return 1 if polarity > positiveThreshod else 0


def getTextBlobSentiments(content):
  """Get the sentiment label, polarity and assessments of a text from TextBlob.

  Args:
    content: the text to analyse.

  Returns:
    A 3-tuple of
      - the polarity converted by convertTextBlobSentimentPolarity
        (default thresholds),
      - the TextBlob polarity of the whole text,
      - a list of (combined_words, value) pairs, one per TextBlob assessment,
        where the assessed words are merged by combineWord and the value is
        the second field of the assessment (presumably its polarity -
        confirm against the TextBlob documentation).
  """
  blob = TextBlob(content)
  scoredWords = [
    (combineWord(entry[0]), entry[1])
    for entry in blob.sentiment_assessments.assessments
  ]
  return convertTextBlobSentimentPolarity(blob.polarity), blob.polarity, scoredWords


def getLmScore(lm):
    """Convert a language-model classification result into a -1/0/1 score.

    Args:
        lm: sequence of prediction dicts; only the first element is used and
            it must have a 'label' key.

    Returns:
        0 for "neutral", 1 for "positive", -1 for "negative", and None for
        any other label.
    """
    scoreByLabel = {"neutral": 0, "positive": 1, "negative": -1}
    return scoreByLabel.get(lm[0]['label'])

In [None]:
# CSV file name of each split of the zeroshot/twitter-financial-news-sentiment dataset.
# Only the 'train' entry is used in this cell.
splits = {'train': 'sent_train.csv', 'validation': 'sent_valid.csv'}
# Read the training split straight from the hf:// dataset path.
df = pd.read_csv("hf://datasets/zeroshot/twitter-financial-news-sentiment/" + splits["train"])

# Convert the sentiment label to {-1:Negative, 0:Neutral, 1:Positive}
df['polarity'] = df['label'].apply(convertSentimentLabel)
# Last expression of the cell: show each text next to its converted label.
df[['text', 'polarity']]

In [None]:
# Score every text with TextBlob. getTextBlobSentiments returns one
# (label, polarity, assessments) tuple per row; zip(*...) transposes those
# tuples into three sequences, one per new column.
# Note the column naming: 'textblob_polarity' receives the -1/0/1 label and
# 'textblob_sentiment' receives the raw TextBlob polarity value.
df['textblob_polarity'], df['textblob_sentiment'], df['textblob_assessments'] = zip(*df['text'].apply(getTextBlobSentiments))
# Last expression of the cell: show the dataset label next to the TextBlob results.
df[['text', 'polarity', 'textblob_polarity', 'textblob_sentiment', 'textblob_assessments']]

In [None]:
# Build the text-classification pipeline used as the language-model scorer.
# The model id names a DistilRoBERTa checkpoint fine-tuned for financial news
# sentiment analysis; its output is converted to -1/0/1 by getLmScore.
lm_sentiment = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")

In [None]:
# Quick sanity check of the model on three hand-written sentences.
# getLmScore prints 1 for a positive, 0 for a neutral and -1 for a negative label.
print(getLmScore(lm_sentiment("The profit is great")))
print(getLmScore(lm_sentiment("The profit is the same")))
print(getLmScore(lm_sentiment("The profit is bad")))

In [None]:
# Score every text with the language model: one lm_sentiment call per row,
# converted to -1/0/1 by getLmScore.
# NOTE(review): the previous comment said "Use a saved results", but this cell
# recomputes the predictions; nothing is loaded from a saved file here.
df['lm_polarity'] = df['text'].apply(lambda x: getLmScore(lm_sentiment(x)))
# Last expression of the cell: show the dataset label next to both predictions.
df[['text', 'polarity', 'lm_polarity', 'textblob_polarity', 'textblob_sentiment', 'textblob_assessments']]

In [None]:
# Per-row correctness flags: 1 when the prediction equals the dataset label, else 0.
# 'lm' flags the language model, 'textblob' flags the TextBlob label.
df['lm'] = (df['polarity'] == df['lm_polarity']).astype(int)
df['textblob'] = (df['polarity'] == df['textblob_polarity']).astype(int)
# Last expression of the cell: show labels, predictions and flags side by side.
df[['text', 'polarity', 'lm_polarity', 'lm', 'textblob', 'textblob_polarity', 'textblob_sentiment', 'textblob_assessments']]

In [None]:
# Accuracy of each method = number of correct rows / number of rows.
# Both lines divide by len(df['lm']); every column of df has the same length,
# so this is simply the row count.
print(df['lm'].sum() / len(df['lm']))
print(df['textblob'].sum() / len(df['lm']))

In [None]:
# Base URL of a finviz quote page; the ticker symbol is appended to it.
finviz_url = "https://finviz.com/quote.ashx?t="

# Tickers whose news headlines are downloaded.
tickers = ["LCID","XPEV"]

# Maps ticker -> parsed element with id "news-table" from its quote page.
# Tickers whose page could not be downloaded or has no such element are
# left out, so consumers never receive a None table.
news_tables = {}

for t in tickers:
    print(t)
    url = finviz_url + t
    print(url)

    try:
        # The timeout keeps the cell from hanging forever on a stalled
        # connection; raise_for_status turns HTTP error responses into
        # exceptions instead of parsing an error page.
        t_content = requests.get(url, headers=agent_info, timeout=30)
        t_content.raise_for_status()
    except requests.RequestException as err:
        print("Cannot download the page:", err)
    else:
        # Name the built-in parser explicitly so the result does not depend
        # on which optional parser libraries happen to be installed.
        content_bs = bs(t_content.content, "html.parser")
        news_tab = content_bs.find(id="news-table")
        if news_tab is None:
            print("No news table found for", t)
        else:
            news_tables[t] = news_tab

    print("-"*10)

In [None]:
# Flatten every ticker's news table into [ticker, date, time, headline] rows.
table_array = []

for name, news_table in news_tables.items():
    print(name)

    # A ticker whose page had no news table may be stored as None; skip it
    # instead of crashing on None.find_all.
    if news_table is None:
        print("Some items cannot be parsed")
        continue

    # Reset per ticker so a time-only row can never inherit the date of the
    # previous ticker's last headline.
    date = None

    for x in news_table.find_all('tr'):
        try:
            #headline
            text_content = x.a.get_text()
            #dates
            date_content = x.td.text.split()

            # A single token is only a time: the row shares the date of the
            # preceding row. Otherwise the cell holds "<date> <time>".
            if len(date_content) == 1:
                time = date_content[0]
            else:
                date = date_content[0]
                time = date_content[1]
        except (AttributeError, IndexError):
            # x.a or x.td is None (row without a link / cell), or the date
            # cell is empty.
            print("Some items cannot be parsed")
            continue

        # A time-only row before any dated row of this ticker has no date.
        if date is None:
            print("Some items cannot be parsed")
            continue

        table_array.append([name, date, time, text_content])

In [None]:
# One row per parsed headline: ticker, date text, time text and headline.
table_news = pd.DataFrame(table_array, columns=["Ticker","Date","Time","Headline"])

# Rows whose date text is the literal "Today" get the current date instead;
# all other rows keep their original date text.
# NOTE(review): datetime.date.today() is the date of the machine running the
# notebook; presumably finviz's "Today" follows its own (US) time zone - confirm.
table_news['Date'] = np.where(table_news['Date']=="Today", datetime.date.today(), table_news['Date'])
# Parse the mixed date values and keep only the calendar date (no time part).
table_news['Date'] = pd.to_datetime(table_news['Date']).dt.date

# Last expression of the cell: preview the first rows.
table_news.head()