In [153]:
!pip install streamlit



In [154]:
!pip install beautifulsoup4



In [155]:
!pip install nltk seaborn wordcloud



In [156]:
import streamlit as st
import pandas as pd #helps w/ data manipulation
import re
import requests # helps send & receive response from web browswer
import plotly
import plotly.express as px
import json # for graph plotting in website
import nltk # NLTK VADER for sentiment analysis
import yfinance as yf
#from fireworks.client import Fireworks
import webbrowser
import base64
import numpy as np
nltk.downloader.download('vader_lexicon')
from datetime import datetime
#from streamlit_option_menu import option_menu
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup as bs #important for data scraping
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from prophet import Prophet
from prophet.plot import plot_plotly
from plotly import graph_objs as go #plotly is an interactive graph
from datetime import datetime, timedelta

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [157]:
df.drop(['neg', 'neu', 'pos', 'compound'], axis=1, inplace=True)
df.to_csv("headlines.csv", index=False)
df = pd.read_csv("headlines.csv")

In [158]:
np.random.seed(0)
dates = pd.date_range((datetime.today() - timedelta(days=5)).strftime("%Y-%m-%d"), periods=100, freq='h')
df = pd.DataFrame({
        'date': dates,
        'ticker': 'APPL',
        'sentiment_score': np.random.rand(100)
})

In [159]:
#Function: DISPLAYS SENTIMENT GRAPHS VISUALLY
fig = px.bar(df, x='date', y='sentiment_score', title='Sentiment over Time')
fig.show()

In [160]:
def plot_hourly_sentiment(df, ticker):
        # Group by date and ticker columns from df and calculate the mean
        mean_scores = df.groupby(['date', 'ticker']).mean()

        # Plot a bar chart with plotly
        fig = px.bar(mean_scores, x=mean_scores.index.get_level_values(0), y='sentiment_score', title=ticker + ' Hourly Sentiment')
        fig.update_xaxes(title_text='Hourly Sentiment')  # Update x-axis label
        fig.update_yaxes(title_text='Sentiment Score')  # Update y-axis label
        return fig

In [161]:
def plot_daily_sentiment(df, ticker):
        # Group by date and ticker columns from df and calculate the mean
        mean_scores = df.groupby(['ticker', pd.Grouper(key='date', freq='D')]).mean().reset_index()

        # Plot a bar chart with plotly
        fig = px.bar(mean_scores, x='date', y='sentiment_score', title=ticker + ' Daily Sentiment')
        fig.update_xaxes(title_text='Date')  # Update x-axis label
        fig.update_yaxes(title_text='Sentiment Score')  # Update y-axis label
        return fig

In [162]:
# Scrape news
def get_news(ticker):
    finviz_url = f"https://finviz.com/quote.ashx?t={ticker}"
    req = Request(url=finviz_url, headers={'User-Agent': 'Mozilla/5.0'})
    response = urlopen(req)
    html = bs(response, 'html.parser')
    news_table = html.find(id='news-table')
    return news_table

In [163]:
# Parse News Table
def parse_news(news_table):
    rows = []
    for row in news_table.find_all('tr'):
        try:
            headline = row.a.get_text()
            td = row.td.text.strip().split()
            if len(td) == 1:
                time = td[0]
            else:
                date = td[0]
                time = td[1]
            rows.append([date, time, headline])
        except:
            continue
    df = pd.DataFrame(rows, columns=['date', 'time', 'headline'])
    df['date'] = df['date'].replace('Today', datetime.today().strftime('%Y-%m-%d'))
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    return df


In [167]:
# Score with VADER
def score_news(parsed_news_df):
    scores = [sia.polarity_scores(text) for text in parsed_news_df['headline']]
    scores_df = pd.DataFrame(scores)
    df_scored = parsed_news_df.join(scores_df)
    df_scored.set_index('datetime', inplace=True)
    df_scored.drop(['date', 'time'], axis=1, inplace=True)
    return df_scored

sia = SentimentIntensityAnalyzer()



In [180]:
# Label Sentiment for Classification
def label_sentiment(score):
    if score >= 0.05:
        return 'positive'
    elif score <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_class'] = df['compound'].apply(label_sentiment)

In [169]:
# Print everything
ticker = 'AAPL'
tableNews = get_news(ticker)
# print(tableNews)
parseNews = parse_news(tableNews)
# print(parseNews)
parseNews.to_csv('temp2.csv', index=False)
df = score_news(parseNews)
df.to_csv('temp.csv', index=False)
st.dataframe(df)

files.download('temp2.csv')  # For parseNews CSV
files.download('temp.csv')

df['sentiment_class'] = df['compound'].apply(label_sentiment) # Now df has a compound column



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [179]:
#  Naive Bayes Model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Prepare features and labels
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['headline'])
y = df['sentiment_class']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Results
print("Naive Bayes Accuracy using TF-IDF >>> ", accuracy_score(y_test, y_pred))


Naive Bayes Accuracy using TF-IDF >>>  0.3


In [172]:
from sklearn.feature_extraction.text import CountVectorizer  # BoW instead of TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Prepare features and labels using Bag of Words
vectorizer = CountVectorizer(stop_words='english', max_features=1000)  # Changed to BoW
X = vectorizer.fit_transform(df['headline'])
y = df['sentiment_class']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Results
print("Naive Bayes Model Accuracy using BoW >>> ", accuracy_score(y_test, y_pred))



Naive Bayes Model Accuracy using BoW >>>  0.25


In [173]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorize headlines using Bag of Words
bow_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_bow = bow_vectorizer.fit_transform(df['headline'])
y = df['sentiment_class']

# Split data
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Train SVM
svm_bow = LinearSVC()
svm_bow.fit(X_train_bow, y_train)

# Predict and evaluate
y_pred_bow = svm_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print(f"SVM with Bag of Words Accuracy >>  {accuracy_bow:.3f}")


SVM with Bag of Words Accuracy >>  0.350


In [174]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorize headlines using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['headline'])

# Split data
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train SVM
svm_tfidf = LinearSVC()
svm_tfidf.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"SVM with TF-IDF Accuracy >>>  {accuracy_tfidf:.3f}")


SVM with TF-IDF Accuracy >>>  0.300


In [175]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorize headlines using Bag of Words
bow_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_bow = bow_vectorizer.fit_transform(df['headline'])
y = df['sentiment_class']

# Split data
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Train Logistic Regression
logreg_bow = LogisticRegression(max_iter=1000)
logreg_bow.fit(X_train_bow, y_train)

# Predict and evaluate
y_pred_bow = logreg_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print(f"Logistic Regression with Bag of Words Accuracy >>> {accuracy_bow:.4f}")



Logistic Regression with Bag of Words Accuracy >>> 0.4500


In [176]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Vectorize headlines using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['headline'])

# Split data
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train Logistic Regression
logreg_tfidf = LogisticRegression(max_iter=1000)
logreg_tfidf.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_tfidf = logreg_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"Logistic Regression with TF-IDF Accuracy >>> {accuracy_tfidf:.4f}")


Logistic Regression with TF-IDF Accuracy >>> 0.2500


In [177]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# BoW vectorization
bow_vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_bow = bow_vectorizer.fit_transform(df['headline'])
y = df['sentiment_class']

# Train/test split
X_train_bow, X_test_bow, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_bow = RandomForestClassifier(n_estimators=100, random_state=42)
rf_bow.fit(X_train_bow, y_train)

# Predict and evaluate
y_pred_bow = rf_bow.predict(X_test_bow)
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print(f"Random Forest with Bag of Words Accuracy >>> {accuracy_bow:.4f}")


Random Forest with Bag of Words Accuracy >>> 0.5500


In [178]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(df['headline'])
y = df['sentiment_class']

# Train/test split
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_tfidf.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"Random Forest with TF-IDF Accuracy >>> {accuracy_tfidf:.4f}")


Random Forest with TF-IDF Accuracy >>> 0.4000
