In [9]:
import csv
import json
import os
import re
import warnings
from datetime import date,timedelta
from datetime import datetime
from urllib.request import urlopen, Request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tweepy
import yfinance as yf
from bs4 import BeautifulSoup
from nltk.sentiment.vader import SentimentIntensityAnalyzer

%matplotlib inline

## 1. Retrieve All Data:

In [92]:
def stock_data(ticker):
    """Download today's 30-minute bars for ``ticker`` and attach derived features.

    Parameters
    ----------
    ticker : str
        Symbol passed straight to ``yf.download``.

    Returns
    -------
    pandas.DataFrame
        The downloaded bars with the index reset into a ``Datetime`` column
        (converted to 'America/Montreal' and then made timezone-naive), plus:

        * ``Percent Price Change Within Period`` - (Close - Open) / Open * 100
        * ``Change in Close Price`` - Close minus the previous bar's Close
        * ``Scaled Volume`` - Volume divided by the mean Volume of the download
        * ``SMA(3)`` - mean of the three preceding ``Adj Close`` values
    """
    # Request window: start today, end tomorrow.
    end_date = date.today() + timedelta(days=1)
    start_date = date.today()

    # 1. Request data:
    bars = yf.download(
        ticker,
        start=start_date,
        end=end_date,
        interval='30m',
        progress=False,
    )

    # 2. Feature Engineering:
    open_price = bars['Open']
    close_price = bars['Close']
    volume = bars['Volume']

    bars['Percent Price Change Within Period'] = ((close_price - open_price) / open_price) * 100
    bars['Change in Close Price'] = close_price - close_price.shift(1)
    bars['Scaled Volume'] = volume / volume.mean()
    # The trailing shift(1) keeps the current bar out of its own moving average.
    bars['SMA(3)'] = bars['Adj Close'].rolling(window=3).mean().shift(1)

    # 3. Expose the timestamp as a plain column in local, timezone-naive time.
    bars = bars.reset_index()
    local_time = bars['Datetime'].dt.tz_convert('America/Montreal')
    bars['Datetime'] = local_time.dt.tz_localize(None)
    return bars

In [126]:
def get_news(ticker_code):
    """Scrape the finviz news table for one ticker and score each headline with VADER.

    Parameters
    ----------
    ticker_code : str
        Ticker symbol appended to the finviz quote URL.

    Returns
    -------
    pandas.DataFrame
        One row per headline with the columns ``ticker``, ``date_time``,
        ``headline`` followed by the keys of ``vader.polarity_scores``.

    Raises
    ------
    ValueError
        If the downloaded page has no element with ``id="news-table"``.
    """
    # 1. Define URL:
    finwiz_url = 'https://finviz.com/quote.ashx?t='

    # 2. Requesting data:
    req = Request(url=finwiz_url + ticker_code, headers={'user-agent': 'my-app/0.0.1'})
    # The context manager closes the HTTP response once the page has been parsed.
    with urlopen(req) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        html = BeautifulSoup(response, 'html.parser')
    news_table = html.find(id='news-table')
    if news_table is None:
        raise ValueError("No 'news-table' element found on the page for ticker %r" % ticker_code)

    # 3. Parsing news:
    # Keep only the part of the code before the first '_'.
    ticker = ticker_code.split('_')[0]
    parsed_news = []
    # A row carries either "<date> <time>" or only "<time>"; in the latter case
    # the date of the most recent dated row applies.
    current_date = None
    for row in news_table.find_all('tr'):
        # Rows without a link or a timestamp cell carry no headline; skip them.
        if row.a is None or row.td is None:
            continue
        headline = row.a.get_text()
        stamp_parts = row.td.text.split()
        if not stamp_parts:
            continue
        if len(stamp_parts) == 1:
            headline_time = stamp_parts[0]
        else:
            current_date = stamp_parts[0]
            headline_time = stamp_parts[1]
        if current_date is None:
            # No dated row has been seen yet, so this headline cannot be placed in time.
            continue
        parsed_news.append([ticker, current_date, headline_time, headline])

    # 4. Split into columns and score:
    vader = SentimentIntensityAnalyzer()
    columns = ['ticker', 'date', 'time', 'headline']
    parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
    # One dict of polarity scores per headline, expanded into columns.
    scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
    scores_df = pd.DataFrame(scores)
    parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
    # Combine the separate date and time strings into a single timestamp column.
    combined_stamp = pd.to_datetime(parsed_and_scored_news['date'] + ' ' + parsed_and_scored_news['time'])
    parsed_and_scored_news.insert(loc=1, column='date_time', value=combined_stamp)
    return parsed_and_scored_news.drop(columns=['date', 'time'])

In [99]:
def search_for_hashtags(hashtag_phrase):
    """Collect English, non-retweet tweets that mention the cashtag ``$<hashtag_phrase>``.

    Twitter API credentials are read from the environment variables
    ``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_ACCESS_TOKEN`` and ``TWITTER_ACCESS_TOKEN_SECRET``.
    SECURITY: earlier revisions of this cell embedded API keys directly in the
    source; any key that was ever committed should be revoked and regenerated.

    Parameters
    ----------
    hashtag_phrase : str
        Symbol to search for; a leading '$' is added here.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` (``tweet.created_at``), ``tweet_text`` (the full
        text with newlines replaced by spaces, UTF-8 encoded to ``bytes``) and
        ``followers_count``.

    Raises
    ------
    RuntimeError
        If one of the credential environment variables is not set.
    """
    format_hashtag = '$' + hashtag_phrase

    try:
        consumer_key = os.environ['TWITTER_CONSUMER_KEY']
        consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
        access_token = os.environ['TWITTER_ACCESS_TOKEN']
        access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']
    except KeyError as missing:
        raise RuntimeError(
            'Twitter credentials are not configured; set environment variable %s' % missing
        ) from missing

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    timestamp = []
    tweets = []
    follow_count = []
    query = format_hashtag + ' -filter:retweets'
    # A single pass over the cursor: re-running the search after it finishes
    # would append the same tweets again.
    # NOTE(review): api.search / tweepy.TweepError are the tweepy 3.x names.
    try:
        for tweet in tweepy.Cursor(api.search, q=query, lang="en", tweet_mode='extended').items():
            timestamp.append(tweet.created_at)
            tweets.append(tweet.full_text.replace('\n', ' ').encode('utf-8'))
            follow_count.append(tweet.user.followers_count)
    except tweepy.TweepError as error:
        # Best effort: keep whatever was collected before the API refused
        # further requests, but make the truncation visible.
        warnings.warn('Twitter search stopped early after %d tweets: %s' % (len(tweets), error))

    twitter_posts = pd.DataFrame(
        {'timestamp': timestamp, 'tweet_text': tweets, 'followers_count': follow_count},
        columns=['timestamp', 'tweet_text', 'followers_count'],
    )
    return twitter_posts
        

In [100]:
# Fetch tweets mentioning $AAPL; search_for_hashtags prepends the '$' and
# queries the Twitter API, so this cell needs network access.
tweets = search_for_hashtags('AAPL')

2293


In [101]:
# Display the collected tweets DataFrame as the cell output.
tweets

Unnamed: 0,timestamp,tweet_text,followers_count
0,2020-09-30 20:16:22,b'#FAANG Stocks Overview: https://t.co/QEVM36K...,1085
1,2020-09-30 20:15:49,"b""$AAPL pushes through Tuesday's high: https:/...",853
2,2020-09-30 20:15:40,"b""Can't Wait $aapl $amzn $abbv $ba $brk $bhc ...",130
3,2020-09-30 20:15:00,b'Get today winning alerts https://t.co/ou6BBH...,18
4,2020-09-30 20:15:00,b'\xf0\x9f\x94\xb4\xf0\x9f\x94\xb4 You are in...,143246
...,...,...,...
2288,2020-09-29 21:14:54,b'Most active Tuesday - $NIO $HUSA $AAPL $ADIL...,12330
2289,2020-09-29 21:13:13,"b""I've made 35k with them . If you want to m...",0
2290,2020-09-29 21:12:55,"b""Most active stocks from today's after-hours ...",11745
2291,2020-09-29 21:12:17,b'@WarrenBuffett geico will acquire $lmnd 170%...,29


In [None]:
def calc_change_sentiment(data, col):
    """Return the row-over-row change of the sentiment column ``col``.

    The first entry is the first value of the column itself. Every later
    entry is 0 when the current value is exactly 0, otherwise the current
    value minus the previous one. A NaN value yields a NaN change, so the
    result always has one entry per row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the sentiment column; any index is accepted because
        rows are read by position.
    col : str
        Name of the column to difference.

    Returns
    -------
    list
        One entry per row of ``data`` (empty when ``data`` has no rows).
    """
    values = data[col].tolist()
    if not values:
        return []
    change_in_sent = [values[0]]
    for previous, current in zip(values, values[1:]):
        if current == 0:
            change_in_sent.append(0)
        else:
            # Also covers NaN: the difference is NaN and the list stays aligned with the rows.
            change_in_sent.append(current - previous)
    return change_in_sent

def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The pattern itself is substituted. Feeding each matched string back into
    ``re.sub`` as a new regex would fail to remove, or raise ``re.error`` on,
    matches that contain metacharacters such as '?', '+' or '('.
    """
    return re.sub(pattern, '', input_txt)
    
def clean_tweets(tweets):
    """Strip Twitter noise from a collection of tweet texts.

    For each text, in order:

    1. ``bytes`` values are decoded as UTF-8; a leading ``b'`` / ``b"`` left
       over from a stringified bytes object is dropped (only at the start, so
       words such as "club's" are not damaged),
    2. retweet markers (``RT @xxx:``), handles (``@xxx``) and URLs are removed,
    3. every character that is not an ASCII letter is replaced by a space.

    Step 3 is a real regex substitution; a literal string replacement of the
    text "[^a-zA-Z]" would never match anything.

    Parameters
    ----------
    tweets : array-like of str or bytes
        Tweet texts (list, numpy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        String array with the same shape as ``tweets``.
    """
    bytes_prefix = re.compile(r"""^b['"]""")
    noise_patterns = (
        re.compile(r"RT @[\w]*:"),               # twitter return handles (RT @xxx:)
        re.compile(r"@[\w]*"),                   # twitter handles (@xxx)
        re.compile(r"https?://[A-Za-z0-9./]*"),  # URL links (httpxxx)
    )
    non_letters = re.compile(r"[^a-zA-Z]")

    def _clean_one(text):
        # Clean a single tweet; bytes are decoded before any pattern is applied.
        if isinstance(text, (bytes, bytearray)):
            text = bytes(text).decode('utf-8', errors='replace')
        text = bytes_prefix.sub('', str(text))
        for pattern in noise_patterns:
            text = pattern.sub('', text)
        return non_letters.sub(' ', text)

    raw = np.asarray(tweets, dtype=object)
    cleaned = [_clean_one(item) for item in raw.ravel()]
    return np.array(cleaned, dtype=str).reshape(raw.shape)

In [None]:
def classify_news(dataframe, datetime_column_name):
    """Split rows into four per-day frames, keeping only hours 9 through 15.

    Day 1 is the calendar date of the first non-null timestamp in the column;
    days 2-4 are the next three calendar dates. Rows outside the 9..15 hour
    range, and rows more than three days after day 1, are left out.

    Each row is compared against that reference date. Comparing a row's day
    with itself would always be true and would place every in-hours row in
    the first bucket. Whole dates are subtracted, so the split also works
    across month boundaries.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the timestamp column; rows are addressed by position.
    datetime_column_name : str
        Name of the datetime column.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(news_d1, news_d2, news_d3, news_d4)``.
    """
    timestamps = pd.to_datetime(dataframe[datetime_column_name]).reset_index(drop=True)
    day_rows = [[], [], [], []]

    known = timestamps.dropna()
    if len(known):
        first_day = known.iloc[0].normalize()
        for position, stamp in enumerate(timestamps):
            if pd.isna(stamp) or not (9 <= stamp.hour <= 15):
                continue
            offset = (stamp.normalize() - first_day).days
            if 0 <= offset < len(day_rows):
                day_rows[offset].append(position)

    news_d1, news_d2, news_d3, news_d4 = (dataframe.iloc[rows] for rows in day_rows)
    return news_d1, news_d2, news_d3, news_d4

In [None]:
def preprocess_headlines(data):
    """Aggregate scored headlines into 30-minute bins and add sentiment-change features.

    The input frame is not modified, so the function can be called repeatedly
    on the same object.

    Parameters
    ----------
    data : pandas.DataFrame
        Scored headlines with at least the columns ``headline``, ``ticker``,
        ``date_time`` and ``compound``.

    Returns
    -------
    pandas.DataFrame
        The 30-minute rows selected by ``classify_news`` (first row of each
        day group dropped), with the extra columns
        ``change in sentiment headlines`` and
        ``change in sentiment headlines (t-1)``.
    """
    headlines = (
        data.drop_duplicates(subset='headline', keep=False)  # keep=False drops every copy of a repeated headline
            .drop('ticker', axis=1)
            .set_index('date_time')
    )
    # numeric_only=True: the headline text column cannot be aggregated with a median.
    data_30m = headlines.resample('30min').median(numeric_only=True).ffill().reset_index()
    change_in_sent = calc_change_sentiment(data_30m, 'compound')
    data_30m['change in sentiment headlines'] = change_in_sent
    data_30m['change in sentiment headlines (t-1)'] = data_30m['change in sentiment headlines'].shift(1)

    day_groups = classify_news(data_30m, 'date_time')
    # Drop the first row of each day group before stacking them back together.
    frames_news = [group.iloc[1:] for group in day_groups]
    processed_headlines = pd.concat(frames_news)
    return processed_headlines


In [None]:
def preprocess_posts(dataframe):
    """Score tweets with VADER, weight them by follower count and bin to 30 minutes.

    The input frame is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Tweets with the columns ``timestamp``, ``tweet_text`` and
        ``followers_count``. Timestamps may be naive (treated as UTC),
        timezone-aware or parseable strings.

    Returns
    -------
    pandas.DataFrame
        The 30-minute rows selected by ``classify_news`` (first row of each
        day group dropped), including ``adj compound``,
        ``change in sentiment twitter`` and
        ``change in sentiment twitter (t-1)``.
    """
    vader = SentimentIntensityAnalyzer()
    # Work on a positionally indexed copy so the caller's frame stays untouched
    # and the score frame below lines up row for row.
    posts = dataframe.reset_index(drop=True)
    posts['tweet_text'] = clean_tweets(posts['tweet_text'])
    scores = posts['tweet_text'].apply(vader.polarity_scores).tolist()
    scores_df = pd.DataFrame(scores)

    df = posts.join(scores_df, rsuffix='_right')
    df = df[['timestamp', 'tweet_text', 'followers_count', 'neg', 'neu', 'pos', 'compound']].copy()
    # utc=True localizes naive stamps as UTC and converts aware ones, so both
    # kinds end up in naive 'America/Montreal' wall-clock time.
    df['timestamp'] = (
        pd.to_datetime(df['timestamp'], utc=True)
          .dt.tz_convert('America/Montreal')
          .dt.tz_localize(None)
    )
    # Weight in [1, 2]: 1 plus the follower count relative to the largest account.
    max_followers = df['followers_count'].max()
    if max_followers > 0:
        follower_ratio = df['followers_count'] / max_followers
    else:
        # No account has followers (or there are no rows): use the neutral weight 1.
        follower_ratio = 0.0
    df['scaled_followers_count'] = follower_ratio + 1
    df['adj compound'] = df['compound'] * df['scaled_followers_count']
    df = df.set_index('timestamp')

    # numeric_only=True: the tweet text column cannot be aggregated with a median.
    twitter_df_30m = df.resample('30min').median(numeric_only=True).ffill().reset_index()
    change_in_sent = calc_change_sentiment(twitter_df_30m, 'adj compound')
    twitter_df_30m['change in sentiment twitter'] = change_in_sent
    twitter_df_30m['change in sentiment twitter (t-1)'] = twitter_df_30m['change in sentiment twitter'].shift(1)

    day_groups = classify_news(twitter_df_30m, 'timestamp')
    # Drop the first row of each day group before stacking them back together.
    frames = [group.iloc[1:] for group in day_groups]
    processed_tweets = pd.concat(frames)
    return processed_tweets

In [None]:
def evaluate_models(baseline_df, headline_df, twitter_df):
    """Fit every model variant and collect its RMSE / R2 into one table.

    Parameters
    ----------
    baseline_df : pandas.DataFrame
        Stock bars; must contain ``Datetime`` and ``Adj Close``.
    headline_df : pandas.DataFrame
        Scored headlines, passed to ``preprocess_headlines``.
    twitter_df : pandas.DataFrame
        Raw tweets, passed to ``preprocess_posts``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``result_df`` - one row per metric in a single ``Values`` column;
        ``full_df`` - stock bars merged with both Twitter and headline features.
    """
    #1. Baseline:
    baseline_rmse, baseline_r2 = baseline_model(baseline_df)
    # Work on a copy so the 't+1' target is not written into the caller's frame.
    baseline_df2 = baseline_df.copy()
    baseline_df2['t+1'] = baseline_df2['Adj Close'].shift(-1)
    lm_baseline_rmse, lm_baseline_r2, sgd_baseline_rmse, sgd_baseline_r2 = linear_modeling_no_sentiment(baseline_df2)
    # The merges below use the stock frame passed in as an argument rather than
    # a notebook-level global, so the function no longer depends on kernel state.
    # NOTE(review): this assumes the global `stock_df` previously read here was
    # the same data as `baseline_df` - confirm against the calling cell.
    #2. Headline Final Merge:
    headlines_final = preprocess_headlines(headline_df)
    with_headlines_df = baseline_df2.merge(headlines_final, left_on='Datetime', right_on='date_time').drop('date_time',axis=1)
    # Recompute the target after merging: the merge can drop rows, which changes what "next row" means.
    with_headlines_df['t+1'] = with_headlines_df['Adj Close'].shift(-1)
    #3. Twitter Final Merge:
    final_twitter = preprocess_posts(twitter_df)
    with_twitter_df = baseline_df2.merge(final_twitter, left_on='Datetime', right_on='timestamp').drop('timestamp',axis=1)
    with_twitter_df['t+1'] = with_twitter_df['Adj Close'].shift(-1)
    #4. Full Merge:
    full_df = with_twitter_df.merge(headlines_final, left_on='Datetime', right_on='date_time').drop('date_time',axis=1)
    full_df['t+1'] = full_df['Adj Close'].shift(-1)
    #5. Evaluating Models:
    lm_headlines_rmse, lm_headlines_r2, sgd_headlines_rmse, sgd_headlines_r2,xgb_headlines_rmse,xgb_headlines_r2 = linear_modeling_headlines(with_headlines_df)
    lm_twitter_rmse, lm_twitter_r2, sgd_twitter_rmse, sgd_twitter_r2,xgb_twitter_rmse,xgb_twitter_r2 = linear_model_twitter(with_twitter_df)
    lm_all_rmse, lm_all_r2, sgd_all_rmse, sgd_all_r2, xgb_all_rmse, xgb_all_r2, rf_all_rmse, rf_all_r2 = multi_model_full(full_df)
    #6. Store in dict:
    result_dict = {
    'RMSE - Baseline':baseline_rmse, 'R2 - Baseline':baseline_r2, 'Linear RMSE - Baseline':lm_baseline_rmse, 'Linear R2 - Baseline':lm_baseline_r2, 'SGD RMSE - Baseline':sgd_baseline_rmse, 'SGD R2 - Baseline':sgd_baseline_r2,
    'Linear RMSE - Only Headlines': lm_headlines_rmse, 'Linear R2 - Only Headlines':lm_headlines_r2, 'SGD RMSE - Only Headlines':sgd_headlines_rmse, 'SGD R2 - Only Headlines':sgd_headlines_r2, 'XGB RMSE - Only Headlines':xgb_headlines_rmse, 'XGB R2 - Only Headlines':xgb_headlines_r2,
    'Linear RMSE - Only Twitter':lm_twitter_rmse, 'Linear R2 - Only Twitter':lm_twitter_r2, 'SGD RMSE - Only Twitter':sgd_twitter_rmse, 'SGD R2 - Only Twitter':sgd_twitter_r2, 'XGB RMSE - Only Twitter':xgb_twitter_rmse, 'XGB R2 - Only Twitter':xgb_twitter_r2,
    'Linear RMSE - All':lm_all_rmse, 'Linear R2 - All':lm_all_r2, 'SGD RMSE - All':sgd_all_rmse, 'SGD R2 - All':sgd_all_r2, 'XGB RMSE - All':xgb_all_rmse, 'XGB R2 - All':xgb_all_r2, 'RF RMSE - All':rf_all_rmse,'RF R2 - All':rf_all_r2}
    #7. Convert to DataFrame:
    result_df = pd.DataFrame.from_dict(result_dict, orient='index', columns=['Values'])
    #result_df.to_csv('~/LighthouseLabs-Final/Report_Analysis/AAPL_complete_analysis.csv')
    return result_df, full_df