In [1]:
import json
import csv
import tweepy
import re
from datetime import date,datetime,timedelta

from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import numpy as np
import yfinance as yf

## 1. Twitter API:

In [2]:
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` deleted.

    Args:
        input_txt: The text to clean.
        pattern: A regular expression (string or compiled pattern) describing
            the substrings to remove.

    Returns:
        str: ``input_txt`` with all non-overlapping matches of ``pattern``
        replaced by the empty string.
    """
    # Substitute on the pattern itself rather than on the substrings it matched:
    # matched text is not an escaped regex (URLs contain '.', '?', '(' ...), and
    # a short match such as '@a' would also eat the prefix of a longer '@ab'.
    return re.sub(pattern, '', input_txt)
    
def clean_tweets(tweets):
    """Strip Twitter noise from a collection of tweet texts.

    For every text, in this order:
      1. retweet prefixes (``RT @user:``) are removed,
      2. remaining handles (``@user``) are removed,
      3. URLs (``http://...`` / ``https://...``) are removed,
      4. every character that is not an ASCII letter (digits, punctuation,
         ``#``, ``$``, emoji, ...) is replaced by a single space.

    Args:
        tweets: Array-like of strings (list, NumPy array, pandas Series).

    Returns:
        numpy.ndarray: String array with the same shape as ``tweets`` holding
        the cleaned texts. Empty input yields an empty array.
    """
    # Raw strings keep "\w" from being read as an (invalid) string escape.
    removal_patterns = (
        re.compile(r"RT @[\w]*:"),               # retweet prefix
        re.compile(r"@[\w]*"),                   # handles
        re.compile(r"https?://[A-Za-z0-9./]*"),  # URL links
    )
    # A real regex substitution is required here: NumPy's char ``replace`` is a
    # literal substring replace and would only match the text "[^a-zA-Z]".
    non_letters = re.compile(r"[^a-zA-Z]")

    def _clean_one(text):
        # Apply the removals in order, then blank out the non-letter characters.
        for removal in removal_patterns:
            text = removal.sub("", text)
        return non_letters.sub(" ", text)

    texts = np.asarray(tweets, dtype=str)
    if texts.size == 0:
        # Nothing to clean; hand back the (empty) string array as-is.
        return texts

    cleaned = [_clean_one(text) for text in texts.ravel()]
    return np.array(cleaned, dtype=str).reshape(texts.shape)

def get_tweets(hashtag_phrase):
    """Fetch today's English tweets for a cashtag, score them with VADER, save and return them.

    The search query is ``$<hashtag_phrase> -filter:retweets`` restricted to
    the window from today to tomorrow. Each tweet's VADER ``compound`` score
    is multiplied by ``1 + followers_count / max(followers_count)`` so that
    tweets from accounts with more followers weigh more. The scored frame is
    written to ``~/LighthouseLabs-Final/<hashtag_phrase>_<YYYY-MM-DD>.csv``.

    Args:
        hashtag_phrase: Ticker symbol without the leading ``$`` (e.g. ``'AAPL'``).

    Returns:
        pandas.DataFrame: Columns ``timestamp``, ``tweet_text`` (cleaned),
        ``followers_count``, ``scaled_followers_count`` and the VADER scores
        ``neg``, ``neu``, ``pos``, ``compound`` (follower-weighted). When the
        search returns no tweets, an empty frame with those columns is
        returned and no CSV is written.

    Raises:
        KeyError: If any of the environment variables ``consumer_key``,
            ``consumer_secret``, ``twitter_access_token`` or
            ``twitter_access_secret`` is not set.
    """
    format_hashtag = '$' + hashtag_phrase
    start_date = date.today()
    end_date = date.today() + timedelta(days=1)

    # Credentials come from the environment so they never live in the notebook.
    consumer_key = os.environ['consumer_key']
    consumer_secret = os.environ['consumer_secret']
    access_token = os.environ['twitter_access_token']
    access_token_secret = os.environ['twitter_access_secret']

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # NOTE(review): ``api.search`` is the Tweepy 3.x method name; newer Tweepy
    # releases are understood to call it ``search_tweets`` -- confirm against
    # the installed version.
    cursor = tweepy.Cursor(api.search, q=format_hashtag+' -filter:retweets', lang="en",
                           tweet_mode='extended', since=start_date, until=end_date)

    # Collect plain records first and build the frame once.
    records = [
        {
            'timestamp': tweet.created_at,
            'tweet_text': tweet.full_text.replace('\n', ' '),
            'followers_count': tweet.user.followers_count,
        }
        for tweet in cursor.items()
    ]
    base_columns = ['timestamp', 'tweet_text', 'followers_count']
    score_columns = ['neg', 'neu', 'pos', 'compound']
    twitter_posts = pd.DataFrame(records, columns=base_columns)

    if twitter_posts.empty:
        # Nothing to clean or score: return an empty frame with the usual
        # columns instead of failing further down on the missing scores.
        return twitter_posts.reindex(
            columns=base_columns + ['scaled_followers_count'] + score_columns)

    max_followers = twitter_posts['followers_count'].max()
    if max_followers > 0:
        twitter_posts['scaled_followers_count'] = twitter_posts['followers_count'] / max_followers
    else:
        # Every author has zero followers: 0/0 would turn all weights into NaN.
        twitter_posts['scaled_followers_count'] = 0.0

    vader = SentimentIntensityAnalyzer()
    twitter_posts['tweet_text'] = clean_tweets(twitter_posts['tweet_text'])
    scores = twitter_posts['tweet_text'].apply(vader.polarity_scores).tolist()
    scores_df = pd.DataFrame(scores)
    df = twitter_posts.join(scores_df, rsuffix='_right')
    # Weight sentiment by reach: factor ranges from 1 (no followers) to 2 (most followed).
    df['compound'] = df['compound'] * (twitter_posts['scaled_followers_count'] + 1)

    df.to_csv('~/LighthouseLabs-Final/' + hashtag_phrase + '_' + (datetime.today().strftime('%Y-%m-%d')) + '.csv')
    return df

## 2. News Headlines:

In [12]:
def get_news(ticker_code):
    """Scrape FinViz headlines for one ticker, score them with VADER and save a CSV.

    The quote page ``https://finviz.com/quote.ashx?t=<ticker_code>`` is
    downloaded, the element with id ``news-table`` is parsed row by row, every
    headline is scored, and the result is written to
    ``~/LighthouseLabs-Final/Dataset/2. FinViz_Headline_Data/<ticker>_data_<YYYY-MM-DD-HH>.csv``.

    Args:
        ticker_code: Ticker symbol to look up (e.g. ``'AAPL'``).

    Returns:
        None. The scored headlines are only written to disk.

    Raises:
        ValueError: If the downloaded page contains no ``news-table`` element.
    """
    # 1. Request the page. The context manager closes the HTTP response even
    #    when parsing fails.
    finwiz_url = 'https://finviz.com/quote.ashx?t='
    req = Request(url=finwiz_url + ticker_code, headers={'user-agent': 'my-app/0.0.1'})
    with urlopen(req) as response:
        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        html = BeautifulSoup(response, 'html.parser')

    news_table = html.find(id='news-table')
    if news_table is None:
        raise ValueError("No 'news-table' element found on the FinViz page for " + repr(ticker_code))

    # 2. Parse the rows. The ticker is the part of the code before the first '_'.
    ticker = ticker_code.split('_')[0]
    parsed_news = []
    # A row's first cell holds either "<date> <time>" or only "<time>"; in the
    # latter case the date of the most recent dated row is carried over.
    news_date = None
    for row in news_table.findAll('tr'):
        if row.a is None or row.td is None:
            # Not a headline row (no link or no timestamp cell): skip it.
            continue
        stamp = row.td.text.split()
        if not stamp:
            continue
        if len(stamp) == 1:
            news_time = stamp[0]
        else:
            news_date, news_time = stamp[0], stamp[1]
        parsed_news.append([ticker, news_date, news_time, row.a.get_text()])

    # 3. Score the headlines and save.
    vader = SentimentIntensityAnalyzer()
    columns = ['ticker', 'date', 'time', 'headline']
    parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
    scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
    scores_df = pd.DataFrame(scores)
    parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
    # Convert the date column from string to a plain date.
    parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news['date']).dt.date
    parsed_and_scored_news.to_csv('~/LighthouseLabs-Final/Dataset/2. FinViz_Headline_Data/' + ticker + '_data_' + (datetime.today().strftime('%Y-%m-%d-%H')) + '.csv')

In [13]:
# Scrape, score and save FinViz headlines for each tracked ticker in turn.
news_tickers = (
    'AAPL', 'TSLA', 'AMZN', 'FB', 'GOOG', 'NFLX',
    'MSFT', 'NVDA', 'JNJ', 'CVX', 'PFE', 'NKE', 'GS',
)
for news_ticker in news_tickers:
    get_news(news_ticker)

print("Done!")

Done!


## 3. Historical Stock Data:

In [6]:
def stock_data(ticker, start_date='2020-09-23', end_date=None):
    """Download 30-minute price bars for ``ticker`` and add model features.

    Added columns:
      * ``Percent Price Change Within Period``: ``(Close - Open) / Open * 100``
      * ``Scaled Volume``: ``Volume`` divided by its mean over the download
      * ``SMA(3)``: mean ``Adj Close`` of the three preceding bars
      * ``t+1``: ``Adj Close`` of the following bar

    The ``Open``, ``High``, ``Low`` and ``Close`` columns are dropped, and the
    ``Datetime`` column is converted to America/Montreal wall-clock time with
    the timezone information removed. Nothing is written to disk.

    Args:
        ticker: Ticker symbol understood by ``yfinance`` (e.g. ``'AAPL'``).
        start_date: First day to request. Defaults to ``'2020-09-23'``.
            NOTE(review): Yahoo is understood to serve intraday bars only for
            a limited recent window -- confirm this default still returns data.
        end_date: Exclusive end of the request; defaults to tomorrow so that
            today's bars are included.

    Returns:
        pandas.DataFrame: One row per 30-minute bar with the feature columns
        described above.

    Raises:
        ValueError: If the download returns no rows.
    """
    if end_date is None:
        end_date = date.today() + timedelta(days=1)

    # 1. Request data. ``auto_adjust=False`` is spelled out because the
    #    features below need the separate 'Adj Close' column.
    data = yf.download(ticker,
                       start=start_date,
                       end=end_date,
                       interval='30m',
                       auto_adjust=False,
                       progress=False)
    if data.empty:
        raise ValueError("No price data returned for " + repr(ticker))

    # NOTE(review): some yfinance releases are reported to return
    # (field, ticker) MultiIndex columns even for a single ticker -- confirm
    # against the installed version. Flatten to the field names in that case.
    if isinstance(data.columns, pd.MultiIndex) and data.columns.get_level_values(-1).nunique() == 1:
        data.columns = data.columns.get_level_values(0)

    # 2. Feature engineering.
    data['Percent Price Change Within Period'] = ((data['Close'] - data['Open'])/data['Open'])*100
    data['Scaled Volume'] = data['Volume']/data['Volume'].mean()
    # shift(1) so the average uses only bars strictly before the current one.
    data['SMA(3)'] = data['Adj Close'].rolling(window=3).mean().shift(1)
    # Prediction target: the next bar's adjusted close.
    data['t+1'] = data['Adj Close'].shift(-1)

    data = data.reset_index()
    data['Datetime'] = data['Datetime'].dt.tz_convert('America/Montreal').dt.tz_localize(None)
    return data.drop(columns=['Open', 'High', 'Low', 'Close'])

In [15]:
# Download the first group of tickers; their frames are not kept.
for symbol in ('AAPL', 'GOOG', 'FB', 'AMZN', 'NFLX', 'TSLA'):
    stock_data(symbol)

# Final expression of the cell: the tuple of results for the second group is displayed.
tuple(stock_data(symbol) for symbol in ('JNJ', 'MSFT', 'CVX', 'NKE', 'PFE', 'GS', 'NVDA'))


Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!
Data saved!


(None, None, None, None, None, None, None)