In [41]:
import requests
import urllib.request
import time, json, os, traceback
from json import JSONDecodeError
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from time import sleep
from collections import deque
import pandas as pd
import glob

class StockTwitsAPIScraper:
    def __init__(self, symbol, date, maxId):
        self.symbol = symbol
        self.link = "https://api.stocktwits.com/api/2/streams/symbol/{}.json?".format(symbol)
        self.targetDate = date
        self.tweets = []
        self.reqeustQueue = deque()
        self.maxId = maxId
        self.initDir()

    def setLimits(self, size, duration):
        self.size = size
        self.duration = duration
        self.requestInterval = duration // size + 1 if duration % size else duration // size

    # create directions if they don't exist
    def initDir(self):
        if not os.path.isdir("stocks"):
            os.mkdir("stocks")
        if not os.path.isdir("stocks/{}".format(self.symbol)):
            os.mkdir("stocks/{}".format(self.symbol))

    # write tweets we get and the ID of the last tweet in case system break down
    def writeJson(self):
        if self.tweets:
            self.maxId = self.tweets[-1]["id"]
            fileName = "stocks/{}/{}.json".format(self.symbol, self.maxId)
            with open(fileName, "w") as f:
                json.dump(self.tweets, f)
    
    def getCurrentUrl(self):
        return self.link + "max={}".format(self.maxId)

    # request manager
    # can't exceed 200 requests within an hour
    def requestManager(self):
        if len(self.reqeustQueue) == self.size:
            now = datetime.now()
            firstRequest = self.reqeustQueue.popleft()
            if now < firstRequest + timedelta(seconds=self.duration):
                timeDiff = firstRequest - now
                waitTime = timeDiff.total_seconds() + 1 + self.duration                
                print("Reach request limit, wait for {} seconds.".format(waitTime))
                sleep(waitTime)

    def getMessages(self, url):
        self.requestManager()

        response = requests.get(url)
        self.reqeustQueue.append(datetime.now())
        try:
            data = json.loads(response.text)
        except JSONDecodeError:
            if "Bad Gateway" in response.text:
                print("Just a Bad Gateway, wait for 1 minute.")
                sleep(60)
                return True
            print(len(self.reqeustQueue))
            print(self.reqeustQueue[0], datetime.now())
            print(url)
            print(response.text)
            print(traceback.format_exc())
            raise Exception("Something worong with the response.")
        if data and data["response"]["status"] == 200:
            data["cursor"]["max"]
            for m in data["messages"]:
                record = {}            
                createdAt = datetime.strptime(m["created_at"], "%Y-%m-%dT%H:%M:%SZ")
                if createdAt < self.targetDate:
                    return False
                record["id"] = m["id"]
                record["text"] = m["body"]
                record["time"] = createdAt.timestamp()
                record["sentiment"] = m["entities"]["sentiment"]["basic"] if m["entities"]["sentiment"] else ""
                self.tweets.append(record)
        else:
            print(response.text)        
        return True

    def getTweetsAndWriteToFile(self):        
        if not self.getMessages(self.getCurrentUrl()):
            return False
        self.writeJson()
        print("Scrap {} tweets starting from {}.".format(len(self.tweets), self.maxId))
        self.tweets.clear()
        sleep(self.requestInterval)
        return True

    def scrapTweets(self):        
        try:
            doScrap = True
            while doScrap:
                doScrap = self.getTweetsAndWriteToFile()
        except Exception:
            print(traceback.format_exc())

symbol = input("Enter stock symbol: ")
print("This scraper scraps tweets backward.\n\
The ID you put in belongs the most recent tweet you're goint go scrap.\n\
And the scraper will keep going backward to scrap older tweets.")
maxId = input("Enter the starting tweet ID: ")
targetDate = input("Enter the earlest date (mmddyyyy): ")
print("You can only send 200 requests to StockTwits in an hour.")
requestLimit = input("Enter the limit of number of requests within an hour: ")

scraper = StockTwitsAPIScraper(symbol, datetime.strptime(targetDate, "%m%d%Y"), int(maxId))
scraper.setLimits(int(requestLimit), 3600)
scraper.scrapTweets()

Enter stock symbol: AAPL
This scraper scraps tweets backward.
The ID you put in belongs the most recent tweet you're goint go scrap.
And the scraper will keep going backward to scrap older tweets.
Enter the starting tweet ID: 232908259
Enter the earlest date (mmddyyyy): 07012020
You can only send 200 requests to StockTwits in an hour.
Enter the limit of number of requests within an hour: 199
Scrap 30 tweets starting from 232901943.
Scrap 30 tweets starting from 232893386.
Scrap 30 tweets starting from 232889201.
Scrap 30 tweets starting from 232879091.
Scrap 30 tweets starting from 232874580.
Scrap 30 tweets starting from 232868686.
Scrap 30 tweets starting from 232863840.
Scrap 30 tweets starting from 232858622.
Scrap 30 tweets starting from 232854072.
Scrap 30 tweets starting from 232847987.
Scrap 30 tweets starting from 232843385.
Scrap 30 tweets starting from 232838988.
Scrap 30 tweets starting from 232836590.
Scrap 30 tweets starting from 232834496.
Scrap 30 tweets starting from 2

Scrap 30 tweets starting from 232411676.
Scrap 30 tweets starting from 232408875.
Scrap 30 tweets starting from 232407013.
Scrap 30 tweets starting from 232405386.
Scrap 30 tweets starting from 232403206.
Scrap 30 tweets starting from 232401812.
Scrap 30 tweets starting from 232400383.
Scrap 30 tweets starting from 232398868.
Scrap 30 tweets starting from 232396939.
Scrap 30 tweets starting from 232394571.
Scrap 30 tweets starting from 232392958.
Scrap 30 tweets starting from 232390587.
Scrap 30 tweets starting from 232387389.
Scrap 30 tweets starting from 232385461.
Scrap 30 tweets starting from 232383884.
Scrap 30 tweets starting from 232381831.
Scrap 30 tweets starting from 232380371.
Scrap 30 tweets starting from 232379134.
Scrap 30 tweets starting from 232377864.
Scrap 30 tweets starting from 232376452.
Scrap 30 tweets starting from 232374960.
Scrap 30 tweets starting from 232373816.
Scrap 30 tweets starting from 232372602.
Scrap 30 tweets starting from 232371258.
Scrap 30 tweets 

KeyboardInterrupt: 

In [42]:
def merge(symbol):
    '''
    input all .json files 
    merge them into one df
    '''
    path = r'/Users/hau/Desktop/Text_group_project/stock-twits-scraper-master/stocks/' # use the path where .json are saved
    all_files = glob.glob(path + symbol + "/*.json")

    dfs = [] 
    for file in all_files:
        data = pd.read_json(file) 
        dfs.append(data) 
    temp = pd.concat(dfs, ignore_index=True)
    temp['time'] = pd.to_datetime(temp.time, unit = 's') - timedelta(hours = 20)
    
    return temp

In [46]:
aapl = merge('AAPL')
aapl.sort_values(by = 'time')
#aapl.to_csv("aapl.csv")

Unnamed: 0,id,text,time,sentiment
140186,232343263,$AAPL reverse split?!,2020-07-30 04:33:15,
140185,232343281,$aapl 4 for 1 stock split jesussssss,2020-07-30 04:33:17,
140184,232343295,$AAPL 4;1 stock split .. that would pop it ano...,2020-07-30 04:33:19,
140183,232343310,$AMZN $AAPL with a 4-1 split,2020-07-30 04:33:21,
140182,232343313,$AAPL FUUUUUUUCK YES,2020-07-30 04:33:21,
...,...,...,...,...
164067,249505699,"$HPQ $DELL $AAPL Lenovo, HP, and Dell led PCs ...",2020-10-10 01:32:49,
164068,249505698,$AAPL going to be great going into📱,2020-10-10 01:32:49,
164066,249505858,$AAPL any likelihood appl heads to 95 after ea...,2020-10-10 01:35:25,
164065,249505977,$AAPL NEW ARTICLE : 2 Top Growth Stocks You Ca...,2020-10-10 01:37:17,


In [24]:
amzn = merge('AMZN') #240702326
amzn.sort_values(by = 'time')
#amzn.to_csv("amzn.csv")

Unnamed: 0,id,text,time,sentiment
7258,240702326,$AMZN As I said 100%,2020-08-31 09:18:53,
7257,240702737,$AMZN c&#39;mon sluggo bezos do something!,2020-08-31 09:21:22,Bullish
7256,240702757,"$CRWD $ZM at $120b cap, CRWD at $27b cap. this...",2020-08-31 09:21:30,Bullish
7255,240703095,$SPY At some point you have to let go of what ...,2020-08-31 09:23:33,
7254,240703184,"$AMZN nice after hour gains, tomorrow we take ...",2020-08-31 09:24:07,
...,...,...,...,...
25353,249332682,$AMD in out. Just bought $AMZN Calls. At lea...,2020-10-09 00:01:11,Bearish
25352,249333068,$AMZN up $36.00 from next suggested buy entry ...,2020-10-09 00:02:15,Bullish
25351,249333217,$AMZN no more tweets please,2020-10-09 00:02:40,Bullish
25350,249333592,"$AMZN at least today, this applies",2020-10-09 00:03:44,Bullish


In [31]:
fb = merge('FB') 
fb.sort_values(by = 'time')
#amzn.to_csv("amzn.csv")

Unnamed: 0,id,text,time,sentiment
2969,244555857,$FB OMG.. Seriously People.. KIM KARDASHIAN **...,2020-09-17 03:27:38,Bullish
2968,244556226,$fb $aapl $msft $crm most people are beting on...,2020-09-17 03:28:55,
2967,244556378,$FB Glad I sold at 268. Will buy again tomorro...,2020-09-17 03:29:23,
2966,244556599,$FB 242 worst case-but I bought here at 251,2020-09-17 03:30:07,Bullish
2965,244557191,$FB Fakebook has been a source of “discord” wi...,2020-09-17 03:32:02,Bearish
...,...,...,...,...
424,249725015,$FB wow what a chart today,2020-10-12 00:52:09,
423,249726548,"$FB $325 eow, Mark this post 👌",2020-10-12 00:57:26,Bullish
422,249726708,$FB hopefully this will be a multi day rally...,2020-10-12 00:57:56,Bullish
421,249726882,$FB Facebook reverses stance on Holocaust deni...,2020-10-12 00:58:33,
