In [None]:
!pip install neo4j

In [8]:
!python --version

Python 3.8.5


In [17]:
from neo4j import GraphDatabase
import logging
from neo4j.exceptions import ServiceUnavailable
import requests
import datetime as dt
import datetime
import concurrent.futures
import pandas as pd
import py_stringmatching as sm

import snscrape.modules.twitter as sntwitter
import pandas as pd
import os
import json
import concurrent.futures
import time
import datetime as dt
from twitter_scrape import TwitterSearchScraper as tscraper

import tweet_article_lda as tlda

In [18]:
class App:
    """Populate a Neo4j graph with stocks, daily quotes, tweets, news articles and topics.

    Public ``create_*`` / ``run_query`` methods open a session and run the matching
    private static method inside a write transaction. The private methods take the
    transaction ``tx`` as first argument and call each other directly so a whole
    stock (quotes -> articles/posts -> topics) is written in one transaction.

    Notes:
        * ``_create_stock`` and ``_create_quotes`` read the module-level ``iex_key``
          global, which must be defined before they are called.
        * ``_pull_article`` reads the RapidAPI key from the ``RAPIDAPI_KEY``
          environment variable.
        * NOTE(review): the HTTP/scraping calls run *inside* the transaction
          function, so a driver-level retry of the transaction repeats them.
    """

    # Legal-form words removed from a company name before it is used as a search term.
    _COMPANY_SUFFIXES = frozenset(['incorporated', 'corporation', 'limited', 'company',
                                   'inc', 'nv', 'ltd', 'corp', 'co', 'llc'])

    # Indexed by ``datetime.weekday()`` (0 = Monday).
    _WEEKDAYS = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')

    # Relationship types that may be interpolated into topic-linking Cypher.
    _TOPIC_RELATIONS = ('TweetAbout', 'ArticleAbout')

    # News providers whose articles are accepted by ``_create_articles``.
    _NEWS_PROVIDERS = ('marketwatch', '4-traders', 'benzinga', 'yahoo', 'onenewspage', 'thestreet', 'americanbankingnews',
                       'reuters', 'autoevolution', 'investors', 'indiatimes', 'bnnbloomberg', 'businessinsider', 'channelnewsasia',
                       'cnbc', 'motorsport', 'barrons', 'fool', 'wsj', 'usatoday', 'washingtonpost', 'motortrend', 'apnews', 'forbes',
                       'investorplace', 'investing', 'themarketsdaily', 'seekingalpha', 'thisismoney', 'investchronicle', 'tickerreport',
                       'zacks', 'wfmz', 'chron', 'com-unik', 'newsbreak', 'voanews', 'smarteranalyst', 'cyprus-mail', 'thehour')

    def __init__(self, uri, user, password):
        """Open a Neo4j driver for ``uri`` authenticated with ``user``/``password``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def create_stock(self, ticker, start_date, end_date):
        """Create the Stock node for ``ticker`` plus its quotes for the date range."""
        with self.driver.session() as session:
            session.write_transaction(self._create_stock, ticker, start_date, end_date)

    def create_quotes(self, ticker, start_date, end_date):
        """Create Quote nodes for ``ticker`` for each day from ``start_date`` up to (not including) ``end_date``."""
        with self.driver.session() as session:
            session.write_transaction(self._create_quotes, ticker, start_date, end_date)

    def create_user(self, username):
        """Create a User node with the given ``username``."""
        with self.driver.session() as session:
            session.write_transaction(self._create_user, username)

    def create_posts(self, ticker, date):
        """Scrape tweets about ``ticker`` for ``date`` ('YYYY-MM-DD') and store them as Post nodes.

        The previous signature ``create_posts(df)`` could never succeed because
        ``_create_posts`` requires ``(tx, ticker, date)``; every call raised
        ``TypeError``. The wrapper now forwards the arguments the worker needs.
        """
        with self.driver.session() as session:
            session.write_transaction(self._create_posts, ticker, date)

    def create_topic(self, name):
        """Create a Topic node called ``name`` unless one already exists."""
        with self.driver.session() as session:
            session.write_transaction(self._create_topic, name)

    def run_query(self, query):
        """Run ``query`` and return the first returned column as a DataFrame.

        The same data is also written to ``query_results.csv`` by ``_run_query``.
        """
        with self.driver.session() as session:
            return session.write_transaction(self._run_query, query)

    # Method to create stocks
    @staticmethod
    def _create_stock(tx, ticker, start_date, end_date):
        """Create the Stock node (if missing) from IEX company data, then its quotes.

        Nothing is written when the IEX company request does not return HTTP 200.
        Progress and elapsed time are printed.
        """
        print("Making Nodes for Stock: ", ticker)
        start = time.time()
        req = requests.get("https://cloud.iexapis.com/stable/stock/" + ticker + "/company?token=" + iex_key)
        if req.status_code == 200:
            response = req.json()
            compName = response['companyName']
            compIndustry = response['industry']

            query = (
                "MATCH (s:Stock) "
                "WHERE s.ticker = $ticker "
                "RETURN s.ticker"
            )
            if tx.run(query, ticker=ticker).single() is None:
                query = (
                    "CREATE (s1:Stock { ticker: $ticker, compName: $compName, industry: $compIndustry}) "
                    "RETURN s1"
                )
                tx.run(query, ticker=ticker, compName=compName, compIndustry=compIndustry)

            App._create_quotes(tx, ticker, start_date, end_date)
        end = time.time()
        print("Completed Nodes for Stock: {}, Time Taken: {:.2f}min".format(ticker, (end-start)/60))

    @staticmethod
    def _search_compname(tx, ticker):
        """Return the stored company name of ``ticker`` as a lower-case search term.

        Dots and commas are removed and legal-form words (``inc``, ``co``, ...)
        are dropped. Suffixes are matched as whole words only, so a name such as
        "Coca-Cola Co" keeps "coca-cola" instead of losing every "co" substring.

        Raises:
            IndexError: if no Stock node exists for ``ticker``.
        """
        query = (
                "MATCH (s:Stock) "
                "WHERE s.ticker = $ticker "
                "RETURN s.compName"
            )
        names = tx.run(query, ticker=ticker).value()
        if not names:
            raise IndexError("No Stock node found for ticker {!r}".format(ticker))
        words = names[0].replace('.', '').replace(',', '').lower().split()
        kept = [word for word in words if word not in App._COMPANY_SUFFIXES]
        # If the name consists only of suffix words, keep it rather than return "".
        return ' '.join(kept) if kept else ' '.join(words)

    @staticmethod
    def _run_query(tx, query):
        """Run ``query``, dump the first returned column to ``query_results.csv`` and return it.

        Each value of the first column is converted with ``dict(value.items())``,
        so the query is expected to return nodes/relationships in that column.
        """
        records = tx.run(query).value()
        out_df = pd.DataFrame([dict(entity.items()) for entity in records])
        out_df.to_csv('query_results.csv')
        return out_df

    @staticmethod
    def _create_quotes(tx, ticker, start_date, end_date):
        """Fetch daily quotes for ``ticker`` and store them as Quote nodes.

        Days run from ``start_date`` up to, but not including, ``end_date``.
        A weekday quote whose volume exceeds twice the mean volume of the same
        weekday is tagged as an event (``event: 'True'``); for every event day the
        related articles and posts are created as well.
        """
        n_days = (end_date - start_date).days
        date_range = [start_date + dt.timedelta(days=offset) for offset in range(n_days)]
        api_key = iex_key
        with concurrent.futures.ThreadPoolExecutor(8) as executor:
            responses = list(executor.map(lambda day: App._pull_quote(ticker, day, api_key), date_range))
        responses = [quote for quote in responses if quote]

        cols = ['date', 'open', 'close', 'high', 'low', 'volume', 'ticker', 'day']
        q_df = pd.DataFrame(responses, columns=cols)
        if q_df.empty:
            return

        # Event days: volume more than double the mean of that weekday (Mon-Fri only).
        event_days = set()
        for day in App._WEEKDAYS[:5]:
            day_rows = q_df.loc[q_df['day'] == day]
            mean = day_rows['volume'].mean()
            event_days.update(day_rows.loc[day_rows['volume'] > (mean * 2), 'date'])

        exists_query = (
            "MATCH(q:Quote) "
            "WHERE q.date = $event_date AND q.ticker = $ticker "
            "RETURN q.date"
        )
        create_query = (
            "CREATE (q:Quote { ticker: $ticker, date: $event_date, open: $op, close: $close, high: $high, low: $low, volume: $volume, day: $day, event: $event_tag}) "
        )
        link_query = (
            "MATCH(q:Quote), (s:Stock) WHERE q.ticker = $ticker AND s.ticker = $ticker AND NOT (q)-[:PriceOf]->(s) "
            "CREATE (q)-[r:PriceOf]->(s)"
        )

        for row in q_df.itertuples(index=False):
            event_date = row.date
            is_event = event_date in event_days
            # The flag is stored as the strings 'True'/'False', as in existing data.
            event_tag = 'True' if is_event else 'False'

            if tx.run(exists_query, event_date=event_date, ticker=ticker).single() is None:
                tx.run(create_query, ticker=ticker, event_date=event_date, op=row.open, close=row.close,
                       high=row.high, low=row.low, volume=int(row.volume), day=row.day, event_tag=event_tag)
                tx.run(link_query, ticker=ticker)

            if is_event:
                # Create Article Nodes related to event
                App._create_articles(tx, ticker, event_date)

                # Create Post Nodes related to event
                App._create_posts(tx, ticker, event_date)

    @staticmethod
    def _pull_quote(ticker, i_date, api_key):
        """Fetch one day's quote from IEX.

        Returns ``[date, open, close, high, low, volume, TICKER, weekday_name]`` or
        ``None`` when the request fails or returns no data for that day. The
        request is authenticated with ``api_key`` (previously the parameter was
        ignored in favour of a global).
        """
        req = requests.get("https://cloud.iexapis.com/stable/stock/"
                           + ticker + "/chart/date/" + i_date.strftime("%Y%m%d") + "?chartByDay=true&token=" + api_key)
        if req.status_code != 200:
            return None
        payload = req.json()
        if len(payload) == 0:
            return None
        content = payload[0]
        return [content['date'], content['open'], content['close'], content['high'], content['low'],
                content['volume'], ticker.upper(), App._WEEKDAYS[i_date.weekday()]]

    @staticmethod
    def _create_user(tx, username):
        """Create a User node for ``username`` (no existence check)."""
        query = (
            "CREATE (p1:User { username: $username}) "
            "RETURN p1"
        )
        tx.run(query, username=username)

    @staticmethod
    def _check_user(tx, username):
        """Create the User node for ``username`` only if it does not exist yet."""
        query = (
            "MATCH (u:User) WHERE u.username = $username "
            "RETURN u"
        )
        if tx.run(query, username=username).single() is None:
            App._create_user(tx, username)

    @staticmethod
    def _create_topic(tx, name):
        """Create a Topic node called ``name`` only if it does not exist yet."""
        query = (
            "MATCH (t:Topic) "
            "WHERE t.name = $name "
            "RETURN t.name"
        )
        if tx.run(query, name=name).single() is None:
            query = (
                "CREATE (t:Topic { name: $name}) "
                "RETURN t"
            )
            tx.run(query, name=name)

    @staticmethod
    def _create_posts(tx, ticker, date):
        """Scrape tweets for ``ticker`` on ``date`` and store them as Post nodes.

        Near-duplicate tweets (cosine similarity of alphabetic tokens > 0.85) are
        dropped, each remaining tweet is linked to its author and to the Quote of
        its date, and finally the tweets are classified into a topic.
        """
        # Positional and label indexing must agree below, so normalise the index first.
        df = App._search_tweets(tx, ticker, date).reset_index(drop=True)

        # Remove duplicate tweets via stringmatching
        al_tok = sm.AlphabeticTokenizer()
        cos = sm.Cosine()
        tokens = [al_tok.tokenize(text.lower()) for text in df['content']]
        drop_set = set()
        for i in range(len(tokens)):
            for j in range(i + 1, len(tokens)):
                if j in drop_set:
                    continue  # already marked; re-checking cannot change the result
                if cos.get_raw_score(tokens[i], tokens[j]) > .85:
                    drop_set.add(j)
        df = df.drop(sorted(drop_set)).reset_index(drop=True)

        for row in df.itertuples(index=False):
            tweet_content = row.content
            post_date = row.date.strftime("%Y-%m-%d")
            tweet_id = int(row.tweet_id)
            username = row.username
            App._check_user(tx, username)

            # Check if that post already exists in AuraDB
            query = (
                "MATCH (p:Post) WHERE p.tweet_id = $tweet_id "
                "RETURN p"
            )
            if tx.run(query, tweet_id=tweet_id).single() is None:
                query = (
                    "MATCH (e:Quote {date: $post_date})-[]-(s:Stock {ticker: $ticker}) "
                    "MATCH (u:User) WHERE u.username = $username "
                    "CREATE (p1:Post { tweet_id: $tweet_id, content: $tweet_content, date: $post_date})-[r:RefersTo]->(e) "
                    "CREATE (u)-[:Posted]->(p1)"
                )
                tx.run(query, tweet_id=tweet_id, tweet_content=tweet_content, post_date=post_date,
                       ticker=ticker, username=username)
            else:
                # Post exists: make sure it is linked to a quote of this stock.
                query = (
                    "MATCH (p:Post)-[r:RefersTo]-(e:Quote)-[]-(s:Stock {ticker: $ticker}) WHERE p.tweet_id = $tweet_id "
                    "RETURN r"
                )
                if tx.run(query, tweet_id=tweet_id, ticker=ticker).single() is None:
                    query = (
                        "MATCH (p:Post) WHERE p.tweet_id = $tweet_id "
                        "MATCH (e:Quote {date: $post_date})-[]-(s:Stock {ticker: $ticker}) "
                        "CREATE (p)-[:RefersTo]->(e)"
                    )
                    tx.run(query, tweet_id=tweet_id, ticker=ticker, post_date=post_date)

        # Mirror _create_articles: only classify when there is something to classify.
        if len(df) > 0:
            App._create_topic_tweet(tx, df, ticker, date)

    @staticmethod
    def _search_tweets(tx, ticker, search_date):
        """Scrape top tweets mentioning the company of ``ticker`` on ``search_date``.

        ``search_date`` is a 'YYYY-MM-DD' string. Scraping stops once the
        enumeration index exceeds 100. Returns a DataFrame with the columns
        date, tweet_id, content, username, company, company_ticker and lang,
        de-duplicated by ``tweet_id`` and with a fresh 0..n-1 index.
        """
        search_term = App._search_compname(tx, ticker)
        sdate = dt.datetime.strptime(search_date, '%Y-%m-%d')
        edate = sdate + dt.timedelta(days=1)
        since_date = sdate.strftime("%Y-%m-%d")
        until_date = edate.strftime("%Y-%m-%d")

        tweet_keys = ["date", "tweet_id", "content", "username", "company", "company_ticker", "lang"]
        company_input_string = "{} since:{} until:{}".format(search_term, since_date, until_date)

        # Using TwitterSearchScraper to scrape data and append tweets to list
        tweets_list = []
        for i, tweet in enumerate(tscraper(company_input_string, top=True).get_items()):
            if i>100:
                break
            tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username, search_term, ticker, tweet.lang])

        tweets_df = pd.DataFrame(tweets_list, columns=tweet_keys)
        # Gets rid of duplicate tweets (by tweet_id) & keep only last instance
        return tweets_df.drop_duplicates(subset=['tweet_id'], keep='last').reset_index(drop=True)

    # Check if news source exists, and if not create the node
    @staticmethod
    def _create_news_source(tx, name):
        """Create the Source node called ``name`` only if it does not exist yet."""
        query = (
            "MATCH (s:Source) "
            "WHERE s.name = $name "
            "RETURN s.name"
        )
        if tx.run(query, name=name).single() is None:
            query = (
                "CREATE (s:Source { name: $name}) "
                "RETURN s"
            )
            tx.run(query, name=name)

    @staticmethod
    def _create_articles(tx, ticker, date):
        """Fetch news about ``ticker`` for ``date`` ('YYYY-MM-DD') and store relevant articles.

        An article is kept when its provider is in ``_NEWS_PROVIDERS`` and its
        title or description mentions the ticker or more than 30% of the company
        name's tokens. Kept articles are linked to their Source and to the Quote
        of their publication date, then classified into a topic.
        """
        art_date = dt.datetime.strptime(date, '%Y-%m-%d')
        compName = App._search_compname(tx, ticker)
        al_tok = sm.AlphabeticTokenizer()
        compName_tok = al_tok.tokenize(compName)
        ticker_tok = ticker.lower()

        def mentions_company(tokens):
            """True if ``tokens`` contain the ticker or >30% of the company-name tokens."""
            if ticker_tok in tokens:
                return True
            if not compName_tok:
                return False  # avoids dividing by zero for an empty company name
            hits = sum(1 for tok in compName_tok if tok in tokens)
            return hits / len(compName_tok) > 0.3

        response = App._pull_article(compName, art_date)
        if not response:
            return

        acols = ['art_id', 'url', 'content', 'date_published']
        art_rows = []
        for web_page in response:
            provider = web_page["provider"]["name"]
            if provider not in App._NEWS_PROVIDERS:
                continue

            title = web_page["title"]
            description = web_page["description"]
            title_tok = al_tok.tokenize(title.lower())
            descrip_tok = al_tok.tokenize(description.lower())
            if not (mentions_company(title_tok) or mentions_company(descrip_tok)):
                continue

            art_id = web_page["id"]
            url = web_page["url"]
            date_published = web_page["datePublished"][:10]

            App._create_news_source(tx, provider)

            # Check if that article already exists in AuraDB
            query = (
                "MATCH (a:Article) WHERE a.id = $art_id "
                "RETURN a"
            )
            if tx.run(query, art_id=art_id).single() is None:
                query = (
                    "MATCH (e:Quote)-[:PriceOf]->(:Stock {ticker: $ticker}) WHERE e.date = $date_published "
                    "MATCH (s:Source) WHERE s.name = $provider "
                    "CREATE (a:Article { id: $art_id, url: $url, title: $title, description: $description, date_published: $date_published})-[r:References]->(e) "
                    "CREATE (s)-[:Published]->(a)"
                )
                tx.run(query, art_id=art_id, url=url, title=title, description=description,
                       date_published=date_published, provider=provider, ticker=ticker)
            else:
                # Article exists: make sure it is linked to a quote of this stock.
                query = (
                    "MATCH (a:Article)-[r:References]-(e:Quote)-[]-(s:Stock {ticker: $ticker}) WHERE a.id = $art_id "
                    "RETURN r"
                )
                if tx.run(query, art_id=art_id, ticker=ticker).single() is None:
                    query = (
                        "MATCH (a:Article) WHERE a.id = $art_id "
                        "MATCH (e:Quote)-[:PriceOf]->(s:Stock {ticker: $ticker}) WHERE e.date = $date_published "
                        "CREATE (a)-[r:References]->(e)"
                    )
                    tx.run(query, art_id=art_id, ticker=ticker, date_published=date_published)

            art_rows.append([art_id, url, title + description, date_published])

        if art_rows:
            art_df = pd.DataFrame(art_rows, columns=acols)
            App._create_topic_article(tx, art_df, ticker, date)

    @staticmethod
    def _pull_article(compName, date):
        """Search the news API for ``compName`` articles published on ``date``.

        Returns the list of article dicts with near-duplicate titles removed
        (cosine similarity of 2-grams > 0.92), or ``0`` when the request fails or
        yields no articles.

        Raises:
            RuntimeError: if the ``RAPIDAPI_KEY`` environment variable is not set.
        """
        rapidapi_key = os.environ.get("RAPIDAPI_KEY")
        if not rapidapi_key:
            raise RuntimeError("Set the RAPIDAPI_KEY environment variable to query the news search API")

        URL = "https://rapidapi.p.rapidapi.com/api/search/NewsSearchAPI"
        HEADERS = {
            "x-rapidapi-host": "contextualwebsearch-websearch-v1.p.rapidapi.com",
            "x-rapidapi-key": rapidapi_key
        }

        querystring = {"q": compName,
                       "pageNumber": 1,
                       "pageSize": 40,
                       "autoCorrect": True,
                       "safeSearch": False,
                       "withThumbnails": True,
                       "fromPublishedDate": date.strftime("%m/%d/%Y"),
                       "toPublishedDate": (date+dt.timedelta(days=1)).strftime("%m/%d/%Y")}

        req = requests.get(URL, headers=HEADERS, params=querystring)
        if req.status_code != 200:
            return 0
        res = req.json()
        articles = res.get('value') if isinstance(res, dict) else None
        if not articles:
            return 0

        # Drop later articles whose title nearly repeats an earlier one.
        q_tok = sm.QgramTokenizer(qval = 2)
        cos = sm.Cosine()
        title_tokens = [q_tok.tokenize(article['title'].lower()) for article in articles]
        drop_set = set()
        for i in range(len(title_tokens)):
            for j in range(i + 1, len(title_tokens)):
                if j in drop_set:
                    continue
                if cos.get_raw_score(title_tokens[i], title_tokens[j]) > .92:
                    drop_set.add(j)

        return [art for i, art in enumerate(articles) if i not in drop_set]

    @staticmethod
    def _link_quote_to_topic(tx, topic_name, ticker, post_date, rel_type):
        """Ensure Topic ``topic_name`` exists and is linked from the Quote of ``ticker``/``post_date``.

        ``rel_type`` is the relationship type and must be one of
        ``_TOPIC_RELATIONS``; it is interpolated into the Cypher text because
        relationship types cannot be passed as query parameters.
        """
        if rel_type not in App._TOPIC_RELATIONS:
            raise ValueError("Unsupported topic relationship: {!r}".format(rel_type))

        params = {'ticker': ticker, 'post_date': post_date, 'topic_name': topic_name}
        query = (
            "MATCH (t:Topic) "
            "WHERE t.name = $topic_name "
            "RETURN t.name"
        )
        if tx.run(query, topic_name=topic_name).single() is None:
            query = (
                "MATCH (q:Quote {ticker: $ticker, date: $post_date}) "
                "CREATE (t:Topic {name: $topic_name}) "
                "CREATE (q)-[:" + rel_type + "]->(t)"
            )
            tx.run(query, **params)
            return

        query = (
            "MATCH (q:Quote {ticker: $ticker, date: $post_date})-[a:" + rel_type + "]-(t:Topic {name: $topic_name}) "
            "RETURN a"
        )
        if tx.run(query, **params).single() is None:
            query = (
                "MATCH (q:Quote {ticker: $ticker, date: $post_date}) "
                "MATCH (t:Topic {name: $topic_name}) "
                "CREATE (q)-[:" + rel_type + "]->(t)"
            )
            tx.run(query, **params)

    @staticmethod
    def _create_topic_tweet(tx, tdf, ticker, post_date):
        """Classify the tweets in ``tdf`` and link the Quote to the topic via ``TweetAbout``."""
        classifier = tlda.Topic_Classification(topic_file=1)
        topic_name = classifier.tweet_lda(tdf)
        App._link_quote_to_topic(tx, topic_name, ticker, post_date, 'TweetAbout')

    @staticmethod
    def _create_topic_article(tx, tdf, ticker, post_date):
        """Classify the articles in ``tdf`` and link the Quote to the topic via ``ArticleAbout``."""
        classifier = tlda.Topic_Classification(topic_file=0)
        topic_name = classifier.article_lda(tdf)
        App._link_quote_to_topic(tx, topic_name, ticker, post_date, 'ArticleAbout')


        

In [19]:
# Secrets are read from the environment so they are never stored in the notebook.
# The keys and passwords that used to be hardcoded in this cell were exposed and
# should be rotated.
iex_key = os.environ["IEX_API_KEY"]

# Neo4j Aura DB Credentials
uri = os.environ["NEO4J_URI"]  # e.g. "neo4j+s://<instance-id>.databases.neo4j.io"
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

if __name__ == "__main__":
    start_date = dt.datetime(month=1, day=1, year=2021)
    end_date = dt.datetime(month=11, day=30, year=2021)
    # 'F' was disabled in the original run and is left out here as well.
    tickers = ['TSLA', 'GM', 'STLA', 'HMC', 'RACE', 'TM', 'TTM', 'NIO']

    app = App(uri, user, password)
    try:
        for ticker in tickers:
            app.create_stock(ticker, start_date, end_date)
    finally:
        # Close the driver even if one of the loads fails part-way.
        app.close()

Making Nodes for Stock:  TSLA


2021-12-10 03:20:56,438 : ERROR : Failed to write data to connection IPv4Address(('1e76e017.databases.neo4j.io', 7687)) (IPv4Address(('34.66.78.163', 7687)))


Completed Nodes for Stock: TSLA, Time Taken: 5.46min
Making Nodes for Stock:  GM
Completed Nodes for Stock: GM, Time Taken: 5.32min
Making Nodes for Stock:  STLA
0.0136666665
Completed Nodes for Stock: STLA, Time Taken: 2.53min
Making Nodes for Stock:  HMC
0.017225
0.015666667000000002
Completed Nodes for Stock: HMC, Time Taken: 2.21min
Making Nodes for Stock:  RACE
Completed Nodes for Stock: RACE, Time Taken: 9.20min
Making Nodes for Stock:  TM
Completed Nodes for Stock: TM, Time Taken: 3.21min
Making Nodes for Stock:  TTM
Completed Nodes for Stock: TTM, Time Taken: 9.91min
Making Nodes for Stock:  NIO
Completed Nodes for Stock: NIO, Time Taken: 8.71min


In [None]:
# Secrets are read from the environment so they are never stored in the notebook.
# The keys and passwords that used to be hardcoded in this cell were exposed and
# should be rotated.
iex_key = os.environ["IEX_API_KEY"]

# Neo4j Aura DB Credentials
# NOTE: this cell originally pointed at a different Aura instance than the loading
# cell; set NEO4J_URI to the database you want to query.
uri = os.environ["NEO4J_URI"]
user = os.environ.get("NEO4J_USER", "neo4j")
password = os.environ["NEO4J_PASSWORD"]

if __name__ == "__main__":
    # Aura queries use an encrypted connection using the "neo4j+s" URI scheme
    app = App(uri, user, password)
    try:
        # NOTE(review): the loader in this notebook creates :Quote nodes, not :Event —
        # confirm the label matches the schema of the database being queried.
        res = app.run_query('MATCH (e:Event) -[:PriceOf]-> (s:Stock {ticker: "F"}) RETURN e, s')
    finally:
        app.close()

## Pulling tweets from Twitter

Run Python with local Runtime (for snscrape)

**In local anaconda environment run:**
<br>
pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git
<br>
pip install jupyter_http_over_ws
<br>
jupyter serverextension enable --py jupyter_http_over_ws
<br>
**Run the following command to open jupyter notebook, allowing colab connection**
<br>
jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0


In the top right of the Colab notebook, click the down arrow and choose "Connect to a local runtime", then paste the URL printed in the Anaconda prompt.

In [None]:
!python3 --version
# Make sure the local runtime reports Python 3.8.x

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
import os
import json
import concurrent.futures
import time
import datetime as dt

In [None]:
# Companies we want to pull, paired position-by-position with their cashtags.
# Other candidates: 'Toyota', 'Honda', 'Stellantis', 'Ford', 'Tesla', 'Ferrari NV'
# with '$TM', '$hmc', '$STLA', '$F', '$TSLA', '$RACE'.
companies = ['Ferrari']
company_tickers = ['$RACE']

# Search window: one UTC day.
sdate = dt.datetime(month = 11, day = 22, year = 2021, tzinfo=dt.timezone.utc)
edate = sdate + dt.timedelta(days=1)

since_date = sdate.strftime("%Y-%m-%d")
until_date = edate.strftime("%Y-%m-%d")

tweet_keys = ["date", "tweet_id", "content", "username", "company", "company_ticker", "lang"]

# Collect one frame per company and concatenate once at the end
# (DataFrame.append was removed in pandas 2.0 and copied the frame on every call).
company_frames = []
for company, company_ticker in zip(companies, company_tickers):
    company_input_string = "{} since:{} until:{}".format(company, since_date, until_date)

    # Using TwitterSearchScraper to scrape data and append tweets to list
    tweets_list = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(company_input_string, top=True).get_items()):
        if i>500:
            break
        tweets_list.append([tweet.date, tweet.id, tweet.content, tweet.user.username, company, company_ticker, tweet.lang])

    tweets_df = pd.DataFrame(tweets_list, columns = tweet_keys)

    # Gets rid of duplicate tweets (by tweet_id) & keep only last instance
    tweets_df = tweets_df.drop_duplicates(subset=['tweet_id'], keep='last')
    company_frames.append(tweets_df)

if company_frames:
    company_tweets_df = pd.concat(company_frames, ignore_index=True)
else:
    company_tweets_df = pd.DataFrame()

In [None]:
company_tweets_df

In [None]:
company_tweets_df.to_csv("Downloads/test_tweet_set.csv")

# Using Multithreading

In [None]:
#remove duplicate tweets

In [None]:
def pull_tweets(i, sdate, edate, companies, company_tickers):
    """Scrape tweets about the ``i``-th company between ``sdate`` and ``edate``.

    Two searches are run: one for the company name, keeping only tweets whose
    text contains "stock" or "price", and one for the cashtag, keeping every tweet.

    Args:
        i: index into ``companies`` / ``company_tickers``.
        sdate, edate: values placed after ``since:`` / ``until:`` in the search string.
        companies: sequence of company names.
        company_tickers: sequence of cashtags aligned with ``companies``.

    Returns:
        A list of ``[date, tweet_id, content, username, company, company_ticker]`` rows.
    """
    company = companies[i]
    company_ticker = company_tickers[i]
    company_input_string = "{} since:{} until:{}".format(company, sdate, edate)
    company_ticker_input_string = "{} since:{} until:{}".format(company_ticker, sdate, edate)

    def to_row(tweet):
        # tweet.user.username matches the other cells; tweet.username is the deprecated alias.
        return [tweet.date, tweet.id, tweet.content, tweet.user.username, company, company_ticker]

    tweets_list = [
        to_row(tweet)
        for tweet in sntwitter.TwitterSearchScraper(company_input_string).get_items()
        if "stock" in tweet.content or "price" in tweet.content
    ]
    tweets_list.extend(
        to_row(tweet)
        for tweet in sntwitter.TwitterSearchScraper(company_ticker_input_string).get_items()
    )

    return tweets_list

In [None]:
# Pull tweets for every company in parallel and time the run.
start = time.time()
companies = ['GM' ]
company_tickers = ['$GM']

since_date = '2021-01-13'
until_date = '2021-01-14'

company_tweets_df = pd.DataFrame()

tweets_list = []

tweet_keys = ["date", "tweet_id", "content", "username", "company", "company_ticker"]

# Each worker receives the index plus the FULL company/ticker lists. Passing the
# lists themselves as map() iterables handed every call a single string, so
# pull_tweets ended up indexing characters (e.g. 'GM'[0] == 'G').
with concurrent.futures.ThreadPoolExecutor(8) as executor:
    responses = list(executor.map(
        lambda idx: pull_tweets(idx, since_date, until_date, companies, company_tickers),
        range(len(companies)),
    ))

end = time.time()

print("Time Taken: ", (end-start))

In [None]:
# Rebuild the combined frame from the thread-pool results (one list of rows per
# company). Building it from scratch keeps the cell idempotent on re-run, and
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
response_frames = [pd.DataFrame(res, columns = tweet_keys) for res in responses]
if response_frames:
    company_tweets_df = pd.concat(response_frames, ignore_index=True)
else:
    company_tweets_df = pd.DataFrame(columns = tweet_keys)

# Gets rid of duplicate tweets (by tweet_id) & keep only last instance.
# This is applied to the collected tweets; previously it ran on a frame built
# from the always-empty `tweets_list`, so nothing was de-duplicated.
company_tweets_df = company_tweets_df.drop_duplicates(subset=['tweet_id'], keep='last').reset_index(drop=True)
tweets_df = company_tweets_df