In [27]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import snscrape.modules.twitter as sntwitter
import preprocessor as p

## Params

In [2]:
# Directory (relative to the notebook's working directory) that receives one
# CSV of scraped tweets per company.
tweets_dir = './data/tweets'

# Upper bound on the number of tweets collected for each company.
max_results = 1_000

In [3]:
# Dummy data: placeholder (company, IPO date) pairs used to exercise the
# scraping loop. Keeping each pair together guarantees that `companies` and
# `ipo_dates` stay aligned position-by-position.
_dummy_ipos = [
    ('Apple', '2020-06-01'),
    ('Coinbase', '2020-07-01'),
    ('Rivian', '2020-08-01'),
]

companies = np.array([name for name, _ in _dummy_ipos])
ipo_dates = [date for _, date in _dummy_ipos]

In [4]:
def company2file(company_name):
    """Turn a company name into a simple file-name stem.

    The name is lower-cased, every character that is not a letter, a digit
    or a space is dropped, and trailing whitespace is stripped. Leading
    spaces and interior spaces are preserved.

    Args:
        company_name: The company name as a string.

    Returns:
        The sanitised, lower-case stem (without any file extension).
    """
    kept_chars = []
    for ch in company_name.lower():
        if ch == ' ' or ch.isalpha() or ch.isdigit():
            kept_chars.append(ch)
    return "".join(kept_chars).rstrip()

In [30]:
# Scrape up to `max_results` English tweets per company from the seven days
# leading up to its IPO date and write them to `<tweets_dir>/<company>.csv`.

# Make sure the output directory exists; otherwise `to_csv` below raises on a
# fresh checkout where ./data/tweets has not been created yet.
os.makedirs(tweets_dir, exist_ok=True)

# Column order must match the order of the values appended per tweet below.
tweet_columns = [
    'date',
    'content',
    'replyCount',
    'retweetCount',
    'likeCount',
    'quoteCount',
]

# The context manager closes the bar even if scraping raises part-way through.
with tqdm(total=len(companies) * max_results) as pbar:
    for company, ipo_date in zip(companies, ipo_dates):
        pbar.set_description(f"Processing {company}")
        company_file = company2file(company)
        out_csv = os.path.join(tweets_dir, company_file + '.csv')

        # Search window: from seven days before the IPO date up to the IPO date.
        ipo_date = pd.to_datetime(ipo_date).date()
        week_before = ipo_date - pd.to_timedelta(7, unit='d')
        query = f'{company} since:{week_before} until:{ipo_date} lang:en'

        # Collect at most `max_results` tweets for this company.
        tweets_list = []
        for i, tweet in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
            if i >= max_results:
                break

            tweets_list.append([
                tweet.date,
                p.clean(tweet.content),  # tweet text after preprocessor cleaning
                tweet.replyCount,
                tweet.retweetCount,
                tweet.likeCount,
                tweet.quoteCount,
            ])
            pbar.update(1)

        # The bar's total budgets `max_results` per company; if the search
        # returned fewer tweets, advance by the shortfall so the bar still
        # reaches 100% at the end instead of stalling.
        pbar.update(max_results - len(tweets_list))

        tweets_df = pd.DataFrame(tweets_list, columns=tweet_columns)

        # Keep only the calendar date of each tweet (drop the time of day).
        tweets_df['date'] = pd.to_datetime(tweets_df['date']).dt.date
        tweets_df.to_csv(out_csv)

Processing Rivian: 100%|██████████| 3000/3000 [00:53<00:00, 56.19it/s]
