## Scrape data from Twitter using Advanced search API (twitter_bot_scraper wrapper)

            ARGS
-t, --hashtags, nargs='*', help='Hashtags'  
-w, --key_words, nargs='*', help="Key words"  
-s, --start_date, help="Start date in format YYYY-MM-DD"  
-e, --end_date, help="End date in format YYYY-MM-DD"  
-l, --lang, default='en', help='Language'  
-c, --scroll_numb, default=100, help="Times to scroll the page down"  
-d, --sleep_time, default=1, help="Delay for page to download"  
-o, --output_fname, default='output.csv', help="Output file name"  

**Example section playground**

In [None]:
# Example run: hashtag "btc", key word "bitcoin", 2019-11-01 .. 2019-12-03,
# scroll the page down 10 times with a 2 s delay, write to ./output/test_output.csv.
%run twitter_bot.py -t btc -w bitcoin -s 2019-11-01 -e 2019-12-03 -c 10 -d 2 -o ./output/test_output.csv

In [None]:
import pandas as pd

# Load the example scrape and order the rows by their 'date' column
# before previewing the first few of them.
data = (
    pd.read_csv('./output/test_output.csv', encoding='utf-8')
      .sort_values(by='date')
)
data.head()

### Day-by-day scraping: generate day pairs

In [1]:
import datetime as dt

start_date = dt.date(2018, 12, 1)
end_date = dt.date(2019, 12, 1)

# Every calendar day from start_date through end_date (inclusive),
# formatted as "YYYY-MM-DD" strings.
date_range = [
    (start_date + dt.timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range((end_date - start_date).days + 1)
]

# Adjacent (day, next_day) pairs: one search window per day.
date_pairs = list(zip(date_range, date_range[1:]))

### Meet the Twitter bot scraper itself!

In [2]:

import random
import requests
import argparse
import pandas as pd
from tqdm import tqdm
from time import sleep
from datetime import datetime
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains


# =============================================================================================
#                                      PARSE
# =============================================================================================
def generate_query(key_words, hashtags, start_date, end_date, lang='en'):
    """Build the Twitter search URL for the given terms and date window.

    Key words are space-separated; hashtags are each prefixed with ``#``
    (``%23``) and joined with ``OR``. The date bounds are appended as
    ``since:``/``until:`` operators.

    Args:
        key_words: Iterable of plain search words; may be empty or None.
        hashtags: Iterable of hashtag names without the leading ``#``;
            may be empty or None.
        start_date: Lower date bound as a ``YYYY-MM-DD`` string.
        end_date: Upper date bound as a ``YYYY-MM-DD`` string.
        lang: Language code placed in the ``l=`` URL parameter.

    Returns:
        The complete search URL as a string.
    """
    # argparse with nargs='*' yields None when a flag is omitted, and an
    # empty hashtag list used to crash on hashtags[-1]; treat both as "none".
    terms = list(key_words or [])

    tags = list(hashtags or [])
    if tags:
        terms.append("%20OR%20".join("%23" + tag for tag in tags))

    terms.append("since%3A" + start_date)
    terms.append("until%3A" + end_date)

    # "%20" is the URL-encoded space separating the search terms.
    return "https://twitter.com/search?l=" + lang + "&q=" + "%20".join(terms) + "&src=typd"


def dowload_page(query, scroll_numb, delay=1, proxy_flag=False):
    """Load `query` in headless Firefox, scroll it down and save the HTML.

    The page is scrolled to the bottom up to `scroll_numb` times, sleeping
    `delay` seconds after each scroll so more results can load. Scrolling
    stops early once a scroll no longer increases the document height.
    The resulting page source is written to ``page.html`` in the current
    working directory.

    Args:
        query: URL to open.
        scroll_numb: Maximum number of scroll-to-bottom steps.
        delay: Seconds to sleep after each scroll.
        proxy_flag: If true, start Firefox with a proxy profile.

    Returns:
        0 on completion.
    """
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')

    if proxy_flag:
        # NOTE(review): get_proxy_profile, host and port are not defined in
        # the visible part of this notebook; this branch raises NameError
        # unless they are provided elsewhere. Confirm before using.
        profile = get_proxy_profile(host, port)
        driver = webdriver.Firefox(executable_path='./geckodriver', firefox_profile=profile, options=options)
    else:
        driver = webdriver.Firefox(executable_path='./geckodriver', options=options)

    try:
        driver.get(query)

        prev_h = driver.execute_script("return document.body.scrollHeight;")
        for _ in range(scroll_numb):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            sleep(delay)

            cur_h = driver.execute_script("return document.body.scrollHeight;")
            if cur_h == prev_h:
                # Nothing new was loaded by this scroll.
                break
            # Compare the next scroll against this height, not the initial one.
            prev_h = cur_h

        page_source = driver.page_source
    finally:
        # quit() ends the whole WebDriver session (browser and driver process),
        # also when loading or scrolling raised; close() only closes the window.
        driver.quit()

    # Write real HTML rather than the repr of a bytes object. Non-ASCII
    # characters are stored as numeric character references (&#NNNN;), so the
    # file is plain ASCII and reads back identically under any ASCII-compatible
    # encoding; an HTML parser turns the references back into characters.
    with open('page.html', 'w', encoding='ascii', errors='xmlcharrefreplace') as f:
        f.write(page_source)

    return 0


def __process_tweet(tweet):
    """Extract ``[timestamp, text, likes, retweets]`` from one stream item.

    Args:
        tweet: A parsed element (BeautifulSoup-style: supports
            ``.find(name, css_class)``, ``.text`` and ``['attr']`` access).

    Returns:
        A list ``[timestamp, text, likes, retweets]`` where `timestamp` is a
        naive ``datetime`` built from the epoch seconds in ``data-time``
        (UTC), `text` is the tweet text and the counts are ints; or ``None``
        when the item has no ``div.tweet`` or any expected element/attribute
        is missing or malformed.
    """
    tweet_div = tweet.find('div', 'tweet')
    if not tweet_div:
        return None

    try:
        # --- text
        text = tweet.find('div', 'js-tweet-text-container').find('p', 'tweet-text').text

        # --- time
        timestamp_epochs = int(tweet.find('span', '_timestamp')['data-time'])
        timestamp = datetime.utcfromtimestamp(timestamp_epochs)

        # --- likes, retweets (an empty count attribute means zero)
        action_div = tweet_div.find('div', 'ProfileTweet-actionCountList')
        counts = []
        for action in ('ProfileTweet-action--favorite', 'ProfileTweet-action--retweet'):
            raw = action_div.find('span', action) \
                            .find('span', 'ProfileTweet-actionCount')['data-tweet-stat-count']
            counts.append(int(raw or '0'))
        likes, retweets = counts
    except (AttributeError, KeyError, TypeError, ValueError, OverflowError):
        # AttributeError: .find()/.text on a missing (None) parent element
        # TypeError:      ['attr'] on a missing (None) element
        # KeyError:       element present but the attribute is absent
        # ValueError / OverflowError: non-numeric or out-of-range value
        # One malformed item must not abort the whole page; skip it.
        return None

    return [timestamp, text, likes, retweets]



def parse_page(path='page.html'):
    """Parse a saved search-results page into a list of tweet records.

    Args:
        path: HTML file to read; defaults to ``page.html`` in the current
            working directory.

    Returns:
        A list with one entry per ``li.js-stream-item`` element for which
        ``__process_tweet`` returned a record; items it rejects (returns
        ``None`` for) are skipped.
    """
    # Decode explicitly instead of relying on the platform default encoding;
    # UTF-8 also reads plain-ASCII files unchanged.
    with open(path, encoding='utf-8') as f:
        page = f.read()

    soup = BeautifulSoup(page, 'lxml')
    parsed = (__process_tweet(item) for item in soup.find_all('li', 'js-stream-item'))
    return [record for record in parsed if record]


# ===================================================================================================
#                                MAIN
# ===================================================================================================
def HOOY(key_words, hashtags, start_date, end_date,
         lang='en', scroll_numb=100, sleep_time=1, output_fname='output.csv'):
    """Scrape one search window and save the tweets as CSV.

    Builds the search URL with ``generate_query``, downloads the page with
    ``dowload_page``, parses it with ``parse_page`` and writes the result
    to `output_fname`.

    Args:
        key_words: Search words passed to ``generate_query``.
        hashtags: Hashtag names passed to ``generate_query``.
        start_date: Lower date bound, ``YYYY-MM-DD``.
        end_date: Upper date bound, ``YYYY-MM-DD``.
        lang: Language code for the search.
        scroll_numb: Maximum number of page scrolls.
        sleep_time: Seconds to wait after each scroll.
        output_fname: Destination CSV path.

    Returns:
        A DataFrame with columns ``date``, ``text``, ``likes``, ``retweets``.
    """
    import os  # local import: only needed for the output-directory check

    query = generate_query(key_words=key_words,
                           hashtags=hashtags,
                           start_date=start_date,
                           end_date=end_date,
                           lang=lang)

    dowload_page(query=query, delay=sleep_time, scroll_numb=scroll_numb)
    rows = parse_page()

    df = pd.DataFrame(rows, columns=["date", "text", "likes", "retweets"])

    # Create the target directory if needed, so a slow scrape is not lost to
    # a missing folder at write time. Non-path targets (e.g. file-like
    # buffers) are passed to to_csv untouched.
    if isinstance(output_fname, (str, os.PathLike)):
        out_dir = os.path.dirname(os.fspath(output_fname))
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    df.to_csv(output_fname, header=True, index=False, encoding='utf-8')

    return df

In [3]:
# Calculate approx time, in hours: 24 scrolls x 2 s sleep per scroll x 365 daily
# queries, divided by 60 twice to convert seconds to hours. The 24 and 2 match
# scroll_numb=24 and sleep_time=2 used by the scraping loop in this notebook.
# This counts sleep time only; browser start-up and page loads are not included.
24*2*365 / 60 / 60

4.866666666666666

In [4]:
from tqdm import tqdm

KEY_WORDS = ['bitcoin', 'btc']
HASHTAGS = ['bitcoin', 'btc', 'cryptocurrency', 'crypto', 'finance', 'news']

# Run one scrape per (day, next day) window. Each window is written to its
# own CSV and its DataFrame is kept for the combined export below.
frames = []
for day_idx, (since, until) in enumerate(tqdm(date_pairs)):
    daily_csv = './output/pairwise/btc_day{}.csv'.format(day_idx)
    frames.append(
        HOOY(
            KEY_WORDS,
            HASHTAGS,
            since,
            until,
            lang='en',
            scroll_numb=24,
            sleep_time=2,
            output_fname=daily_csv,
        )
    )

# Stack all daily frames and save the full-year dataset.
df_total = pd.concat(frames)
df_total.to_csv('./output/btc_year_full.csv', header=True, index=False, encoding='utf-8')

100%|██████████| 365/365 [6:53:28<00:00, 67.97s/it]     


In [5]:
# Rebuild the combined frame from the per-day results and write it out again.
df_total = pd.concat(frames)
df_total.to_csv(
    './output/btc_year_full.csv',
    index=False,
    header=True,
    encoding='utf-8',
)

In [6]:
# Summary statistics of the numeric columns of the combined dataset.
df_total.describe()

Unnamed: 0,likes,retweets
count,38266.0,38266.0
mean,26.330293,9.2376
std,111.684119,58.673785
min,0.0,0.0
25%,1.0,0.0
50%,3.0,1.0
75%,14.0,4.0
max,5368.0,5316.0
