# DATA EXTRACTION NOTEBOOK

This notebook consists of four parts that we used to extract the data we were interested in:

   * Twitter Stream/Present Data
   * Twitter Historic Data
   * Reddit Stream/Present Data
   * Reddit Historic Data
  
We will explain each part separately under its title.

In [None]:
# DEPENDENCIES
import tweepy
import praw
import sqlite3
import datetime
import os
import sys
import requests
from IPython import display
import getpass
# import inspect

In [None]:
# TWITTER API AUTHENTICATION VARIABLES
# Credentials are read from the local `config` module instead of being
# hard-coded in the notebook.
from config import twitter_config

# Application (consumer) key pair.
tw_client_key, tw_client_secret = (
    twitter_config['CLIENT_KEY'],
    twitter_config['CLIENT_SECRET'],
)
# Access token pair of the resource owner.
tw_access_key, tw_access_secret = (
    twitter_config['RESOURCE_OWNER_KEY'],
    twitter_config['RESOURCE_OWNER_SECRET'],
)

In [None]:
# REDDIT API AUTHENTICATION VARIABLES
# Everything except the password is read from the local `config` module.
from config import reddit_config

# Application credentials.
rd_client_key, rd_client_secret = (
    reddit_config['CLIENT_KEY'],
    reddit_config['CLIENT_SECRET'],
)
# User agent string and account name used for the API session.
rd_user_agent, rd_user_name = (
    reddit_config['AGENT'],
    reddit_config['USER_NAME'],
)
# The password is asked for interactively when this cell runs.
rd_pw = getpass.getpass(prompt='Enter reddit password:')

## Twitter Stream Data
We extracted only 'pure' tweets, not retweets or quoted tweets, since we think that these reflect people's real opinions. We saved every tweet and its properties into a database. We also included only tweets that are written in English and that come from users who have 'USA' in their profile location.

In [None]:
# NOTE: this cell keeps running until it is interrupted.

# SQLite connection and cursor shared by the stream listener defined below.
conn = sqlite3.connect(database='data/db.sqlite3')
cur = conn.cursor()

class OurStreamListener(tweepy.StreamListener):
    """Stream listener that stores original tweets from 'USA' profiles.

    Uses the module-level ``conn`` / ``cur`` SQLite handles to persist rows
    and the IPython ``display`` helper to show progress.
    """

    def on_status(self, status):
        """Store one incoming status in the ``tweets`` table if it qualifies.

        A status is kept only when the author's profile location contains
        'USA' and the status is neither a retweet nor a quote tweet.
        Replies are not filtered out here. Returns ``None`` in every case.
        """
        # Profiles without a location cannot be matched against 'USA'.
        location = status.user.location
        if not location or 'USA' not in location:
            return

        # Retweets carry a `retweeted_status` attribute; quote tweets are
        # flagged through `is_quote_status`. Only original tweets are kept.
        if hasattr(status, 'retweeted_status') or status.is_quote_status:
            return

        # A truncated status keeps its complete text under
        # extended_tweet['full_text']; otherwise `text` is already complete.
        if status.truncated:
            text = status.extended_tweet['full_text']
        else:
            text = status.text

        cur.execute(
            """INSERT INTO tweets(
                tweet_id,
                tweet_created_at,
                tweet_text,
                user_name,
                user_created_at,
                profile_bio,
                followers,
                user_location)
                values(?,?,?,?,?,?,?,?)""",
            (
                status.id,
                status.created_at,
                text,
                status.user.screen_name,
                status.user.created_at,
                status.user.description,
                status.user.followers_count,
                location,
            ),
        )

        # Commit per tweet so that interrupting the stream loses nothing.
        conn.commit()

        # Show the id of the last stored tweet as a progress indicator.
        print(status.id)
        display.clear_output(wait=True)

def main():
    """Authenticate against Twitter and stream matching tweets to the listener.

    Opens a filtered stream for the keywords 'trump' and 'biden', restricted
    to English-language tweets, and hands every status to
    ``OurStreamListener``. The call to ``filter`` is expected to block until
    the stream ends or the cell is interrupted.
    """
    auth = tweepy.OAuthHandler(tw_client_key, tw_client_secret)
    auth.set_access_token(tw_access_key, tw_access_secret)

    stream_listener = OurStreamListener()
    stream = tweepy.Stream(auth=auth, listener=stream_listener, tweet_mode='extended')

    # `languages` makes the stream deliver English tweets only; without it
    # tweets in every language that mention the keywords are stored.
    stream.filter(track=['trump', 'biden'], languages=['en'])
    
if __name__ == '__main__':
    interrupted = False
    try:
        main()
    except KeyboardInterrupt:
        interrupted = True
    finally:
        # Release the database handle however the stream ended (interrupt,
        # error, or normal return) so the SQLite file is not left locked.
        conn.close()

    if interrupted:
        # On a manual interrupt, exit the program as before: if the
        # SystemExit raised by sys.exit() is caught, force the process down.
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)

## Twitter Historic Data
Since our Twitter developer accounts are limited to only 5000 historic tweets, we had to re-run this piece of code 4 times with different credentials in order to get as much data as possible. This didn't go as planned because the accounts provided by you couldn't set up custom application environments, which are needed for retrieving historic data from Twitter. <br> We only managed to get around 3k tweets.

In [None]:
# Database connection
conn = sqlite3.connect('data/db.sqlite3')
cur = conn.cursor()

# Twitter Authentication with tweepy
auth = tweepy.OAuthHandler(tw_client_key, tw_client_secret)
auth.set_access_token(tw_access_key, tw_access_secret)

api = tweepy.API(auth_handler=auth)

# querying history tweets - Capitol Storming Day - 06.01.2021
# The keywords are parenthesised because an implicit AND (space) binds
# tighter than OR: without the parentheses `lang:en` would only apply to
# 'biden'. Retweets can't be excluded here (-is:retweet is a premium feature).
history = tweepy.Cursor(api.search_full_archive,
                        environment_name='dev',
                        query='(trump OR biden) lang:en',
                        fromDate='202101062200',
                        toDate='202101062300').items(100) # 5000 max out

# Iterating over tweepy cursor and saving each status into db -> tweepy cursor makes pagination easy
counter = 1

try:
    for status in history:

        # A truncated status keeps its complete text under
        # extended_tweet['full_text']; otherwise `text` is already complete.
        if status.truncated:
            text = status.extended_tweet['full_text']
        else:
            text = status.text

        cur.execute(
            """INSERT INTO tweets(
                tweet_id,
                tweet_created_at,
                tweet_text,
                user_name,
                user_created_at,
                profile_bio,
                followers,
                user_location)
                values(?,?,?,?,?,?,?,?)""",
            (
                status.id,
                status.created_at,
                text,
                status.user.screen_name,
                status.user.created_at,
                status.user.description,
                status.user.followers_count,
                status.user.location,
            ),
        )

        # Progress indicator: number of tweets stored so far.
        print(counter)
        display.clear_output(wait=True)
        counter += 1
finally:
    # Historic requests are quota-limited, so keep whatever was fetched even
    # if the loop fails half-way, and always release the database handle.
    try:
        conn.commit()
    finally:
        cur.close()
        conn.close()

## Reddit Stream Data

For streaming Reddit data we used the PRAW library; it's pretty straightforward.

In [None]:
# NOTE: this cell keeps running until it is interrupted.

# Settings for the PRAW API object, collected in one place.
reddit_settings = {
    'client_id': rd_client_key,
    'client_secret': rd_client_secret,
    'password': rd_pw,
    'user_agent': rd_user_agent,
    'username': rd_user_name,
    'check_for_async': False,
}
reddit = praw.Reddit(**reddit_settings)

# Nothing is ever published from this notebook, so read-only mode is enough.
reddit.read_only = True

# SQLite connection and cursor used by the streaming loop below.
conn = sqlite3.connect(database='data/db.sqlite3')
cur = conn.cursor()

# STREAM
def main():
    """Stream new r/politics comments and store each one in the ``reddit`` table.

    Uses the module-level ``reddit`` client and the ``conn`` / ``cur`` SQLite
    handles. A comment that cannot be stored is reported and skipped so the
    stream keeps running.
    """
    counter = 1
    for comment in reddit.subreddit('politics').stream.comments():
        # `author` is None when the account behind the comment no longer
        # exists; store a placeholder instead of crashing on `.name`.
        author = comment.author.name if comment.author is not None else '[deleted]'

        # `created_utc` is a UNIX epoch value, so it has to be converted.
        # NOTE(review): fromtimestamp() yields a naive datetime in the local
        # timezone of the machine, not UTC - confirm this is intended.
        created_at = datetime.datetime.fromtimestamp(comment.created_utc)

        try:
            cur.execute(
                """INSERT INTO reddit(
                    id,
                    author,
                    comment,
                    score,
                    pinned,
                    created_at) values(?,?,?,?,?, ?)""",
                (
                    comment.id,
                    author,
                    comment.body,
                    comment.score,
                    comment.stickied,
                    created_at,
                ),
            )
            conn.commit()
        except sqlite3.Error as e:
            # Exceptions have no `.text` attribute; print the exception itself.
            print('Error:', e)
        else:
            # Progress indicator: number of comments stored so far.
            print(counter)
            display.clear_output(wait=True)
            counter += 1
        
if __name__ == '__main__':
    interrupted = False
    try:
        main()
    except KeyboardInterrupt:
        interrupted = True
    finally:
        # Release the database handle however the stream ended (interrupt,
        # error, or normal return) so the SQLite file is not left locked.
        conn.close()

    if interrupted:
        # On a manual interrupt, exit the program as before: if the
        # SystemExit raised by sys.exit() is caught, force the process down.
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)

## Reddit Historic Data
For historic Reddit data we had to use the pushshift.io API endpoints. <br>
We defined a function that can get a maximum of 100 entries per request; we then loop over the timeframe we want to scrape and call that function once for each minute in it.

In [None]:
def load_results(lower_timestamp, upper_timestamp, subreddit, size):
    """Fetch one batch of comments from Pushshift and store it in SQLite.

    Args:
        lower_timestamp: Value sent as the ``after`` query parameter.
        upper_timestamp: Value sent as the ``before`` query parameter.
        subreddit: Name of the subreddit to search.
        size: Number of comments to request.

    Returns:
        None. Any error is printed and swallowed so that a caller looping
        over many time windows can continue with the next one.
    """
    api_url = "https://api.pushshift.io/reddit/search/comment/"
    # Let requests build and escape the query string.
    query = {
        'after': lower_timestamp,
        'before': upper_timestamp,
        'sort_type': 'score',
        'sort': 'desc',
        'subreddit': subreddit,
        'size': size,
    }

    try:
        # A timeout keeps one stalled request from blocking the whole run.
        with requests.get(api_url, params=query, timeout=30) as req:
            data = req.json()['data']

        conn = sqlite3.connect('data/db.sqlite3')
        try:
            # `with conn` only wraps the transaction (commit on success,
            # rollback on error), so the whole batch is committed once; the
            # connection itself is closed explicitly in `finally`.
            with conn:
                cur = conn.cursor()
                for entry in data:
                    comment_txt = entry['body']
                    cur.execute(
                        """INSERT INTO reddit(
                            id,
                            author,
                            comment,
                            score,
                            pinned,
                            created_at) values(?,?,?,?,?, ?)""",
                        (
                            entry['id'],
                            entry['author'],
                            comment_txt,
                            entry['score'],
                            entry['stickied'],
                            datetime.datetime.fromtimestamp(entry['created_utc']),
                        ),
                    )

                    # Progress indicator: start of the last stored comment.
                    print(comment_txt[:20])
                    display.clear_output(wait=True)
        finally:
            conn.close()

    except Exception as e:
        print('Error text:', e)

subreddit = 'politics'
size = 100
# NOTE(review): despite its name, `capitol_day` is set to 21/01/2021 rather
# than 06/01/2021 - confirm which date is intended.
capitol_day = datetime.datetime.strptime('21/01/2021', "%d/%m/%Y")

# Offsets, in minutes before midnight 21/01/2021, of the one-minute window
# that is scraped next. We wanted 6 hours, hence 360 minutes, since the
# function gets up to 100 entries for each one-minute timeframe.
delta_limit_up = 360
delta_limit_low = 359

for i in range(360): # one iteration per minute, moving towards capitol_day
    window_start = capitol_day - datetime.timedelta(minutes=delta_limit_up)
    window_end = capitol_day - datetime.timedelta(minutes=delta_limit_low)

    # NOTE(review): `capitol_day` is a naive datetime, so timestamp()
    # interprets it in the local timezone of the machine - confirm.
    previous_timestamp = int(window_start.timestamp())
    current_timestamp = int(window_end.timestamp())

    load_results(previous_timestamp, current_timestamp, subreddit, size)

    # Slide the window one minute forward.
    delta_limit_up, delta_limit_low = delta_limit_up - 1, delta_limit_low - 1

print('DONE')