In [9]:
# importing libraries/packages needed
import requests
import os
import json
import pprint
import time
import pandas as pd
import numpy as np
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Module-level VADER analyzer instance, created once and reused by
# score_sentiment() for every tweet.
analyzer = SentimentIntensityAnalyzer()

# Pretty-printer with a 4-space indent (not used in the visible part of this file).
pp = pprint.PrettyPrinter(indent=4)

# Read the API token from the BEARER_TOKEN environment variable so the secret
# never has to be pasted into (and saved with) the notebook. When the variable
# is not set, the original placeholder is used, so existing workflows that
# edit this line by hand keep working.
bearer_token = os.environ.get("BEARER_TOKEN", "BEARER_TOKEN_GOES_HERE")

# Twitter API v2 full-archive search endpoint.
search_url = "https://api.twitter.com/2/tweets/search/all"

# Optional search parameters: start_time, end_time, since_id, until_id,
# max_results, next_token, expansions, tweet.fields, media.fields,
# poll.fields, place.fields, user.fields

# ---------------------------------------------------------------------------
# 1) NAME THE FILES HERE FIRST
# ---------------------------------------------------------------------------
raw_data = "bitcoin_210531.json"
filename = "bitcoin_210531_final.xlsx"

# ---------------------------------------------------------------------------
# 2) CHANGE THE DAY (AND HOUR BOUNDS) HERE
# ---------------------------------------------------------------------------
select_year, select_month, select_day = 2021, 5, 31
start_hour, end_hour = 0, 23

# ---------------------------------------------------------------------------
# 3) CHANGE THE KEYWORD HERE
# ---------------------------------------------------------------------------
# Global query parameters shared by every request. The two time bounds are
# left empty here; build_params() fills them in for each hour.
default_query_params = {
    "query": "bitcoin -is:retweet lang:en place_country:us has:geo",
    "tweet.fields": "created_at,author_id,text,geo",
    "expansions": "geo.place_id",
    "place.fields": "place_type",
    "max_results": 500,
    "start_time": "",
    "end_time": "",
}

def create_headers(bearer_token):
    """Return the request headers carrying the bearer token.

    The result is a dict with a single "Authorization" entry whose value is
    "Bearer " followed by the given token.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

def connect_to_endpoint(url, headers, params):
    """Send a GET request to ``url`` and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        headers: HTTP headers to send (see create_headers()).
        params: Query-string parameters for the request.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: with ``(status_code, response_text)`` as its arguments
            when the response status is anything other than 200.
    """
    # Request the URL that was passed in. Previously the ``url`` argument was
    # ignored and the module-level ``search_url`` was always used instead.
    response = requests.request("GET", url, headers=headers, params=params)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

def build_params(year, month, day, hour, query_params):
    """Set the one-hour search window on ``query_params`` and return it.

    The window runs from ``HH:00:00.000Z`` to ``HH:59:59.999Z`` on the given
    date. Month, day and hour are zero-padded so the timestamps are
    well-formed RFC 3339 / ISO 8601 values (``2021-05-31T00:00:00.000Z``
    rather than ``2021-5-31T0:00:00.000Z``).

    Args:
        year, month, day, hour: Date and hour of the window; ints or numeric
            strings.
        query_params: Dict of query parameters. It is updated IN PLACE
            ("start_time" and "end_time" are overwritten); callers in this
            file rely on that, so no copy is made.

    Returns:
        The same ``query_params`` object that was passed in.
    """
    date_part = "{:04d}-{:02d}-{:02d}".format(int(year), int(month), int(day))
    hour_part = "{:02d}".format(int(hour))

    query_params["start_time"] = "{}T{}:00:00.000Z".format(date_part, hour_part)
    query_params["end_time"] = "{}T{}:59:59.999Z".format(date_part, hour_part)
    return query_params

# State codes per region. Any code not listed here is treated as "South".
_WEST_STATES = frozenset({
    "WA", "OR", "CA", "ID", "NV", "MT", "WY", "UT", "AZ", "CO", "NM", "AK", "HI",
})
_MIDWEST_STATES = frozenset({
    "ND", "SD", "NE", "KS", "MN", "IA", "MO", "WI", "IL", "MI", "IN", "OH",
})
_NORTHEAST_STATES = frozenset({
    "PA", "NY", "NJ", "CT", "RI", "MA", "VT", "NH", "ME",
})


def _region_for_state(state_code):
    """Map a state code to "West", "Midwest", "Northeast" or (default) "South"."""
    if state_code in _WEST_STATES:
        return "West"
    if state_code in _MIDWEST_STATES:
        return "Midwest"
    if state_code in _NORTHEAST_STATES:
        return "Northeast"
    return "South"


def add_region(tweetz, placeDict):
    """Tag city-level tweets with their state and region.

    For every tweet whose ``geo.place_id`` resolves to a place of type
    'city', the state code is taken from the part of the place's
    ``full_name`` after the first ", " (e.g. "Seattle, WA" -> "WA"), and the
    tweet gets two new keys: 'state' and 'region'.

    Tweets are skipped (not returned) when they have no usable geo
    information: no 'geo' / 'place_id', a place_id missing from
    ``placeDict``, a place that is not a city, or a ``full_name`` without a
    ", <state>" part. Previously the last three cases raised
    KeyError / IndexError and aborted the whole run.

    Args:
        tweetz: List of tweet dicts. Kept tweets are modified in place.
        placeDict: Dict mapping place_id to the place dict.

    Returns:
        A new list containing only the tweets that were tagged.
    """
    all_tweets = []
    for tweet in tweetz:
        geo = tweet.get('geo')
        if not geo:
            continue

        place = placeDict.get(geo.get('place_id'))
        if place is None or place.get('place_type') != 'city':
            continue

        # The state code is the second comma-separated part of the full name.
        splitnames = place.get('full_name', '').split(', ')
        if len(splitnames) < 2:
            continue
        state_code = splitnames[1]

        tweet['state'] = state_code
        tweet['region'] = _region_for_state(state_code)
        all_tweets.append(tweet)
    return all_tweets


def add_dummy(tweetz):
    """Attach the regional dummy variables W, MW and NE to every tweet.

    Each tweet dict (which must already carry a 'region' key) is updated in
    place: 'W' is 1 for "West", 'MW' is 1 for "Midwest", 'NE' is 1 for
    "Northeast", and each is 0 otherwise. Any other region therefore has all
    three dummies at 0.

    Returns a new list holding the same tweet dicts, in the same order.
    """
    # (dummy column, region that switches it on), in the order they are added
    dummy_columns = (("W", "West"), ("MW", "Midwest"), ("NE", "Northeast"))

    flagged = []
    for record in tweetz:
        current_region = record['region']
        for column, region_name in dummy_columns:
            record[column] = 1 if current_region == region_name else 0
        flagged.append(record)

    return flagged

# format data for pandas dataframe
def build_dataframe(all_tweets):
    """Flatten the tweet dicts into a pandas DataFrame.

    One row is produced per tweet, with the columns author_id, created_at,
    geo (the tweet's geo.place_id), id, region, state, W, MW, NE and text.
    """
    column_names = ['author_id', 'created_at', 'geo', 'id', 'region', 'state', 'W', 'MW', 'NE', 'text']

    def as_row(record):
        # Values in the same order as column_names; 'geo' holds the place id.
        return [
            record['author_id'],
            record['created_at'],
            record['geo']['place_id'],
            record['id'],
            record['region'],
            record['state'],
            record['W'],
            record['MW'],
            record['NE'],
            record['text'],
        ]

    rows = [as_row(record) for record in all_tweets]
    return pd.DataFrame(rows, columns=column_names)

# clean the tweets
def remove_pattern(input_txt, pattern):
    """Return ``input_txt`` with every match of the regex ``pattern`` removed.

    The pattern is applied directly with re.sub. The previous version first
    collected the matched texts and then used each matched text as a new
    regex, which misbehaved whenever a match contained regex metacharacters
    (e.g. '.', '$', '(') and when one match was a prefix of another
    ("@bob" vs "@bobby").

    Args:
        input_txt: Text to clean.
        pattern: Regular expression (string or compiled) to delete.

    Returns:
        The cleaned string.
    """
    return re.sub(pattern, '', input_txt)

# Patterns applied, in this order, to every tweet text.
_RETWEET_HANDLE_RE = re.compile(r"RT @[\w]*:")        # retweet prefix (RT @xxx:)
_HANDLE_RE = re.compile(r"@[\w]*")                    # twitter handles (@xxx)
_URL_RE = re.compile(r"https?://[A-Za-z0-9./]*")      # URL links (httpxxx)
_NON_LETTER_RE = re.compile(r"[^a-zA-Z#]")            # anything but letters and '#'


def _clean_one(text):
    """Clean a single tweet text (see clean_tweets for the steps)."""
    text = _RETWEET_HANDLE_RE.sub('', text)
    text = _HANDLE_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # Special characters, numbers and punctuation become a space each.
    return _NON_LETTER_RE.sub(' ', text)


def clean_tweets(all_tweets):
    """Clean tweet texts and return them as a NumPy array of strings.

    Steps, applied to each text in order:
      1. remove retweet prefixes ("RT @xxx:"),
      2. remove twitter handles ("@xxx"),
      3. remove URL links ("http..."),
      4. replace special characters, numbers and punctuation (everything
         except letters and '#') with a space.

    NOTE: step 4 used to be a literal substring replace of the text
    "[^a-zA-Z]" (through the private ``np.core.defchararray`` path), so it
    never changed anything. It is now a real regex substitution, and '#' is
    kept as the step's description states. Cleaned texts therefore differ
    from earlier runs.

    Args:
        all_tweets: Sequence of strings (list, NumPy array or pandas Series).
            An empty sequence is allowed.

    Returns:
        A NumPy string array with the same shape as the input.
    """
    texts = np.asarray(all_tweets, dtype=str)
    cleaned = [_clean_one(text) for text in texts.ravel().tolist()]
    return np.array(cleaned, dtype=str).reshape(texts.shape)

def score_sentiment(df):
    """Score the sentiment of every text in ``df['text']``.

    Each text is passed once to the module-level ``analyzer``'s
    ``polarity_scores`` and the four values it returns are collected.

    The texts are iterated by position, so the function also works when the
    DataFrame index is not the default 0..n-1 range (the previous
    ``df['text'][i]`` lookup was label-based and raised KeyError in that case).

    Args:
        df: DataFrame with a 'text' column.

    Returns:
        A list with one dict per row, in row order, with the keys
        "Compound", "Positive", "Negative" and "Neutral".
    """
    scores = []
    for text in df['text']:
        # One analyzer call per tweet instead of one call per score component.
        polarity = analyzer.polarity_scores(text)
        scores.append({"Compound": polarity["compound"],
                       "Positive": polarity["pos"],
                       "Negative": polarity["neg"],
                       "Neutral": polarity["neu"]})
    return scores

In [10]:
def main():
    """Collect one day of tweets hour by hour, enrich them and export to Excel.

    For every hour from ``start_hour`` to ``end_hour`` (inclusive) on the
    configured date, one search request is sent. The returned tweets and
    places are accumulated, then the tweets are tagged with state/region and
    regional dummies, cleaned, scored for sentiment and written to an Excel
    file named ``filename``.

    NOTE(review): only one request is made per hour, so at most
    ``max_results`` tweets per hour are collected; further pages (the
    ``next_token`` parameter listed among the optional params) are not
    followed. Confirm this is acceptable for busier keywords.
    """
    headers = create_headers(bearer_token)

    tweetz = []
    placeDict = {}  # keys = place_id, values = place
    # Honour the configured start_hour (the loop used to always start at 0).
    for hour in range(start_hour, end_hour + 1):
        # Build and execute the query for this hour.
        query_params = build_params(select_year, select_month, select_day, hour, default_query_params)
        json_response = connect_to_endpoint(search_url, headers, query_params)

        # Check the result count. If there are no results, there is no
        # "data" to process for this hour.
        result_count = json_response["meta"]["result_count"]
        print("Hour", hour, ":", result_count, "tweets")

        if result_count > 0:
            # Add this hour's tweets to the running list.
            tweetz.extend(json_response["data"])

            # Add new places to the places dictionary; setdefault keeps the
            # first entry seen for a place_id. "includes" is read defensively
            # so a response without places does not abort the run.
            for place in json_response.get("includes", {}).get("places", []):
                placeDict.setdefault(place['id'], place)

        time.sleep(1)  # Sleep for 1 second between requests

    print("Total tweets:", len(tweetz))
    print("Total places:", len(placeDict))

    tweets_with_region = add_region(tweetz, placeDict)
    tweet_with_dummies = add_dummy(tweets_with_region)
    df_tweetz = build_dataframe(tweet_with_dummies)

    # use clean function on tweet text
    df_tweetz['text'] = clean_tweets(df_tweetz['text'])

    # score sentiment for dataset
    scored_tweets = score_sentiment(df_tweetz)
    # put the scores in their own dataframe and combine it with the original one
    sentiments_score = pd.DataFrame.from_dict(scored_tweets)
    df_final = df_tweetz.join(sentiments_score)
    print(df_final.head(10))

    df_final.to_excel(r'/Users/Linde/Library/Mobile Documents/com~apple~CloudDocs/Documents/Master/ESMT/01_Class/M5/Thesis/Data/Curated_Tweets/' + filename, index = False)

main()

200
Hour 0 : 11 tweets
200
Hour 1 : 7 tweets
200
Hour 2 : 6 tweets
200
Hour 3 : 8 tweets
200
Hour 4 : 9 tweets
200
Hour 5 : 5 tweets
200
Hour 6 : 4 tweets
200
Hour 7 : 3 tweets
200
Hour 8 : 2 tweets
200
Hour 9 : 2 tweets
200
Hour 10 : 5 tweets
200
Hour 11 : 3 tweets
200
Hour 12 : 9 tweets
200
Hour 13 : 7 tweets
200
Hour 14 : 12 tweets
200
Hour 15 : 13 tweets
200
Hour 16 : 11 tweets
200
Hour 17 : 6 tweets
200
Hour 18 : 12 tweets
200
Hour 19 : 7 tweets
200
Hour 20 : 10 tweets
200
Hour 21 : 7 tweets
200
Hour 22 : 10 tweets
200
Hour 23 : 8 tweets
Total tweets: 177
Total places: 92
             author_id                created_at               geo  \
0            410850631  2021-05-31T00:31:34.000Z  c3f37afa9efcf94b   
1             30585609  2021-05-31T00:29:04.000Z  28db2dbc4240f0b2   
2             20536137  2021-05-31T00:20:11.000Z  d1280141e5f979cf   
3            626152908  2021-05-31T00:15:12.000Z  e09538b2e39d94df   
4             22319823  2021-05-31T00:08:50.000Z  00e9226863a6e5a4