In [2]:
import os
import time
import re
import glob
import pandas as pd
import requests

def get_blobs(bucket_name, folder_name):
    """Return the names of all blobs in a bucket whose name starts with a prefix.

    Args:
        bucket_name: Name of the Cloud Storage bucket to list.
        folder_name: Prefix passed to ``list_blobs``; only blobs whose name
            begins with it are returned.

    Returns:
        A list of blob names (strings), in the order the listing yields them.
    """
    client = storage.Client()
    target_bucket = client.bucket(bucket_name)
    return [blob.name for blob in target_bucket.list_blobs(prefix=folder_name)]

def upload_to_output(path, bucket_name, folder_name):
    """Upload a local file into ``folder_name`` of a Cloud Storage bucket.

    The destination object is named ``<folder_name>/<file name>``, where the
    file name is the part of ``path`` after its last ``/``.

    Args:
        path: Local path of the file to upload.
        bucket_name: Name of the destination bucket.
        folder_name: Prefix ("folder") placed in front of the file name.
    """
    target_bucket = storage.Client().bucket(bucket_name)
    file_name = path.rsplit('/', 1)[-1]
    destination = folder_name + '/' + file_name
    target_bucket.blob(destination).upload_from_filename(path)
    
def stack_tweets(new_records, existing_records):
    """Stack newly fetched tweets under the already captured ones.

    Prints a short report on how many ids in the new batch were already
    present, then returns both frames stacked. Rows are NOT de-duplicated.

    Args:
        new_records: DataFrame of freshly fetched tweets; needs an ``id`` column.
        existing_records: DataFrame of previously captured tweets; needs an
            ``id`` column.

    Returns:
        A new DataFrame with the rows of ``existing_records`` followed by the
        rows of ``new_records``.
    """
    # A set makes each membership test O(1) instead of scanning a list.
    captured_tokens = set(existing_records.id.unique())
    response_tokens = list(new_records.id.unique())

    x = len(captured_tokens)
    y = len(response_tokens)
    z = sum(1 for token in response_tokens if token in captured_tokens)

    # Whole-percent overlap; an empty response would otherwise divide by zero.
    rate = float(round(100 * z / y)) if y else 0.0

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    stacked = pd.concat([existing_records, new_records])

    print('Existing tokens: ', x, 'Response tokens: ', y, 'Already_captured: ', z, 'Efficiency Rate: ', str(rate)+'%')
    return stacked

def searchRTOfTweetIDQuery(tweet_id):
    """Build the search query string that matches retweets of ``tweet_id``."""
    return f'retweets_of_tweet_id:{tweet_id!s}'

    
class TwitterAPI():
    """Small wrapper around ``tweepy.Client`` for recent-tweet searches.

    Each search result is returned as a pandas DataFrame indexed by tweet id
    and is also cached in ``self.results`` under a ``"<timestamp>__<query>"``
    key so that several searches can be combined with ``aggregate_results``.
    """

    # Tweet attributes requested from the API on every search.
    TWEET_FIELDS = ['id', 'author_id', 'conversation_id', 'in_reply_to_user_id',
                    'created_at', 'text', 'public_metrics']
    # Page-size bounds for one request (the recent-search endpoint documents
    # max_results as 10..100).
    MIN_PAGE_SIZE = 10
    MAX_PAGE_SIZE = 100

    def __init__(self, bearer_token, consumer_key, consumer_secret, access_token, access_token_secret):
        """Create the underlying tweepy client and an empty result cache.

        The client is configured to return raw ``requests.Response`` objects
        and to wait when a rate limit is hit.
        """
        import requests
        self.client = tweepy.Client(bearer_token=bearer_token,
                                    consumer_key=consumer_key,
                                    consumer_secret=consumer_secret,
                                    access_token=access_token,
                                    access_token_secret=access_token_secret,
                                    return_type=requests.Response,
                                    wait_on_rate_limit=True)

        # Maps "<timestamp>__<query>" -> DataFrame returned by get_tweets.
        self.results = {}

    def _fetch_page(self, query, page_size, next_token=None):
        """Request one page of results; return ``(rows, next_token)``.

        ``rows`` is the list under the response's ``data`` key (empty when the
        key is absent, i.e. no matches) and ``next_token`` is the pagination
        token from ``meta`` or ``None`` when there is no further page.
        """
        params = {'query': query,
                  'tweet_fields': list(self.TWEET_FIELDS),
                  'max_results': page_size}
        if next_token is not None:
            params['next_token'] = next_token
        payload = self.client.search_recent_tweets(**params).json()
        rows = payload.get('data') or []
        token = (payload.get('meta') or {}).get('next_token')
        return rows, token

    def get_tweets(self, keyword, max_results):
        """Search recent tweets for ``keyword`` and return them as a DataFrame.

        Pages through the results with the API's ``next_token`` until
        ``max_results`` tweets were collected or no further page exists.

        Args:
            keyword: Query string sent to the recent-search endpoint.
            max_results: Maximum number of tweets to return.

        Returns:
            DataFrame indexed by tweet ``id`` with at most ``max_results``
            rows; an empty DataFrame (index named ``id``) when nothing
            matched. The same frame is stored in ``self.results``.
        """
        from datetime import datetime
        reference_id = str(datetime.now()) + '__' + keyword
        max_results = int(max_results)

        pages = []
        collected = 0
        next_token = None
        while collected < max_results:
            remaining = max_results - collected
            # Stay inside the page-size bounds; any surplus is trimmed below.
            page_size = max(self.MIN_PAGE_SIZE, min(self.MAX_PAGE_SIZE, remaining))
            rows, next_token = self._fetch_page(keyword, page_size, next_token)
            if rows:
                pages.append(pd.json_normalize(rows))
                collected += len(rows)
            # Stop on an empty page or when the API reports no further page,
            # otherwise the loop would re-request the same data forever.
            if not rows or next_token is None:
                break

        if pages:
            df = pd.concat(pages, ignore_index=True).head(max_results).set_index('id')
        else:
            df = pd.DataFrame(index=pd.Index([], name='id'))

        self.results[reference_id] = df
        return df

    def aggregate_results(self):
        """Return all cached search results stacked into one DataFrame.

        Returns an empty DataFrame when no search has been run yet.
        """
        frames = list(self.results.values())
        if not frames:
            return pd.DataFrame()
        # pd.concat builds a new frame, so callers cannot mutate the cache.
        return pd.concat(frames, ignore_index=False)

    
# IMPORT STATEMENTS
try:
    import tweepy
except ModuleNotFoundError:
    # Install into the interpreter that is actually running this code; a bare
    # `pip` on PATH may belong to a different Python environment. check_call
    # raises if the installation fails instead of continuing silently.
    import subprocess
    import sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tweepy'])
    import tweepy

import pandas as pd


# SCRIPT CONSTANTS
# Total number of results budgeted per run for one set of credentials.
MAX_HOUR_LIMIT = 500
# Search terms; the budget is split evenly between them.
KEYWORDS = ['@SouthwestAir', '#southwestairlines', '#southwestairline', '#airlines']
# Number of credential sets in use; scales the overall budget.
AUTHENTICATION_COUNT = 1
# Per-keyword result budget. Integer division keeps it an int, because a
# result count cannot be fractional.
MAX_RESULT_PULL = AUTHENTICATION_COUNT * MAX_HOUR_LIMIT // len(KEYWORDS)

# Connect to Google Cloud Storage.
from google.cloud import storage

# Instantiates a client (no credentials are passed explicitly here).
storage_client = storage.Client()

# Name of the bucket the collected CSV files are uploaded to.
bucket_name = "sw-airlines-data-hub"

# Gets a handle to that bucket via Client.bucket. This is not create_bucket,
# so the bucket is presumably expected to exist already - TODO confirm.
bucket = storage_client.bucket(bucket_name)





# SET TIME LABEL
# Label of the form "<year>-<month>-<day>_HOUR-<hour>" (no zero padding),
# taken from the local clock once at start-up and used in output file names.
from datetime import datetime
currentDateAndTime = datetime.now()
timeLabel = (
    f"{currentDateAndTime.year}-{currentDateAndTime.month}-"
    f"{currentDateAndTime.day}_HOUR-{currentDateAndTime.hour}"
)


# TWITTER CREDENTIALS
# Secrets must never be committed to source: they are read from environment
# variables instead. Any key that was previously hard-coded in this file
# should be treated as leaked and revoked/regenerated.
def _auth_from_env(suffix=''):
    """Build one credential dict from ``TWITTER_*`` environment variables.

    Reads ``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_ACCESS_TOKEN``, ``TWITTER_ACCESS_TOKEN_SECRET`` and
    ``TWITTER_BEARER_TOKEN``, each with ``suffix`` appended to the variable
    name (e.g. ``'_2'`` for a second account).

    Raises:
        RuntimeError: if any of the variables is unset or empty.
    """
    fields = ('consumer_key', 'consumer_secret', 'access_token',
              'access_token_secret', 'bearer_token')
    creds = {}
    missing = []
    for field in fields:
        variable = 'TWITTER_' + field.upper() + suffix
        value = os.environ.get(variable)
        if value:
            creds[field] = value
        else:
            missing.append(variable)
    if missing:
        raise RuntimeError('Missing Twitter credentials in environment: ' + ', '.join(missing))
    return creds


auth1 = _auth_from_env()

# Kept as module-level names for code that still refers to them directly.
consumer_key = auth1['consumer_key']
consumer_secret = auth1['consumer_secret']
access_token = auth1['access_token']
access_token_secret = auth1['access_token_secret']
bearer_token = auth1['bearer_token']

# IF YOU GET MORE THAN 1 AUTH: export a second set of variables with a suffix
# (e.g. TWITTER_CONSUMER_KEY_2, ...) and add _auth_from_env('_2') to the list.
auths = [auth1]

# One API wrapper per credential set.
apis = [
    TwitterAPI(creds['bearer_token'],
               creds['consumer_key'],
               creds['consumer_secret'],
               creds['access_token'],
               creds['access_token_secret'])
    for creds in auths
]

# COLLECT AND EXPORT
# For every keyword: query each API client, write everything fetched so far
# to a CSV named after the time label and keyword, and upload it to the bucket.
# NOTE(review): as before, each export is cumulative - it also contains the
# tweets fetched for earlier keywords in this run. Confirm this is intended.
fetched_frames = []
for k in KEYWORDS:
    # Exclude retweets from the search.
    query = k + ' -is:retweet'
    print(k)
    for api in apis:
        try:
            fetched_frames.append(api.get_tweets(query, 100))
        except Exception as err:
            # Keep going with the other clients/keywords, but show what failed.
            # `except Exception` lets Ctrl-C still interrupt the run.
            print('API error!', repr(err))

    if not fetched_frames:
        # Nothing has been fetched yet, so there is nothing to export.
        print('No tweets fetched yet; skipping export for', k)
        continue

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df = pd.concat(fetched_frames, ignore_index=False)

    file_label = timeLabel + '_' + k + '.csv'

    local_path = file_label
    df.to_csv(local_path)

    upload_to_output(local_path, bucket_name, 'data/source')
    print(local_path)

@SouthwestAir
2022-8-15_HOUR-0_@SouthwestAir
#southwestairlines
2022-8-15_HOUR-0_#southwestairlines
#southwestairline
API error!
API error!
2022-8-15_HOUR-0_#southwestairline
#airlines
2022-8-15_HOUR-0_#airlines
