## Dependencies

In [2]:
import requests
import json
from math import ceil
from time import sleep, time
from datetime import datetime
import pandas as pd
import plotly.express as px
import networkx as nx
import os

# Credentials must not be stored in the notebook: the Twitter bearer token is
# read from the TWITTER_BEARER_TOKEN environment variable instead.
# Falls back to an empty string when the variable is unset; authenticated
# requests will then presumably be rejected by the API.
# NOTE(review): the token that used to be hard-coded here should be revoked.
BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', '')

## Functions and classes

### Twitter API access

In [3]:
#Error Class
class RateLimitError(Exception):
    """Raised by connect_to_endpoint when a request is answered with HTTP 429.

    Attributes:
        text: body of the rate-limited response, or None when none was given.
    """

    # Class-level default so that `RateLimitError.text` still resolves.
    text = None

    def __init__(self, text = None):
        # Forward the message so str(error) and tracebacks show it; with no
        # message the exception keeps empty args.
        if text is None:
            super().__init__()
        else:
            super().__init__(text)
        # Stored on the instance: assigning to the class would make every
        # RateLimitError share the text of the most recently created one.
        self.text = text

#### HEADERS TO BE USED IN REQUEST MODULE ####
def create_headers(bearer_token):
    """Build the HTTP headers that authenticate a request with a bearer token.

    Args:
        bearer_token: token inserted after the "Bearer " prefix.

    Returns:
        dict with a single "Authorization" entry.
    """
    return {"Authorization": f"Bearer {bearer_token}"}

#TODO:
def create_headers_multiple_bearer_tokens():
    """Placeholder for building headers from several bearer tokens.

    Not implemented yet.

    Returns:
        None.
    """
    return None

def connect_to_endpoint(url, headers = None, timeout = 30):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: full request URL, including any query string.
        headers: optional dict of HTTP headers (e.g. from create_headers).
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed JSON payload of a 200 response.

    Raises:
        RateLimitError: the server answered 429; carries the response text.
        Exception: any other non-200 status; the message contains the URL,
            the status code and the response text.
        requests.exceptions.RequestException: connection problems or a
            timeout, raised by requests itself.
    """
    response = requests.request("GET", url, headers = headers, timeout = timeout)
    if response.status_code == 429:
        raise RateLimitError(response.text)
    if response.status_code != 200:
        raise Exception(
            f"""Requested url: {url},
                Request returned an error: {response.status_code} {response.text},
                """)
    return response.json()

### Error handling (rate limit use)

In [4]:
#### HANDLE RATE LIMIT ERROR #####

#Find rate limits that have been used
def rate_limit_use(BEARER_TOKEN):
    """Return the rate limits that have been at least partly consumed.

    Queries the v1.1 rate_limit_status endpoint and keeps every entry whose
    'remaining' value differs from its 'limit'.

    Args:
        BEARER_TOKEN: token used to authenticate the status request.

    Returns:
        dict mapping endpoint name to its rate-limit entry as returned by
        the API. Entries are keyed by endpoint only, so an endpoint name
        appearing under two resource families keeps the later entry.
    """
    status = connect_to_endpoint(
        "https://api.twitter.com/1.1/application/rate_limit_status.json?",
        create_headers(BEARER_TOKEN),
    )
    return {
        endpoint: quota
        for family in status['resources'].values()
        for endpoint, quota in family.items()
        if quota['limit'] != quota['remaining']
    }

#Find the rate limits that are 0
def rate_limit_exhausted(BEARER_TOKEN):
    """Return the used rate limits that have no requests left.

    Args:
        BEARER_TOKEN: token used to authenticate the status request.

    Returns:
        dict mapping endpoint name to its rate-limit entry, restricted to
        entries whose 'remaining' value is 0.
    """
    exhausted = {}
    for endpoint, quota in rate_limit_use(BEARER_TOKEN).items():
        if quota['remaining'] == 0:
            exhausted[endpoint] = quota
    return exhausted

#Find the waiting time until all rate limits have been recovered
def rate_limit_recovery_time_max(BEARER_TOKEN):
    """Return the epoch time at which every exhausted rate limit has reset.

    Args:
        BEARER_TOKEN: token used to authenticate the status request.

    Returns:
        The largest 'reset' value among the exhausted rate limits. When no
        limit is reported as exhausted the current time is returned
        (nothing to wait for) instead of failing on an empty max().
    """
    exhausted = rate_limit_exhausted(BEARER_TOKEN)
    return max((quota['reset'] for quota in exhausted.values()), default = time())

#TODO: #Create new header from BearerToken if zero ratelimit at present
def not_exhausted_bearer_token(BEARER_TOKEN_LIST):
    """Placeholder for picking a bearer token whose rate limit is not used up.

    Not implemented yet; the argument is ignored.

    Args:
        BEARER_TOKEN_LIST: candidate bearer tokens.

    Returns:
        None.
    """
    return None


### Building Twitter API urls to get user information and followers

In [5]:
#### REQUEST TWITTER API ####

#GET metadata on specific user from either user name or user id
def url_lookup_user(usernames = None, ids = None):
    """Build the Twitter API v2 URL that looks up a user.

    Args:
        usernames: user name(s) placed in the `usernames` query parameter.
            Used only when `ids` is None.
        ids: user id placed in the URL path. Takes precedence over
            `usernames`.

    Returns:
        The lookup URL as a string. It ends with '&' because no extra user
        fields are requested at the moment.

    Raises:
        ValueError: neither `usernames` nor `ids` was given. Previously this
            silently produced a lookup for the literal name "None".
    """
    if ids is not None:
        id_or_user_name = f"{ids}?"
    elif usernames is not None:
        id_or_user_name = f"by?usernames={usernames}"
    else:
        raise ValueError("Either usernames or ids must be provided")

    user_fields = ""
    return f"https://api.twitter.com/2/users/{id_or_user_name}&{user_fields}"

#GET data on the accounts that the given user id follows
def url_get_following(user_id):
    """Build the Twitter API v2 URL listing the accounts a user follows.

    The URL asks for 1000 results per page and for the public_metrics,
    location, description and verified user fields.

    All metadata that can be requested (user fields, expansions etc.):
    https://developer.twitter.com/en/docs/twitter-api/users/follows/api-reference/get-users-id-following

    Args:
        user_id: id of the user whose followings are requested.

    Returns:
        The request URL as a string.
    """
    base = f"https://api.twitter.com/2/users/{user_id}/following"
    query_parts = [
        "max_results=1000",
        "user.fields=public_metrics,location,description,verified",
    ]
    return base + "?" + "&".join(query_parts)

### Connect to the Twitter API (request)

In [6]:
def connect_to_endpoint_pagenation(url_input, BEARER_TOKEN, max_results = 1000):
    """Request every page of a paginated endpoint and merge the pages.

    Pages are requested until a response carries no 'next_token' in its
    'meta' section. When a request is rate limited the function sleeps until
    the exhausted limits have reset and then retries the same page.

    Fixes compared with the earlier version:
      * the 'data' lists of all pages are concatenated; before, each page
        overwrote the previous one so only the last page was returned;
      * only RateLimitError is handled here; any other error propagates
        instead of being swallowed (which led to a NameError or to an
        endless loop on a stale response);
      * the wait is never negative and the rate-limit status is fetched once
        per rate-limit hit instead of twice.

    Args:
        url_input: URL of the first page; must already contain a query
            string because '&pagination_token=...' is appended to it.
        BEARER_TOKEN: token used for the requests and the rate-limit lookup.
        max_results: currently unused; kept so existing callers keep working.

    Returns:
        dict with the top-level keys of the last page (e.g. 'meta'); when the
        pages carried a 'data' list, 'data' holds the rows of all pages in
        request order. A page whose 'data' is not a list is merged as-is.

    Raises:
        Exception: whatever connect_to_endpoint raises for non-429 failures.
    """
    #auth
    headers = create_headers(BEARER_TOKEN)

    url = url_input
    json_response_gross = {}
    collected = []           # rows gathered from every page's 'data' list
    saw_data_list = False
    page_number = 1          # enumerator used in the progress message
    min_wait_seconds = 1.0   # floor for the wait so retries never spin

    while True:
        try:
            json_response = connect_to_endpoint(url, headers)
        except RateLimitError as e:
            #TODO: FUNCTION TO CHECK OTHER BEARER TOKENS AVALAIBLE SHOULD BE INSERTED HERE
            exhausted = rate_limit_exhausted(BEARER_TOKEN)
            reset_at = max((quota['reset'] for quota in exhausted.values()), default = time())
            seconds_to_wait = max(reset_at - time(), min_wait_seconds)
            minutes_to_wait = round(seconds_to_wait / 60, 2)
            time_to_resume = datetime.fromtimestamp(time() + seconds_to_wait).strftime("%d/%m/%Y %H:%M:%S")

            print(f"""{e.text} 
                Rate in question: {exhausted} 
                Waiting {minutes_to_wait} minutes before retrying ({time_to_resume})""")
            sleep(seconds_to_wait)
            continue

        meta = json_response.get('meta', {})
        data = json_response.get('data')
        if isinstance(data, list):
            # Accumulate rows; every other top-level key reflects the latest page.
            saw_data_list = True
            collected.extend(data)
            json_response_gross.update({key: value for key, value in json_response.items() if key != 'data'})
        else:
            json_response_gross.update(json_response)

        if 'next_token' in meta or 'previous_token' in meta:
            print(f'GET request succesfull ({page_number}/x): {url}')
        else:
            #No paging tokens at all: the results are on one page.
            print(f'GET request succesfull : {url}')

        if 'next_token' not in meta:
            break
        url = f"{url_input}&pagination_token={meta['next_token']}"
        page_number += 1

    if saw_data_list:
        json_response_gross['data'] = collected
    return json_response_gross

### Return dictionary of a user's friends'/followings' metadata based on username

In [7]:
def find_following_of_user(username, BEARER_TOKEN, max_results = None):
    """Fetch the accounts that `username` follows.

    The user name is first resolved to a user id with a lookup request; the
    id is then used to page through the user's followings.

    Args:
        username: user name of the account to inspect.
        BEARER_TOKEN: token used for both requests.
        max_results: forwarded to connect_to_endpoint_pagenation.

    Returns:
        Whatever connect_to_endpoint_pagenation returns for the followings URL.
    """
    # Resolve the user name to its id (first entry of the lookup result).
    lookup_response = connect_to_endpoint(
        url_lookup_user(username),
        create_headers(BEARER_TOKEN),
    )
    account_id = lookup_response['data'][0]['id']

    # Page through the accounts followed by that id.
    return connect_to_endpoint_pagenation(
        url_get_following(account_id),
        BEARER_TOKEN,
        max_results,
    )

## Data Collection

In [12]:
def snowball(seed, depth, regex_pattern_filter = '(?i)Wind'):
    """Snowball-sample followings, starting from a seed user.

    Level 0 expands the seed. Each later level expands the users found on
    the previous level whose description or username matches
    `regex_pattern_filter` and that have not been expanded before.

    Fixes compared with the earlier version:
      * results are collected per level; before, the per-level list was
        never reset, so earlier rows were appended to the output again on
        every level and re-used when building the next ball;
      * a failed user is reported and skipped instead of being silently
        swallowed by a bare `except`;
      * users without a description, and levels without any result, no
        longer break the selection of the next ball.

    Uses the module-level BEARER_TOKEN for all requests.

    Args:
        seed: username to start from (not filtered by the pattern).
        depth: number of levels to expand after the seed; 0 expands the
            seed only.
        regex_pattern_filter: regular expression searched for in the
            description and the username of each found user.

    Returns:
        Tuple (all_results, all_balls_list): the annotated user dicts of
        every level, and the usernames that were expanded, in order.
    """
    import re  # local import so this cell does not depend on editing the imports cell

    pattern = re.compile(regex_pattern_filter)

    #initiate variables
    all_balls_list = []
    already_expanded = set()   # same content as all_balls_list, for fast membership tests
    all_results = []

    #first ball is the seed, which is NOT filtered with regex pattern.
    current_ball = [seed]

    for depth_level in range(depth + 1):
        current_ball_results = []
        for v, x in enumerate(current_ball):
            print(f'{v+1} out of {len(current_ball)} users.')
            try:
                json_response_list = find_following_of_user(x, BEARER_TOKEN, max_results=1000)['data']
            except Exception as error:
                # Best effort: keep scraping the remaining users, but say what failed.
                print(f'Skipping {x}: {error!r}')
                continue

            for y in json_response_list:
                y['followed_by'] = x
                y['ball_depth'] = depth_level
                y['regex_pattern_filter_to_create_ball'] = regex_pattern_filter

            #store results
            current_ball_results.extend(json_response_list)

        #store results
        all_results.extend(current_ball_results)
        all_balls_list.extend(current_ball)
        already_expanded.update(current_ball)

        # Next ball: matching users of this level that were not expanded yet.
        matching = {
            user['username']
            for user in current_ball_results
            if 'username' in user
            and (pattern.search(user.get('description') or '') or pattern.search(user['username']))
        }
        current_ball = sorted(matching - already_expanded)

    return all_results, all_balls_list

### Script 

In [13]:
# Depth 0: only the seed account is expanded; the regex is recorded on each row
# but no further ball is scraped.
# NOTE(review): '(?i)Wind*' means "win" followed by zero or more "d", so it also
# matches e.g. "winner" — confirm that '(?i)Wind' was not intended.
# Despite its name, all_results_dict is the list of user dicts returned by snowball.
all_results_dict, all_balls_list = snowball('windwatchorg',0,regex_pattern_filter = '(?i)Wind*')
df_results = pd.DataFrame(all_results_dict)
# Persist the raw results to the relative path 'results' (pickle format).
df_results.to_pickle('results')
print(f'Scraped the users that these users follows: {all_balls_list}')

1 out of 1 users.
GET request succesfull : https://api.twitter.com/2/users/19776476/following?max_results=1000&user.fields=public_metrics,location,description,verified
Scraped the users that these users follows: ['windwatchorg']


## Data handling

In [None]:
#### SEED IS WINDWATCHORG

# Expand the 'public_metrics' dicts into one column per metric.
# Any metric columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not duplicate them.
public_metrics = df_results['public_metrics'].apply(pd.Series)
df_results = pd.concat(
    [df_results.drop(columns = public_metrics.columns, errors = 'ignore'), public_metrics],
    axis=1,
)

### Create a map

In [None]:
def gps_of_location(location):
    """Geocode a free-text location with the Google Geocoding API.

    Fixes compared with the earlier version:
      * the API key is read from the GOOGLE_API_KEY environment variable
        instead of being hard-coded in the notebook;
      * the query string is URL-encoded, so spaces, non-ASCII characters and
        '&' in the location can no longer corrupt the request;
      * missing values (None/NaN) and blank strings are not sent to the API.

    Args:
        location: free-text location, e.g. taken from a user profile.

    Returns:
        dict with the 'location' entry of the first result ('lat', 'lng')
        plus 'types_google_api_response' (the first result's 'types'), or
        the string 'No GPS coordinates found' when there is nothing to look
        up or the API status is not 'OK'.

    Raises:
        RuntimeError: GOOGLE_API_KEY is not set.
    """
    import os
    from urllib.parse import urlencode

    not_found = 'No GPS coordinates found'

    # Nothing to geocode: avoid spending API quota on empty input.
    if not isinstance(location, str) or not location.strip():
        return not_found

    google_API_key = os.environ.get('GOOGLE_API_KEY')
    if not google_API_key:
        raise RuntimeError('Set the GOOGLE_API_KEY environment variable to use the geocoding API')

    query = urlencode({'address': location, 'key': google_API_key})
    url = f"https://maps.googleapis.com/maps/api/geocode/json?{query}"
    json_response = connect_to_endpoint(url)
    if json_response['status'] == 'OK':
        best_match = json_response['results'][0]
        d = best_match['geometry']['location']
        d['types_google_api_response'] = best_match['types']
        return d
    return not_found

In [None]:
# Distinct profile locations; users without a location (NaN) are left out so
# that no geocoding request is spent on a missing value.
locations = df_results['location'].dropna().unique()

In [None]:
#### WARNING: do not re-run this cell casually — every location costs one
#### request against the limited free tier of the Google API.
gps = dict(zip(locations, map(gps_of_location, locations)))

In [None]:
# Keep successful lookups only: a failed lookup is a plain string, which would
# otherwise be broadcast into the lat/lng columns of the table.
gps_found = {place: result for place, result in gps.items() if isinstance(result, dict)}
df_gps = pd.DataFrame.from_dict(gps_found, orient = 'index')
# Inner merge: users whose location could not be geocoded are dropped from df.
df = pd.merge(df_results,df_gps, left_on = 'location', right_index = True)

In [None]:
# Plot from the merged frame and refer to its columns by name: the frame passed
# before (df_results) does not share rows with the Series taken from df.
fig = px.scatter_geo(df, lat = 'lat', lon = 'lng', hover_name = 'username')
fig.show()

## Fooling around

In [None]:
gps_of_location('Blog')

In [None]:
gps_of_location('copenhagen')

In [None]:
gps_of_location('Frøya, Norge')

In [None]:
locations

In [None]:
gps_of_location('aasdasd is a place')

In [None]:
df

In [None]:
# NOTE(review): gps_of_location_dev is not defined in the visible notebook —
# presumably an earlier name of gps_of_location; confirm, otherwise this cell
# fails with NameError on a fresh kernel.
gps_of_location_dev('blog')

In [None]:
df

In [None]:
# NOTE(review): df_first_ball is not defined in the visible notebook —
# presumably left over from a deleted cell; confirm which frame was meant.
df_first_ball['location'].unique()

In [None]:
# NOTE(review): gps_of_location_dev is not defined in the visible notebook —
# presumably an earlier name of gps_of_location; confirm before re-running.
gps_of_location_dev('worldwide')

In [None]:
# BEARER_TOKEN is a required positional argument of find_following_of_user.
find_following_of_user('stopthesethings', BEARER_TOKEN)

In [None]:
a = find_following_of_user('windwatchorg', BEARER_TOKEN, 1000)

In [None]:
# Build the frame once and expand 'public_metrics' into one column per metric
# (the second .apply(pd.Series) used before did not change the result).
following_frame = pd.DataFrame(a['data'])
b = pd.concat([following_frame, following_frame['public_metrics'].apply(pd.Series)], axis = 1)

In [None]:
b.sort_values('followers_count').tail(40)

In [35]:
a = find_following_of_user('MaryKayBarton',BEARER_TOKEN,1000)

This is the value of i = 1
GET request succesfull (1/2): https://api.twitter.com/2/users/485717967/following?max_results=1000&user.fields=public_metrics,location,description,verified
This is the value of i = 2
GET request succesfull (2/1): https://api.twitter.com/2/users/485717967/following?max_results=1000&user.fields=public_metrics,location,description,verified&pagination_token=7JVQ0P32ESL1CZZZ
This is the value of i = 3
GET request succesfull (3/1): https://api.twitter.com/2/users/485717967/following?max_results=1000&user.fields=public_metrics,location,description,verified&pagination_token=BHIH794GOP518ZZZ
This is the value of i = 4
GET request succesfull (4/3): https://api.twitter.com/2/users/485717967/following?max_results=1000&user.fields=public_metrics,location,description,verified&pagination_token=MJ2VSAUJAUT16ZZZ
While loop done!


In [28]:
a

{'data': [{'public_metrics': {'followers_count': 6878,
    'following_count': 7523,
    'tweet_count': 23618,
    'listed_count': 2},
   'id': '1300313886697152512',
   'description': 'Proud & Deplorable. End the Lockdowns. Resist The Great Reset. IFBAP. Donald Trump WON and 46 can kiss💋my ass 🇺🇸',
   'name': 'WE🇺🇸ARE🇺🇸MAGA',
   'verified': False,
   'username': 'WeAreMAGA'},
  {'public_metrics': {'followers_count': 298,
    'following_count': 958,
    'tweet_count': 2893,
    'listed_count': 2},
   'id': '764964662781829121',
   'description': 'Once a pharmacist Now full-time mom, content creator & occasional blogger https://t.co/VQl8Djgbtj one of my babies #MAGA "cause the world needs a strong America',
   'name': 'Freedom Lover',
   'location': 'Canada',
   'verified': False,
   'username': 'RealMe4Freedom'},
  {'public_metrics': {'followers_count': 52245,
    'following_count': 3974,
    'tweet_count': 27678,
    'listed_count': 480},
   'id': '15719232',
   'description': 'Senior 

In [1]:
# Scratch check: print url_string when it contains any of the candidates.
# url_string is a list, so this is element membership, not a substring test.
extensionsToCheck = ['a', 'b']
url_string = ['klask']
if not set(extensionsToCheck).isdisjoint(url_string):
    print(url_string)

In [23]:
a = connect_to_endpoint('https://api.twitter.com/2/users/485717967/following?max_results=1000&user.fields=public_metrics,location,description,verified&pagination_token=MJ2VSAUJAUT16ZZZ',create_headers(BEARER_TOKEN))

In [19]:
    print('nice')

nice


In [24]:
a

{'data': [{'public_metrics': {'followers_count': 61816,
    'following_count': 46324,
    'tweet_count': 13720,
    'listed_count': 875},
   'username': 'FreeStateNH',
   'id': '21779975',
   'name': 'The Free State 🦔',
   'verified': False,
   'description': 'There are two types of libertarians:\n\n1. Those okay with losing\n2. Those that live in New Hampshire\n\nhttps://t.co/sT5rDCS2bo',
   'location': 'New Hampshire'},
  {'public_metrics': {'followers_count': 5482,
    'following_count': 3331,
    'tweet_count': 4799,
    'listed_count': 109},
   'username': 'AmerComm',
   'id': '552744336',
   'name': 'American Commitment',
   'verified': False,
   'description': 'We restore and protect the American Commitment to free markets, economic growth, Constitutionally-limited government, property rights, and individual freedom.'},
  {'public_metrics': {'followers_count': 16120,
    'following_count': 15502,
    'tweet_count': 7370,
    'listed_count': 108},
   'username': 'ShelbyCountyGOP'

In [29]:
tokens = ['next_token','previous_token']

# Paging tokens are stored under the response's 'meta' key (that is where
# connect_to_endpoint_pagenation reads them), not at the top level of `a`.
if any(t in a.get('meta', {}) for t in tokens):
    print('nice')