In [24]:
import os
from datetime import datetime
from time import sleep

import numpy as np
import pandas as pd
import twitter

# Twitter API credentials. Supply them through environment variables so real
# keys are never saved into (or shared along with) the notebook. Each value
# falls back to an empty string, which is what the notebook used before.
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')
OAUTH_TOKEN = os.environ.get('TWITTER_OAUTH_TOKEN', '')
OAUTH_TOKEN_SECRET = os.environ.get('TWITTER_OAUTH_TOKEN_SECRET', '')

# Note the argument order: the access token pair comes first, then the consumer pair.
auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

# Shared client used by every helper function defined below.
twitter_api = twitter.Twitter(auth=auth)

In [25]:
def get_friends_followers_count(screen_name):
    """Look up how many accounts a user follows and is followed by.

    Calls ``users.show`` through the module-level ``twitter_api`` client and
    reads two fields from the response.

    Parameters
    ----------
    screen_name : str
        Screen name passed to ``users.show``.

    Returns
    -------
    tuple
        ``(friends_count, followers_count)`` exactly as found in the response.
    """
    profile = twitter_api.users.show(screen_name=screen_name)

    # Order matters to callers: friends first, followers second.
    return tuple(profile[field] for field in ('friends_count', 'followers_count'))

In [26]:
# get followers_ids

# relationship: followers or following
def get_ids(screen_name, relationship, count):
    """Collect the user IDs on one side of an account's follow relationships.

    Pages through ``followers.ids`` or ``friends.ids`` (via the module-level
    ``twitter_api`` client), requesting 5000 IDs per call and following the
    ``next_cursor`` value returned by each response.

    Parameters
    ----------
    screen_name : str
        Account whose relationships are fetched.
    relationship : str
        ``'followers'`` or ``'following'``.
    count : int
        Expected number of IDs. It caps the number of pages requested at
        ``ceil(count / 5000)``.

    Returns
    -------
    list
        Every ID gathered, in the order the pages were returned. Empty when
        ``relationship`` is not recognised or when ``count`` is 0.
    """
    max_i = int(np.ceil(count / 5000))

    print('getting ids for account: {}, relationship: {}'.format(screen_name, relationship))
    print('total iterations: {}'.format(max_i))
    print('count: {}'.format(count))

    # Validate once, before any request is made, so a typo is reported even
    # when count is 0 and the loop below never runs.
    if relationship not in ('followers', 'following'):
        print('get_ids: improper relationship; killing process')
        return []

    ids = []
    next_cursor = -1  # -1 requests the first page (Twitter cursoring convention)

    for i in range(max_i):  # the number of iterations needed to pull the full dataset

        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        print('{}/{} - {} - sleeping 60 seconds'.format(i+1, max_i, ts))

        if relationship == 'followers':
            search_results = twitter_api.followers.ids(screen_name=screen_name, count=5000, cursor=next_cursor)
        else:
            search_results = twitter_api.friends.ids(screen_name=screen_name, count=5000, cursor=next_cursor)

        # Accumulate as we go: a plain list is all the caller needs, and it
        # stays valid (empty) when no page was fetched at all.
        ids.extend(search_results['ids'])
        next_cursor = search_results['next_cursor']

        # A cursor of 0 means there is nothing left to page through.
        if next_cursor == 0:
            break

        sleep(60)  # for twitter usage limits; comment out if you are only scraping tiny networks

    return ids

In [27]:
def get_users_df(screen_name, relationship, count):
    """Fetch profile details for every follower or friend of an account.

    Gets the ID list from ``get_ids`` and resolves it through ``users.lookup``
    (via the module-level ``twitter_api`` client) in batches of 100 IDs,
    pausing one second between batches.

    Parameters
    ----------
    screen_name : str
        Account whose network is being pulled.
    relationship : str
        ``'followers'`` or ``'following'``; forwarded to ``get_ids``.
    count : int
        Expected number of related accounts; forwarded to ``get_ids``.

    Returns
    -------
    pandas.DataFrame or bool
        One row per user returned by ``users.lookup``, restricted to the
        columns screen_name, location, description, followers_count,
        friends_count, created_at and statuses_count. ``False`` when there
        are no IDs to look up or when no batch could be retrieved.
    """
    id_list = get_ids(screen_name, relationship, count)
    if len(id_list) == 0:
        return False

    batch_size = 100
    max_i = int(np.ceil(len(id_list) / batch_size))

    print('getting userlist for account: {}, relationship: {}'.format(screen_name, relationship))
    print('total iterations: {}'.format(max_i))

    users = []

    for i in range(max_i):

        ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
        print('{}/{} - {}'.format(i+1, max_i, ts))

        # Slicing past the end of a list is safe, so the final (short) batch
        # needs no special case.
        lookup = id_list[i * batch_size:(i + 1) * batch_size]

        try:
            batch = list(twitter_api.users.lookup(user_id=lookup))
        except Exception as exc:
            # Best effort: skip the batch but say why, so a rate-limit or auth
            # problem is not mistaken for bad IDs. KeyboardInterrupt is not an
            # Exception subclass, so the run can still be interrupted.
            print('bad batch of IDs, so continuing to the next...')
            print('  reason: {!r}'.format(exc))
        else:
            users.extend(batch)

        sleep(1)  # for twitter api limits

    # Every batch failed: there is nothing to build a frame from.
    if not users:
        return False

    keep_cols = ['screen_name', 'location', 'description', 'followers_count', 'friends_count', 'created_at', 'statuses_count']

    return pd.DataFrame(users)[keep_cols]

In [28]:
def get_twitter_df(screen_name):
    """Build one table describing an account's followers and friends.

    Reads the account's counts with ``get_friends_followers_count``, pulls
    both sides of its network with ``get_users_df`` and stacks the results.

    Parameters
    ----------
    screen_name : str
        Account to scrape; stored in the ``source_account`` column.

    Returns
    -------
    pandas.DataFrame
        Columns: source_account, relationship, screen_name, location,
        description, followers_count, friends_count, statuses_count,
        created_at. ``relationship`` is ``'followed by'`` for followers and
        ``'following'`` for friends. The frame is empty (columns only) when
        neither side of the network produced any rows.
    """
    friends_count, followers_count = get_friends_followers_count(screen_name)

    followers_df = get_users_df(screen_name, 'followers', followers_count)
    following_df = get_users_df(screen_name, 'following', friends_count)

    columns = ['source_account', 'relationship', 'screen_name', 'location', 'description', 'followers_count',
               'friends_count', 'statuses_count', 'created_at']

    frames = []
    for users_df, label in ((followers_df, 'followed by'), (following_df, 'following')):
        # get_users_df returns False rather than a frame when it has nothing
        # to report (e.g. an account with no followers); skip that side.
        if not isinstance(users_df, pd.DataFrame):
            continue
        # assign() returns a new frame, leaving the helper's result untouched.
        frames.append(users_df.assign(relationship=label))

    # pd.concat refuses an empty list, so return an empty, correctly shaped frame.
    if not frames:
        return pd.DataFrame(columns=columns)

    df = pd.concat(frames)
    df['source_account'] = screen_name

    return df[columns]

The above code sets up the connection to the Twitter API and defines the functions that pull follower/friend accounts as fast as the API allows.

The API allows 15 calls, each returning up to 5000 follower/friend IDs, every 15 minutes. So, I have a 60 second sleep in place to prevent going over the limit. If you are only pulling tiny networks, you can comment that line out and you will get results faster. If you are pulling both large and small networks, I recommend leaving it in place. If you go over the limit, the code will not die gracefully. You will see the original Twitter error about going over the usage limits. This is intentional. Try to stay under the limit, or Twitter may ban your API keys.

Using the above functionality is one line of code:

get_twitter_df('account_name')

Like so:

In [17]:
df = get_twitter_df('cityofhillsboro')

getting ids for account: cityofhillsboro, relationship: followers
total iterations: 2
count: 6941
1/2 - 2020-07-26 22:37:01.765 - sleeping 60 seconds
2/2 - 2020-07-26 22:37:02.087 - sleeping 60 seconds
getting userlist for account: cityofhillsboro, relationship: followers
total iterations: 70
1/70 - 2020-07-26 22:37:02.350
2/70 - 2020-07-26 22:37:04.124
3/70 - 2020-07-26 22:37:05.881
4/70 - 2020-07-26 22:37:07.661
5/70 - 2020-07-26 22:37:09.529
6/70 - 2020-07-26 22:37:11.360
7/70 - 2020-07-26 22:37:13.115
8/70 - 2020-07-26 22:37:14.804
9/70 - 2020-07-26 22:37:16.575
10/70 - 2020-07-26 22:37:18.528
11/70 - 2020-07-26 22:37:20.377
12/70 - 2020-07-26 22:37:22.097
13/70 - 2020-07-26 22:37:23.954
14/70 - 2020-07-26 22:37:25.795
15/70 - 2020-07-26 22:37:27.570
16/70 - 2020-07-26 22:37:29.399
17/70 - 2020-07-26 22:37:31.236
18/70 - 2020-07-26 22:37:32.975
19/70 - 2020-07-26 22:37:34.718
20/70 - 2020-07-26 22:37:36.535
21/70 - 2020-07-26 22:37:38.396
22/70 - 2020-07-26 22:37:40.243
23/70 - 202

The above is doing two things. It is collecting the IDs and then the account information for all followers of the account, and then it repeats the process for friends. In network terms, the relationships are directed as follows:

Followers follow the account (followers -> account)
The account follows its friends (account -> friends)

Now we can investigate the data a little to see what is in there. If you want to collect additional data, print the 'search_results' variable. There's a lot more data in there. I've chosen what helps me, but there is a lot more available.

In [22]:
df.head(1)

Unnamed: 0,source_account,relationship,screen_name,location,description,followers_count,friends_count,statuses_count,created_at
0,cityofhillsboro,followed by,DawgPdx,,,0,375,0,Sat Nov 23 22:10:53 +0000 2019


In [23]:
df.shape

(6895, 9)

Source account is the account that we scraped. Relationship is the direction of the relationship ("followed by" or "following"). Screen name is the account on the other side of the relationship. Location is where they have posted that they reside. Description contains the details associated with the account. The rest is self-explanatory.

Please use this software as a force for good, not evil. 

Enjoy.