In [45]:
import psycopg2
import sqlalchemy
import sqlalchemy.orm
import os
import yaml

In [7]:
def connect_to_snapshot(yaml_path):
    """Connect to the PostgreSQL database described by a YAML settings file.

    Args:
        yaml_path: Path to a YAML file that provides the keys ``username``,
            ``password``, ``host``, ``port`` and ``dbname``.

    Returns:
        A tuple ``(engine, conn, meta)``: the SQLAlchemy engine, a connection
        opened from it, and a ``MetaData`` object bound to the engine on which
        ``reflect()`` has already been called.

    Raises:
        KeyError: If one of the expected keys is missing from the settings.
    """
    # The context manager closes the file even when parsing raises.
    with open(yaml_path, "r") as config_file:
        # safe_load only builds plain Python objects and, unlike a bare
        # yaml.load(stream), does not depend on an implicit default Loader.
        db_info = yaml.safe_load(config_file)

    # NOTE: values are interpolated verbatim, so credentials containing
    # URL-reserved characters must already be escaped in the YAML file.
    url = "postgresql+psycopg2://{}:{}@{}:{}/{}".format(db_info["username"],
                                                        db_info["password"],
                                                        db_info["host"],
                                                        db_info["port"],
                                                        db_info["dbname"])
    engine = sqlalchemy.create_engine(url)
    conn = engine.connect()
    meta = sqlalchemy.MetaData(bind = engine)
    meta.reflect()
    return engine, conn, meta

In [23]:
# Connect to the database described in ml-snapshot.yml; `meta` comes back
# with the schema already reflected by connect_to_snapshot.
engine, conn, meta = connect_to_snapshot("ml-snapshot.yml")

In [101]:
# Print the name of every reflected table, in the order given by
# `meta.sorted_tables`.
for reflected_table in meta.sorted_tables:
    print(reflected_table)

account_setup_surveys
admins
bottom_lines
facebook_page_stats
facebook_pages
facebook_posts
insights
pghero_query_stats
quotes
reports
schema_migrations
subscription_plans
top_lines
twitter_competitors
twitter_statuses
twitter_subscriptions
twitter_user_stats
twitter_users
version_associations
versions
clients
competitors
facebook_competitors
facebook_subscriptions
identities
subscription_payments


In [102]:
# Describe the columns of the twitter_users table.  sqlalchemy.inspect() is
# the documented way to obtain an Inspector for an engine; it replaces the
# older Inspector.from_engine() constructor.
inspector = sqlalchemy.inspect(engine)
inspector.get_columns("twitter_users")

[{'autoincrement': False,
  'default': None,
  'name': 'id',
  'nullable': False,
  'type': BIGINT()},
 {'autoincrement': False,
  'default': None,
  'name': 'entity',
  'nullable': True,
  'type': JSONB(astext_type=Text())},
 {'autoincrement': False,
  'default': None,
  'name': 'created_at',
  'nullable': False,
  'type': TIMESTAMP()},
 {'autoincrement': False,
  'default': None,
  'name': 'updated_at',
  'nullable': False,
  'type': TIMESTAMP()},
 {'autoincrement': False,
  'default': 'true',
  'name': 'active',
  'nullable': True,
  'type': BOOLEAN()}]

In [103]:
# Table object for twitter_users, looked up by name in the reflected metadata.
TwitterUsers = meta.tables["twitter_users"]

In [104]:
from sqlalchemy.sql import select, func

# Count the rows of twitter_users by counting their `id` values.
user_count_query = select([func.count(TwitterUsers.c.id)])
user_count_result = conn.execute(user_count_query)
user_count_result.fetchall()

[(274,)]

In [106]:
# Fetch one `entity` document and list its top-level keys.  Only one row is
# needed, so the query is limited instead of selecting the whole table.
s = select([TwitterUsers.c.entity]).limit(1)
sample_user = conn.execute(s).fetchone()[0]
# A plain loop replaces the list comprehension that was used only for its
# print() side effects (and left a throwaway list of None values behind).
for key in sorted(sample_user.keys()):
    print(key)

contributors_enabled
created_at
default_profile
default_profile_image
description
entities
favourites_count
follow_request_sent
followers_count
following
friends_count
geo_enabled
has_extended_profile
id
id_str
is_translation_enabled
is_translator
lang
listed_count
location
name
notifications
profile_background_color
profile_background_image_url
profile_background_image_url_https
profile_background_tile
profile_banner_url
profile_image_url
profile_image_url_https
profile_link_color
profile_sidebar_border_color
profile_sidebar_fill_color
profile_text_color
profile_use_background_image
protected
screen_name
status
statuses_count
time_zone
url
utc_offset
verified


In [114]:
import tweepy

# Load the Twitter credentials.  The context manager closes the file even if
# parsing raises, and safe_load only builds plain Python objects.
with open("api-keys.yml", "r") as key_file:
    api_keys = yaml.safe_load(key_file)

# Authenticate with the consumer key pair, then attach the access token pair.
twitter_keys = api_keys["twitter"]
auth = tweepy.OAuthHandler(twitter_keys["consumer-key"],
                           twitter_keys["consumer-secret"])
auth.set_access_token(twitter_keys["access-token"],
                      twitter_keys["access-secret"])
api = tweepy.API(auth)

In [112]:
# Build a screen_name -> id lookup from the `entity` document of every row.
# NOTE(review): assumes every row has a non-null entity containing both
# "screen_name" and "id" -- confirm against the data.
entity_rows = conn.execute(select([TwitterUsers.c.entity]))
existing_twitter_users = {
    entity_row[0]["screen_name"]: entity_row[0]["id"]
    for entity_row in entity_rows
}
len(existing_twitter_users)

274

In [120]:
import time

# Pacing constants for the harvesting loop below.
# NOTE(review): 7 users per 15-minute pause is presumably chosen to stay
# within Twitter's request quota for the ids endpoints -- confirm.
RATE_LIMIT_WINDOW_SECONDS = 15 * 60
USERS_PER_WINDOW = 7
PROGRESS_EVERY = 50

following_ids = []
follower_ids = []

# Pause for one full window before the first request (presumably so the run
# starts with a fresh quota).
time.sleep(RATE_LIMIT_WINDOW_SECONDS)

# enumerate() replaces the hand-maintained counter; counting starts at 1 so
# the modulo checks fire after the 7th, 14th, ... user.
for i, twitter_id in enumerate(existing_twitter_users.values(), start=1):
    try:
        following_ids += api.friends_ids(twitter_id)
        follower_ids += api.followers_ids(twitter_id)
    except tweepy.TweepError as err:
        # Best effort: report the failing account and carry on.  Ids already
        # appended by the first call are kept if only the second call fails.
        print("{}: {!s}".format(err, twitter_id))
    if i % USERS_PER_WINDOW == 0:
        time.sleep(RATE_LIMIT_WINDOW_SECONDS)
    if i % PROGRESS_EVERY == 0:
        print("{!s} users processed".format(i))

Not authorized.: 297791394
50 users processed


KeyboardInterrupt: 

In [98]:
def create_screen_name_id_dict(id_list, client=None, batch_size=900,
                               window_seconds=15 * 60):
    """Map the screen name of every distinct user id to that id.

    Each distinct id is looked up once with ``client.get_user``.  After every
    ``batch_size`` lookups the function waits out whatever is left of the
    current ``window_seconds`` window before issuing the next lookup.

    Args:
        id_list: Iterable of user ids; duplicates are looked up only once.
        client: Object exposing ``get_user(user_id)`` whose result has a
            ``screen_name`` attribute.  Defaults to the notebook-level ``api``.
        batch_size: Positive number of lookups allowed per window.
        window_seconds: Length of one window, in seconds.

    Returns:
        A dict mapping ``screen_name`` to user id.

    Raises:
        Any exception raised by ``client.get_user`` propagates unchanged.
    """
    if client is None:
        client = api  # notebook-level tweepy client

    screen_name_dict = {}
    # The window starts once, before the first lookup, so that the elapsed
    # time covers the whole batch rather than only the latest request.
    window_start = time.time()
    for idx, user_id in enumerate(set(id_list)):
        # Pause only once a full batch has been issued (never before the very
        # first lookup) and only while more ids remain to be processed.
        if idx and idx % batch_size == 0:
            remaining = window_seconds - (time.time() - window_start)
            # time.sleep rejects negative values; skip the wait if the batch
            # already took longer than the window.
            if remaining > 0:
                time.sleep(remaining)
            window_start = time.time()
        screen_name_dict[client.get_user(user_id).screen_name] = user_id
    return screen_name_dict

In [None]:
# Resolve the ids collected in following_ids to a screen_name -> id mapping.
following = create_screen_name_id_dict(following_ids)

In [None]:
# Resolve the ids collected in follower_ids to a screen_name -> id mapping.
followers = create_screen_name_id_dict(follower_ids)