In [14]:
from pprint import pprint
import json
import csv
import tweepy
import yaml

from pathlib import Path
from time import sleep
import progressbar

In [92]:
# Set up cache directories. Tweets and users are cached as individual JSON
# files underneath these two folders.
tweet_cache_dir = Path('./__cache__/tweets/')
user_cache_dir = Path('./__cache__/users/')

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate is_dir() test.
tweet_cache_dir.mkdir(parents=True, exist_ok=True)
user_cache_dir.mkdir(parents=True, exist_ok=True)

In [93]:
# Set up tweepy
class _TwitterCredentials():

    def __init__(self):
        with open('../conference-documentation/credentials.yml') as f: self._ = yaml.load(f)

    def __getitem__(self, i):
        return self._[i]

# Load the stored API keys, then build an authenticated tweepy client that
# waits (and reports it) whenever Twitter's rate limit is reached.
twitter_credentials = _TwitterCredentials()

auth = tweepy.OAuthHandler(
    twitter_credentials['consumer_key'],
    twitter_credentials['consumer_secret'],
)
auth.set_access_token(
    twitter_credentials['access_token'],
    twitter_credentials['access_token_secret'],
)
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [97]:
def _write_json(path, payload):
    """Write ``payload`` to ``path`` as JSON, replacing any existing content."""
    with Path(path).open("w") as cache_file:
        json.dump(payload, cache_file)


def _count_tsv_rows(tsv_path):
    """Return the number of data records (header excluded) in a TSV file."""
    # Count with the csv reader rather than physical lines so that the header
    # and quoted multi-line fields do not inflate the progress-bar total.
    with Path(tsv_path).open("r", newline="") as tsv_file:
        return sum(1 for _row in csv.DictReader(tsv_file, delimiter='\t'))


def _cache_tweet_from_twitter(id_str, tweet_cache):
    """Fetch a tweet from Twitter and cache it together with its author.

    The tweet's embedded user object is replaced by the bare user id in the
    tweet cache file; the full user object is written to the user cache
    unless a file for that user already exists.

    Raises:
        tweepy.TweepError: propagated from ``api.get_status`` when the tweet
            cannot be retrieved.
    """
    live_tweet = api.get_status(id_str, tweet_mode="extended")
    tweet_json = live_tweet._json  # raw payload held by the tweepy model
    tweet_json['json_source'] = 'Twitter'

    user_json = tweet_json['user']
    tweet_json['user'] = user_json['id']
    _write_json(tweet_cache, tweet_json)

    user_cache = user_cache_dir / str(user_json["id"])
    if not user_cache.is_file():
        user_json['json_source'] = 'Twitter'
        _write_json(user_cache, user_json)


def _cache_tweet_from_tags(tags_tweet, tweet_cache, error):
    """Cache the TAGS version of a tweet that Twitter would not return.

    The TAGS record is stored with the API error message under ``'error'``.
    The author is then looked up on Twitter and cached; if that lookup fails
    too, the user cache file records only the error message.
    """
    tags_tweet['error'] = str(error)
    _write_json(tweet_cache, tags_tweet)

    user_id = tags_tweet["user"]["id_str"]
    if not user_id:
        # An empty id would make the cache path resolve to the user cache
        # directory itself, and opening that for writing raises.
        return

    user_cache = user_cache_dir / str(user_id)
    if user_cache.is_file():
        return

    try:
        live_user = api.get_user(user_id)
    except tweepy.TweepError as user_error:
        _write_json(user_cache, {"error": str(user_error)})
        return

    user_json = live_user._json
    # remove the user's latest status
    user_json.pop('status', None)
    user_json['json_source'] = 'Twitter'
    _write_json(user_cache, user_json)


def expand_tags_tsv(file):
    """Populate the tweet and user caches from one or more TAGS ``.tsv`` sheets.

    For every row whose tweet is not yet cached, the tweet is requested from
    Twitter. On success the live tweet and its author are cached; on a
    ``tweepy.TweepError`` the TAGS row itself is cached with the error text
    and the author is looked up separately.

    Args:
        file: Path to a single ``.tsv`` file, or to a directory whose
            ``*.tsv`` files are all processed (in sorted order).

    Raises:
        RuntimeError: if ``file`` is neither an existing file nor a directory.
    """
    source = Path(file)
    if source.is_file():
        tsv_paths = [source]
    elif source.is_dir():
        tsv_paths = sorted(source.glob("*.tsv"))
    else:
        raise RuntimeError(f"Cannot interpret passed argument: {file}")

    for tsv_path in tsv_paths:
        row_total = _count_tsv_rows(tsv_path)
        if not row_total:
            # Header-only or empty sheet: nothing to fetch, no bar to draw.
            continue

        bar = progressbar.ProgressBar(max_value=row_total).start()
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires for files passed to a reader.
        with tsv_path.open("r", newline="") as tsv_file:
            reader = csv.DictReader(tsv_file, delimiter='\t')
            for index, row in enumerate(reader):
                bar.update(index)
                tags_tweet = tags_to_dict(row)

                tweet_cache = tweet_cache_dir / tags_tweet["id_str"]
                if tweet_cache.is_file():
                    continue  # already cached by an earlier run or sheet

                try:
                    # First, check twitter for the tweet...
                    _cache_tweet_from_twitter(tags_tweet["id_str"], tweet_cache)
                except tweepy.TweepError as error:
                    # No tweet available: fall back to the TAGS record.
                    _cache_tweet_from_tags(tags_tweet, tweet_cache, error)

        bar.finish()

def tags_to_dict(rows):
    """Convert one TAGS archive row into a tweet-shaped dictionary.

    Args:
        rows: Mapping for a single TSV row, keyed by the TAGS column names
            read below (``id_str``, ``text``, ``from_user``, ...).

    Returns:
        dict: the row's values re-keyed for the tweet cache. ``id`` is the
        integer form of ``id_str``, the author fields are nested under
        ``'user'``, and ``json_source`` is set to ``"TAGS"``. All other
        values are passed through unchanged.

    Raises:
        ValueError: if the row has no ``id_str`` or it is not an integer.
        KeyError: if an expected TAGS column is missing from the row.
    """
    id_str = rows['id_str']
    if not id_str:
        # Fail with a descriptive message instead of an opaque int('') error.
        raise ValueError("TAGS row has an empty 'id_str'; cannot build a tweet record")

    return {
        'created_at': rows['created_at'],
        'id': int(id_str),
        'id_str': id_str,
        'from_user': rows['from_user'],
        'full_text': rows['text'],
        'geo_coordinates': rows['geo_coordinates'],
        'lang': rows['user_lang'],
        'in_reply_to_user_id_str': rows['in_reply_to_user_id_str'],
        'in_reply_to_screen_name': rows['in_reply_to_screen_name'],
        'user': {
            'id_str': rows['from_user_id_str'],
            'followers_count': rows['user_followers_count'],
            'friends_count': rows['user_friends_count'],
        },
        'in_reply_to_status_id_str': rows['in_reply_to_status_id_str'],
        'source': rows['source'],
        'profile_image_url': rows['profile_image_url'],
        'entities_str': rows['entities_str'],
        'json_source': "TAGS"
    }

In [None]:
# expand_tags_tsv accepts either a directory (every *.tsv file inside it is
# processed) or the path to a single TAGS archive sheet saved as a .tsv file.
# Tweets that already have a file in the tweet cache are skipped, so an
# interrupted run can simply be started again.
expand_tags_tsv('../../datasets/tags-tsv/male striptease/')

  3% (832 of 26167) |                    | Elapsed Time: 0:01:27 ETA:   1:42:35