In [82]:
from pprint import pprint
import json, string
import csv
import tweepy
import yaml
import time

from pathlib import Path
from time import sleep
import progressbar
import re

from nltk.corpus import stopwords

In [3]:
# Set up cache directories
tweet_cache_dir = Path('/Users/kallewesterling/_twitter_cache/tweets/')
user_cache_dir = Path('/Users/kallewesterling/_twitter_cache/users/')

if not tweet_cache_dir.is_dir(): tweet_cache_dir.mkdir(parents=True)
if not user_cache_dir.is_dir(): user_cache_dir.mkdir(parents=True)

In [4]:
# Set up tweepy
class _TwitterCredentials():

    def __init__(self):
        with open('../conference-documentation/credentials.yml') as f: self._ = yaml.load(f)

    def __getitem__(self, i):
        return self._[i]

twitter_credentials = _TwitterCredentials()

auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token'], twitter_credentials['access_token_secret'])
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [72]:
def get_tsvs_from_directory(directory):
    return([x for x in Path(directory).glob(pattern="*.tsv")])
    
def expand_tags_tsv(file, _filter=False, filter_field=None, filter_value=None, done=[]):
    '''
    The expand_tags_tsv function accepts strings of individual directories or, if you prefer, individual TAGS archive sheets, saved as .tsv files, alternatively lists of either of those.
    
    For example:
    - expand_tags_tsv('../../datasets/tags-tsv/burlesk OR burleycue OR burly-q/')
    - expand_tags_tsv('../../datasets/tags-tsv/burlesque/TAGS - burlesque 1 - Archive.tsv', _filter=True, filter_field="full_text", filter_value="burlesque")
    - expand_tags_tsv('../../datasets/tags-tsv/burlesque/', _filter=True, filter_field="full_text", filter_value="burlesque", done=['TAGS - burlesque 60 - Archive.tsv'])
    '''
    if isinstance(file, str): files = [file]
    elif isinstance(file, list): files = file
    else: raise RuntimeError(f"File/path passed must either be a sole file/path or a list of files/paths.")
        
    for file in files:
        if Path(file).is_file(): _ = [Path(file)]
        elif Path(file).is_dir(): _ = get_tsvs_from_directory('../../datasets/tags-tsv/boylesque/')
        else:
            raise RuntimeError(f"Cannot interpret passed argument: {file}.")
        for tsv in _:
            if tsv.name not in done:
                empty_rows, unable_to_interpret, saved_from_twitter, saved_from_tags = 0, 0, 0, 0

                with Path(tsv).open("r") as f:
                    _len = len(f.readlines())
                with Path(tsv).open("r") as f:
                    print(f"Reading {Path(tsv).name}...")
                    time.sleep(1)
                    bar = progressbar.ProgressBar(max_value=_len).start()
                    reader = csv.DictReader(f, delimiter='\t')
                    for i, rows in enumerate(reader):
                        stop = False

                        bar.update(i)
                        if not rows['created_at'] and not rows['from_user'] and not rows['text']:
                            pass
                        try:
                            id_int = int(rows['id_str'])
                            _dict = tags_to_dict(rows)
                            if _filter: 
                                try:
                                    if filter_value in _dict[filter_field]: stop = False
                                    else: stop = True
                                except TypeError:
                                    stop = True

                            if not stop:
                                tweet_cache = tweet_cache_dir / _dict["id_str"]
                                if not tweet_cache.is_file():
                                    ## First, check twitter for tweet here... 
                                    try:
                                        live_tweet = api.get_status(_dict['id_str'], tweet_mode="extended")
                                        _json = live_tweet._json
                                        _json['json_source'] = 'Twitter'

                                        _json_user = _json['user']
                                        _json['user'] = _json_user['id']

                                        with Path(tweet_cache).open("w+") as f:
                                            json.dump(_json, f)

                                        user_cache = user_cache_dir / str(_json_user["id"])
                                        if not user_cache.is_file():
                                            with Path(user_cache).open("w+") as f:
                                                _json_user['json_source'] = 'Twitter'
                                                json.dump(_json_user, f)

                                        saved_from_twitter += 1

                                    except tweepy.TweepError as e:
                                        ## No tweet available:
                                        with Path(tweet_cache).open("w+") as f:
                                            _dict['error'] = str(e)
                                            json.dump(_dict, f)
                                        saved_from_tags += 1

                                        try:
                                            user_cache = user_cache_dir / str(_dict["user"]["id_str"])
                                            if not user_cache.is_file():
                                                live_user = api.get_user(_dict["user"]["id_str"])
                                                _json_user = live_user._json

                                                # remove the user's latest status
                                                try: del _json_user['status']
                                                except KeyError: pass

                                                _json_user['json_source'] = 'Twitter'

                                                with Path(user_cache).open("w+") as f:
                                                    json.dump(_json_user, f)

                                        except tweepy.TweepError as e:
                                            if not user_cache.is_file():
                                                _json_user = {"error": str(e)}
                                                with Path(user_cache).open("w+") as f:
                                                    json.dump(_json_user, f)


                        except ValueError:
                            if rows['id_str'] == "":
                                empty_rows += 1
                            else:
                                try:
                                    # print(f"Warning: Could not interpret row {i} in file {Path(file).name} due to unintelligibility but trying to find tweet ID...")
                                    _ = re.search(pattern="\/statuses\/([0-9]+)'\)", string=str(rows))
                                    g = _.groups()
                                    # print(f"Good news! Found ID: {g}")
                                    tweet_cache = tweet_cache_dir / g[0]
                                    if not tweet_cache.is_file():
                                        ## First, check twitter for tweet here... 
                                        try:
                                            if _filter: 
                                                try:
                                                    if filter_value in _dict[filter_field]: stop = False
                                                    else: stop = True
                                                except TypeError:
                                                    stop = True

                                            if not stop:
                                                live_tweet = api.get_status(g[0], tweet_mode="extended")
                                                _json = live_tweet._json
                                                _json['json_source'] = 'Twitter'

                                                _json_user = _json['user']
                                                _json['user'] = _json_user['id']

                                                with Path(tweet_cache).open("w+") as f:
                                                    json.dump(_json, f)

                                                user_cache = user_cache_dir / str(_json_user["id"])
                                                if not user_cache.is_file():
                                                    with Path(user_cache).open("w+") as f:
                                                        _json_user['json_source'] = 'Twitter'
                                                        json.dump(_json_user, f)
                                        except tweepy.TweepError as e:
                                            ## No tweet available:
                                            with Path(tweet_cache).open("w+") as f:
                                                _dict['error'] = str(e)
                                                json.dump(_dict, f)
                                        except:
                                            print(f"An error occurred in file {f}")
                                except:
                                    unable_to_interpret += 1
                    bar.finish()
                    print(f"---- done processing {Path(file).name} - report: -----")
                    print(f"- {empty_rows} empty rows encountered.")
                    print(f"- Unable to interpret {unable_to_interpret} rows.")
                    print("\n")
                    print(f"- {saved_from_twitter} tweets saved from Twitter.")
                    print(f"- {saved_from_tags} tweets saved from TAGS.")
                    print("\n")

def tags_to_dict(rows):
    if not rows['id_str']:
        print("STOP")
        pprint(rows)
        exit()
    else:
        _dict = {
            'created_at': rows['created_at'],
            'id': int(rows['id_str']),
            'id_str': rows['id_str'],
                            'from_user': rows['from_user'],
            'full_text': rows['text'],
            'geo_coordinates': rows['geo_coordinates'],
            'lang': rows['user_lang'],
            'in_reply_to_user_id_str': rows['in_reply_to_user_id_str'],
            'in_reply_to_screen_name': rows['in_reply_to_screen_name'],
            'user': {
                'id_str': rows['from_user_id_str'],
                'followers_count': rows['user_followers_count'],
                'friends_count': rows['user_friends_count'],
            },
            'in_reply_to_status_id_str': rows['in_reply_to_status_id_str'],
            'source': rows['source'],
            'profile_image_url': rows['profile_image_url'],
            'entities_str': rows['entities_str'],
            'json_source': "TAGS"
        }
    return(_dict)

def get_ids_from_tsv(file):
    if not Path(file).is_file(): raise RuntimeError(f"File {file} does not exist.")
    
    all_ids = []
    with Path(file).open("r") as f:
        reader = csv.DictReader(f, delimiter='\t')
        for i, rows in enumerate(reader):
            if not rows['created_at'] and not rows['from_user'] and not rows['text']: pass # skip empty rows
            try:
                id_int = int(rows['id_str'])
                all_ids.append(id_int)
            except:
                print(f"Warning: ID ({rows['id_str']}) could not be interpreted as number.")
    return(list(set(all_ids)))

def get_json_from_cache(id=None):
    tweet_cache = tweet_cache_dir / str(id)
    if not tweet_cache.is_file():
        raise RuntimeError(f"File {id} could not be opened.")
    else:
        with open(tweet_cache, "r") as f:
            _json = json.load(f)
        return(_json)

In [74]:
all_ids = get_ids_from_tsv(Path('../../datasets/tags-tsv/boylesque/TAGS - boylesque - Archive.tsv'))



In [77]:
for id in all_ids[0:10]:
    _json = get_json_from_cache(id)
    if "boylesque" in _json['full_text']:
        if 'geo' in _json and _json['geo'] is not None: 
            pass # pprint(_json['geo'])
        clean_text(_json['full_text'])

NameError: name 'stopwords' is not defined

In [67]:
import instagram
instagram.__file__

Running Instagram module version 2019-04-29.


'/usr/local/lib/python3.7/site-packages/instagram.py'

In [10]:
expand_tags_tsv(['../../datasets/tags-tsv/boylesque/', '../../datasets/tags-tsv/boy-lesque/'], _filter=True, filter_field="full_text", filter_value="burlesque") # done

Reading TAGS - boylesque - Archive.tsv...






































100% (38577 of 38577) |##################| Elapsed Time: 0:00:17 Time:  0:00:17


---- done processing boylesque - report: -----
- 2 empty rows encountered.
- Unable to interpret 0 rows.


- 0 tweets saved from Twitter.
- 0 tweets saved from TAGS.


Reading TAGS - boy-lesque - Archive.tsv...


100% (87 of 87) |########################| Elapsed Time: 0:00:00 Time:  0:00:00


---- done processing boy-lesque - report: -----
- 0 empty rows encountered.
- Unable to interpret 0 rows.


- 0 tweets saved from Twitter.
- 0 tweets saved from TAGS.




In [22]:
expand_tags_tsv(['../../datasets/tags-tsv/male striptease/','../../datasets/tags-tsv/male burlesque/'])

Reading TAGS - boylesque - Archive.tsv...














  7% (2967 of 38577) |#                  | Elapsed Time: 0:00:05 ETA:   0:01:43

KeyboardInterrupt: 