In [None]:
#!/usr/bin/env python
""" Parses Twitter archives from Archive Team: The Twitter Stream Grab for a list of user-defined keywords

The Archive Team: The Twitter Stream Grab (https://archive.org/details/twitterstream) provides historic 
downloads of Twitter archives by month. This script helps researchers to mine this content for a list of 
words, phrases, or hashtags. This script requires the monthly archives to be downloaded and extracted from 
the .tar archive before use.

Output is a .csv file containing one record per relationship. Relationships are classified as either
(1) reply, (2) mention, or (3) tweet. A reply is a direct response to another user's post. A mention is 
a post that names another user but is not a direct reply to that user. A tweet is a post with neither a 
reply nor a mention; it is recorded with the poster as both poster and recipient.

See the modify section below to specify (1) keywords/hashtags, (2) top-level directory, and
(3) output file name.
"""

# Library imports
import json, os, bz2
import unicodecsv as csv    # unicodecsv for non-ascii handling
from IPython.display import clear_output     # to clear screen between directory prints

# Module metadata: authorship, credits, licensing, contact, and project status
__author__ = 'Joshua Been'
__copyright__ = 'Copyright 2018, Baylor University'
__credits__ = ['Joshua Been','Ann Mirabito','Clint Ratliff','Carol Schuets']
__license__ = 'MIT License'
__maintainer__ = 'Joshua Been'
__email__ = 'Joshua_Been@baylor.edu'
__status__ = 'Development'

# Modify keywords, top-level directory, and output file name
##################################################

# Search terms (words, phrases, or hashtags). Keep them lower case: each archive
# line is lower-cased before it is compared, so lower-case terms match all cases.
keywords = ['#likeagirl']

# Top level directory of the extracted archive; all of its subdirectories are
# searched. Use forward slashes only (/) - Do not place / at end
directory = 'D:/archiveteam-twitter-stream-2014-06'

# Output .csv table name. A bare file name is resolved against the current
# working directory (normally the directory of the Jupyter script).
outfile = 'likeagirl06.csv'

##################################################


def _encode_keywords(keywords):
    """Return the keywords as lower-cased byte strings for raw-line matching."""
    encoded = []
    for keyword in keywords:
        keyword = keyword.lower()
        # Archive lines are raw bytes; comparing text against them would force
        # an implicit ASCII decode of every line, so encode the keyword instead.
        if not isinstance(keyword, bytes):
            keyword = keyword.encode('utf-8')
        encoded.append(keyword)
    return encoded


def _relationships(tweet):
    """Return the (recipient, relationship) pairs described by one parsed tweet."""
    reply_to = tweet['in_reply_to_screen_name']
    pairs = []

    # A direct reply to another user's post
    if reply_to is not None:
        pairs.append((reply_to, 'reply'))

    # Every mentioned user, except the one already recorded as the reply target
    for mention in tweet['entities']['user_mentions']:
        recipient = mention['screen_name']
        if recipient != reply_to:
            pairs.append((recipient, 'mentions'))

    # Neither reply nor mention: record the poster as their own recipient
    if not pairs:
        pairs.append((tweet['user']['screen_name'], 'tweet'))
    return pairs


def process_json(keywords,directory,outfile):
    """Search every .bz2 archive under a directory and write matches to a .csv file.

    Each line of each archive is lower-cased and tested for the keywords. A
    matching line is parsed as JSON and written as one row per relationship
    (reply, mentions, or tweet). Running totals are printed once per directory.

    Args:
        keywords: search terms; matching is case-insensitive and is done against
            the whole raw record, not only the tweet text.
        directory: top-level directory that is walked recursively.
        outfile: path of the .csv file to create (overwritten if it exists).

    Raises:
        ValueError: if a matching line is not valid JSON.
        KeyError: if a matching record lacks an expected tweet field.
    """
    cursor = '  >>  '
    # Running totals, keyed by the relationship label written to the .csv
    counts = {'mentions': 0, 'reply': 0, 'tweet': 0}
    needles = _encode_keywords(keywords)

    with open(outfile, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        # Write header row
        writer.writerow(['poster','recipient','relationship','tweet date','tweet id','tweet','hashtags','retweet status'])

        # Walk through all subdirectories
        for dirs, subdirs, files in os.walk(directory):

            # Screen prints (single-argument form prints the same text on Python 2 and 3)
            clear_output()
            print('%s mentions: %s' % (cursor, counts['mentions']))
            print('%s replies: %s' % (cursor, counts['reply']))
            print('%s tweets: %s' % (cursor, counts['tweet']))
            print('%s * total: %s' % (cursor, sum(counts.values())))
            print('-' * 10)
            print('%s currently searching %s' % (cursor, dirs))

            for filename in files:
                if not filename.endswith('.bz2'):
                    continue

                # Decompress as a stream; the with-block closes each archive
                # so handles do not accumulate over a long walk.
                with bz2.BZ2File(os.path.join(dirs, filename), 'r') as archive:
                    for line in archive:
                        # Cheap substring pre-filter before paying for JSON parsing
                        lowered = line.lower()
                        if not any(needle in lowered for needle in needles):
                            continue

                        # Load the record as a json object
                        tweet = json.loads(line)

                        # Save standard tweet info
                        poster = tweet['user']['screen_name']
                        tweet_date = tweet['created_at']
                        tweet_id = tweet['id']
                        tweet_text = tweet['text']

                        # Check the parsed key rather than the raw line, so a tweet
                        # that merely contains the text is not flagged as a retweet.
                        retweet = 'True' if 'retweeted_status' in tweet else 'False'

                        # NOTE: the list is written as-is, so the hashtags column
                        # holds the list's string form; kept for output compatibility.
                        hashes = [hashtag['text'] for hashtag in tweet['entities']['hashtags']]

                        # One row per relationship
                        for recipient, relationship in _relationships(tweet):
                            writer.writerow([poster,recipient,relationship,tweet_date,tweet_id,tweet_text,hashes,retweet])
                            counts[relationship] += 1


if __name__ == '__main__':
    # Run the search with the module-level settings, then report completion.
    process_json(keywords, directory, outfile)
    print('-' * 10)
    print('complete!')
