In [22]:
# for data wrangling and sampling
import pandas as pd
import numpy as np
import random

import requests       # to download files programmatically
import os             # to save/open files and for terminal-like commands to navigate local machine
import tweepy
import pprint as pp   # data pretty printer - https://docs.python.org/2/library/pprint.html
import json           # for json I/O and parsing
import time           # for timing code and dealing with Twitter's rate limit

# Set the random seed to assure the same answers are returned each time 
random.seed(42)

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

# for (potential) regression modeling of data
import statsmodels.api as sm;
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Read in a Twitter archive (manual / already provided)

In [23]:
df_archive = pd.read_csv("twitter-archive-enhanced.csv")
df_archive.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


### Download tweet image predictions (programmatically, from a url)

In [24]:
# get file from a url
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

# get the current working directory
folder_name = os.getcwd()

# get the filename
file_name = url.split('/')[-1]

# save the retrieved file to local storage
with open(os.path.join(folder_name,
                      file_name), mode='wb') as file:
    file.write(response.content)

# read in the downloaded file
df_images = pd.read_csv(file_name, sep='\t')
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


NOTE:
* The response variable is in bytes format, not text format.
* As such, the 'wb' flag is used when writing the file locally
* [Link to a StackOverflow post](https://stackoverflow.com/questions/2665866/what-does-wb-mean-in-this-code-using-python) on the subject<br><br>

Template code for future reference:

In [None]:
# template code to make a directory if it doesn't already exist
#folder_name = 'my_new_folder'
#if not os.path.exists(folder_name):
#    os.makedirs(folder_name)

# command to list the current directory's contents
#os.listdir()

### Create an API object to gather Twitter data

In [25]:
# get the API Access Token and Acces Token Secret
from twAPI_tokens_GoodDoggo import API_KEY, API_KEY_SECRET, API_TOKEN, API_TOKEN_SECRET

CONSUMER_KEY = API_KEY
CONSUMER_SECRET = API_KEY_SECRET
ACCESS_TOKEN = API_TOKEN
ACCESS_SECRET = API_TOKEN_SECRET

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

# api = tweepy.API(auth)

# code snippet for getting around the twitter rate limit:
api = tweepy.API(auth, 
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

#### Get a list of tweet IDs:

In [26]:
# Check if there are any repeated tweets in the archive
numUniqueValues = df_archive.tweet_id.nunique()
print('Number of tweets: ' + str(len(df_archive)))
print('Number of repeated tweets: ' + str(len(df_archive) - numUniqueValues))

# Create list of tweet IDs
tweet_id_list = df_archive.tweet_id.tolist()

Number of tweets: 2356
Number of repeated tweets: 0


### Use the API to get info for each tweet

* ___Retrieve json data for the first tweet and write it to local storage___
* [StackOverflow article](https://stackoverflow.com/questions/28384588/twitter-api-get-tweets-with-specific-id) on getting JSON data for a specific tweet
* [StackAbuse article](https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/) on reading and writing JSON to a file in Python

In [44]:
# a succinct list for testing
tweet_ids_succinct = tweet_id_list[0:4]

# loop through multiple tweet_id's, retrieving and writing their json data to 'tweet_json.txt' 
with open('tweet_json.txt', mode = 'w') as textFile:
    count = 0
    for tweet_id in tweet_id_list:
        count = count + 1
        start = time.time()
        try:
            status = api.get_status(tweet_id)
            jsonStr = json.dumps(status._json)
        except:
            continue     # tweet no longer exists
        textFile.write(jsonStr + '\n')
        end = time.time()
        currTime = str(time.localtime().tm_hour) + ':' + str(time.localtime().tm_min) + ':' + str(time.localtime().tm_sec)
        print('count: ' + str(count) + ', time elapsed: ' + str(end - start) + ', current time: ' + currTime)
        
# # retrieve json data for the first tweet and write it to local storage
# tweet_id = df_archive['tweet_id'][0]
# with open('tweet_json.txt', mode = 'w') as textFile:
#     status = api.get_status(tweet_id, tweet_mode='extended')
#     jsonStr = json.dumps(status._json)
#     textFile.write(jsonStr + '\n')

count: 1, time elapsed: 0.1855919361114502, current time: 6:55:47
count: 2, time elapsed: 0.20209717750549316, current time: 6:55:47
count: 3, time elapsed: 0.1695411205291748, current time: 6:55:47
count: 4, time elapsed: 0.1604290008544922, current time: 6:55:48
count: 5, time elapsed: 0.1684889793395996, current time: 6:55:48
count: 6, time elapsed: 0.18327879905700684, current time: 6:55:48
count: 7, time elapsed: 0.18135905265808105, current time: 6:55:48
count: 8, time elapsed: 0.21911001205444336, current time: 6:55:48
count: 9, time elapsed: 0.1818697452545166, current time: 6:55:49
count: 10, time elapsed: 0.20307302474975586, current time: 6:55:49
count: 11, time elapsed: 0.19119691848754883, current time: 6:55:49
count: 12, time elapsed: 0.17891478538513184, current time: 6:55:49
count: 13, time elapsed: 0.1970350742340088, current time: 6:55:49
count: 14, time elapsed: 0.17577195167541504, current time: 6:55:50
count: 15, time elapsed: 0.17529606819152832, current time: 6:5

count: 129, time elapsed: 0.1992349624633789, current time: 6:56:12
count: 130, time elapsed: 0.20294713973999023, current time: 6:56:12
count: 131, time elapsed: 0.2182750701904297, current time: 6:56:12
count: 132, time elapsed: 0.19101309776306152, current time: 6:56:12
count: 134, time elapsed: 0.20110797882080078, current time: 6:56:13
count: 135, time elapsed: 0.17737197875976562, current time: 6:56:13
count: 136, time elapsed: 0.17316102981567383, current time: 6:56:13
count: 137, time elapsed: 0.18703293800354004, current time: 6:56:13
count: 138, time elapsed: 0.3594951629638672, current time: 6:56:13
count: 139, time elapsed: 0.21666789054870605, current time: 6:56:14
count: 140, time elapsed: 0.19523406028747559, current time: 6:56:14
count: 141, time elapsed: 0.22452688217163086, current time: 6:56:14
count: 142, time elapsed: 0.18011903762817383, current time: 6:56:14
count: 143, time elapsed: 0.1993701457977295, current time: 6:56:14
count: 144, time elapsed: 0.1735808849

count: 255, time elapsed: 0.16694402694702148, current time: 6:56:35
count: 256, time elapsed: 0.17322611808776855, current time: 6:56:35
count: 257, time elapsed: 0.1681511402130127, current time: 6:56:35
count: 258, time elapsed: 0.20086884498596191, current time: 6:56:35
count: 259, time elapsed: 0.16301703453063965, current time: 6:56:35
count: 260, time elapsed: 0.18730497360229492, current time: 6:56:35
count: 262, time elapsed: 0.16507887840270996, current time: 6:56:36
count: 263, time elapsed: 0.1884608268737793, current time: 6:56:36
count: 264, time elapsed: 0.16106104850769043, current time: 6:56:36
count: 265, time elapsed: 0.18955492973327637, current time: 6:56:36
count: 266, time elapsed: 0.1636497974395752, current time: 6:56:36
count: 267, time elapsed: 0.16487884521484375, current time: 6:56:37
count: 268, time elapsed: 0.1811990737915039, current time: 6:56:37
count: 269, time elapsed: 0.1631629467010498, current time: 6:56:37
count: 270, time elapsed: 0.17678809165

count: 378, time elapsed: 0.2101147174835205, current time: 6:56:56
count: 379, time elapsed: 0.1636049747467041, current time: 6:56:56
count: 380, time elapsed: 0.16983890533447266, current time: 6:56:56
count: 381, time elapsed: 0.16713595390319824, current time: 6:56:56
count: 382, time elapsed: 0.1608428955078125, current time: 6:56:57
count: 384, time elapsed: 0.16774606704711914, current time: 6:56:57
count: 385, time elapsed: 0.21935701370239258, current time: 6:56:57
count: 386, time elapsed: 0.16494488716125488, current time: 6:56:57
count: 387, time elapsed: 0.18173694610595703, current time: 6:56:58
count: 388, time elapsed: 0.1714329719543457, current time: 6:56:58
count: 389, time elapsed: 0.16606807708740234, current time: 6:56:58
count: 390, time elapsed: 0.16333222389221191, current time: 6:56:58
count: 391, time elapsed: 0.1719040870666504, current time: 6:56:58
count: 392, time elapsed: 0.16288280487060547, current time: 6:56:58
count: 393, time elapsed: 0.17881608009

count: 499, time elapsed: 0.20387506484985352, current time: 6:57:18
count: 500, time elapsed: 0.1662888526916504, current time: 6:57:18
count: 501, time elapsed: 0.1623067855834961, current time: 6:57:18
count: 502, time elapsed: 0.18772006034851074, current time: 6:57:18
count: 503, time elapsed: 0.1686420440673828, current time: 6:57:18
count: 504, time elapsed: 0.17782306671142578, current time: 6:57:19
count: 505, time elapsed: 0.16399526596069336, current time: 6:57:19
count: 506, time elapsed: 0.1823420524597168, current time: 6:57:19
count: 508, time elapsed: 0.18465185165405273, current time: 6:57:19
count: 509, time elapsed: 0.20055389404296875, current time: 6:57:20
count: 510, time elapsed: 0.18357491493225098, current time: 6:57:20
count: 511, time elapsed: 0.17842888832092285, current time: 6:57:20
count: 512, time elapsed: 0.17116808891296387, current time: 6:57:20
count: 513, time elapsed: 0.19210433959960938, current time: 6:57:20
count: 514, time elapsed: 0.2064657211

count: 621, time elapsed: 0.17682790756225586, current time: 6:57:41
count: 622, time elapsed: 0.1717357635498047, current time: 6:57:41
count: 623, time elapsed: 0.18858695030212402, current time: 6:57:41
count: 624, time elapsed: 0.16782379150390625, current time: 6:57:41
count: 625, time elapsed: 0.1669001579284668, current time: 6:57:42
count: 626, time elapsed: 0.16177892684936523, current time: 6:57:42
count: 627, time elapsed: 0.16554784774780273, current time: 6:57:42
count: 628, time elapsed: 0.17503094673156738, current time: 6:57:42
count: 629, time elapsed: 0.17290806770324707, current time: 6:57:42
count: 630, time elapsed: 0.17932915687561035, current time: 6:57:42
count: 631, time elapsed: 0.15502214431762695, current time: 6:57:43
count: 632, time elapsed: 0.16101717948913574, current time: 6:57:43
count: 633, time elapsed: 0.16632986068725586, current time: 6:57:43
count: 634, time elapsed: 0.17981576919555664, current time: 6:57:43
count: 635, time elapsed: 0.17599701

count: 741, time elapsed: 0.3398170471191406, current time: 6:58:2
count: 742, time elapsed: 0.2301039695739746, current time: 6:58:2
count: 743, time elapsed: 0.17896032333374023, current time: 6:58:2
count: 744, time elapsed: 0.1948871612548828, current time: 6:58:3
count: 745, time elapsed: 0.16974496841430664, current time: 6:58:3
count: 746, time elapsed: 0.18752098083496094, current time: 6:58:3
count: 747, time elapsed: 0.17000985145568848, current time: 6:58:3
count: 748, time elapsed: 0.16823792457580566, current time: 6:58:3
count: 749, time elapsed: 0.23981714248657227, current time: 6:58:4
count: 750, time elapsed: 0.16799402236938477, current time: 6:58:4
count: 751, time elapsed: 0.16423606872558594, current time: 6:58:4
count: 752, time elapsed: 0.17925381660461426, current time: 6:58:4
count: 753, time elapsed: 0.16433215141296387, current time: 6:58:4
count: 754, time elapsed: 0.17451810836791992, current time: 6:58:4
count: 755, time elapsed: 0.17507100105285645, curr

count: 863, time elapsed: 0.17493724822998047, current time: 6:58:24
count: 864, time elapsed: 0.17312169075012207, current time: 6:58:24
count: 865, time elapsed: 0.16046810150146484, current time: 6:58:24
count: 866, time elapsed: 0.18503069877624512, current time: 6:58:24
count: 867, time elapsed: 0.17368221282958984, current time: 6:58:24
count: 868, time elapsed: 0.16306686401367188, current time: 6:58:25
count: 869, time elapsed: 0.1766977310180664, current time: 6:58:25
count: 870, time elapsed: 0.16569018363952637, current time: 6:58:25
count: 871, time elapsed: 0.16328811645507812, current time: 6:58:25
count: 872, time elapsed: 0.1704859733581543, current time: 6:58:25
count: 873, time elapsed: 0.18094635009765625, current time: 6:58:25
count: 874, time elapsed: 0.1994168758392334, current time: 6:58:26
count: 875, time elapsed: 0.16266798973083496, current time: 6:58:26
count: 876, time elapsed: 0.17027807235717773, current time: 6:58:26
count: 877, time elapsed: 0.204294204

Rate limit reached. Sleeping for: 696


count: 895, time elapsed: 0.16147780418395996, current time: 6:58:29
count: 896, time elapsed: 701.2810881137848, current time: 7:10:11
count: 897, time elapsed: 0.1842491626739502, current time: 7:10:11
count: 898, time elapsed: 0.1674661636352539, current time: 7:10:11
count: 899, time elapsed: 0.18051600456237793, current time: 7:10:11
count: 900, time elapsed: 0.17761611938476562, current time: 7:10:11
count: 901, time elapsed: 0.18134379386901855, current time: 7:10:12
count: 902, time elapsed: 0.16456317901611328, current time: 7:10:12
count: 903, time elapsed: 0.17784976959228516, current time: 7:10:12
count: 904, time elapsed: 0.17822504043579102, current time: 7:10:12
count: 905, time elapsed: 0.1769402027130127, current time: 7:10:12
count: 906, time elapsed: 0.16826200485229492, current time: 7:10:12
count: 907, time elapsed: 0.17831182479858398, current time: 7:10:13
count: 908, time elapsed: 0.1726539134979248, current time: 7:10:13
count: 909, time elapsed: 0.186105012893

count: 1017, time elapsed: 0.17386293411254883, current time: 7:10:32
count: 1018, time elapsed: 0.16481924057006836, current time: 7:10:32
count: 1019, time elapsed: 0.18266892433166504, current time: 7:10:33
count: 1020, time elapsed: 0.17133283615112305, current time: 7:10:33
count: 1021, time elapsed: 0.16434669494628906, current time: 7:10:33
count: 1022, time elapsed: 0.16705012321472168, current time: 7:10:33
count: 1023, time elapsed: 0.1736760139465332, current time: 7:10:33
count: 1024, time elapsed: 0.18394708633422852, current time: 7:10:33
count: 1025, time elapsed: 0.16290616989135742, current time: 7:10:34
count: 1026, time elapsed: 0.16834187507629395, current time: 7:10:34
count: 1027, time elapsed: 0.1708378791809082, current time: 7:10:34
count: 1028, time elapsed: 0.1683652400970459, current time: 7:10:34
count: 1029, time elapsed: 0.1953279972076416, current time: 7:10:34
count: 1030, time elapsed: 0.16695404052734375, current time: 7:10:34
count: 1031, time elapse

count: 1136, time elapsed: 0.16630983352661133, current time: 7:10:53
count: 1137, time elapsed: 0.18674898147583008, current time: 7:10:53
count: 1138, time elapsed: 0.16687369346618652, current time: 7:10:54
count: 1139, time elapsed: 0.19078612327575684, current time: 7:10:54
count: 1140, time elapsed: 0.16182780265808105, current time: 7:10:54
count: 1141, time elapsed: 0.16255688667297363, current time: 7:10:54
count: 1142, time elapsed: 0.17612290382385254, current time: 7:10:54
count: 1143, time elapsed: 0.1792759895324707, current time: 7:10:54
count: 1144, time elapsed: 0.17066192626953125, current time: 7:10:55
count: 1145, time elapsed: 0.1639089584350586, current time: 7:10:55
count: 1146, time elapsed: 0.18044614791870117, current time: 7:10:55
count: 1147, time elapsed: 0.16910886764526367, current time: 7:10:55
count: 1148, time elapsed: 0.1668870449066162, current time: 7:10:55
count: 1149, time elapsed: 0.17142510414123535, current time: 7:10:55
count: 1150, time elaps

count: 1256, time elapsed: 0.17024803161621094, current time: 7:11:14
count: 1257, time elapsed: 0.166795015335083, current time: 7:11:14
count: 1258, time elapsed: 0.1985940933227539, current time: 7:11:15
count: 1259, time elapsed: 0.19158506393432617, current time: 7:11:15
count: 1260, time elapsed: 0.15876102447509766, current time: 7:11:15
count: 1261, time elapsed: 0.17010807991027832, current time: 7:11:15
count: 1262, time elapsed: 0.1748809814453125, current time: 7:11:15
count: 1263, time elapsed: 0.15707111358642578, current time: 7:11:15
count: 1264, time elapsed: 0.17585492134094238, current time: 7:11:16
count: 1265, time elapsed: 0.17490816116333008, current time: 7:11:16
count: 1266, time elapsed: 0.1795032024383545, current time: 7:11:16
count: 1267, time elapsed: 0.17395973205566406, current time: 7:11:16
count: 1268, time elapsed: 0.16336894035339355, current time: 7:11:16
count: 1269, time elapsed: 0.19246816635131836, current time: 7:11:17
count: 1270, time elapsed

count: 1374, time elapsed: 0.17633914947509766, current time: 7:11:35
count: 1375, time elapsed: 0.15984106063842773, current time: 7:11:36
count: 1376, time elapsed: 0.16207098960876465, current time: 7:11:36
count: 1377, time elapsed: 0.16570782661437988, current time: 7:11:36
count: 1378, time elapsed: 0.18469810485839844, current time: 7:11:36
count: 1379, time elapsed: 0.17603564262390137, current time: 7:11:36
count: 1380, time elapsed: 0.17976999282836914, current time: 7:11:36
count: 1381, time elapsed: 0.16701078414916992, current time: 7:11:37
count: 1382, time elapsed: 0.16613984107971191, current time: 7:11:37
count: 1383, time elapsed: 0.1745610237121582, current time: 7:11:37
count: 1384, time elapsed: 0.17855286598205566, current time: 7:11:37
count: 1385, time elapsed: 0.1721019744873047, current time: 7:11:37
count: 1386, time elapsed: 0.18915677070617676, current time: 7:11:37
count: 1387, time elapsed: 0.16534018516540527, current time: 7:11:38
count: 1388, time elap

count: 1493, time elapsed: 0.16414785385131836, current time: 7:11:56
count: 1494, time elapsed: 0.18096208572387695, current time: 7:11:56
count: 1495, time elapsed: 0.16531014442443848, current time: 7:11:56
count: 1496, time elapsed: 0.16854429244995117, current time: 7:11:57
count: 1497, time elapsed: 0.17048215866088867, current time: 7:11:57
count: 1498, time elapsed: 0.17095184326171875, current time: 7:11:57
count: 1499, time elapsed: 0.18622827529907227, current time: 7:11:57
count: 1500, time elapsed: 0.15934514999389648, current time: 7:11:57
count: 1501, time elapsed: 0.1782362461090088, current time: 7:11:57
count: 1502, time elapsed: 0.17113304138183594, current time: 7:11:58
count: 1503, time elapsed: 0.17197322845458984, current time: 7:11:58
count: 1504, time elapsed: 0.17844724655151367, current time: 7:11:58
count: 1505, time elapsed: 0.1726682186126709, current time: 7:11:58
count: 1506, time elapsed: 0.1783132553100586, current time: 7:11:58
count: 1507, time elaps

count: 1612, time elapsed: 0.17819786071777344, current time: 7:12:17
count: 1613, time elapsed: 0.17351508140563965, current time: 7:12:17
count: 1614, time elapsed: 0.18322420120239258, current time: 7:12:17
count: 1615, time elapsed: 0.18434596061706543, current time: 7:12:17
count: 1616, time elapsed: 0.17224979400634766, current time: 7:12:18
count: 1617, time elapsed: 0.1836719512939453, current time: 7:12:18
count: 1618, time elapsed: 0.16745495796203613, current time: 7:12:18
count: 1619, time elapsed: 0.18030905723571777, current time: 7:12:18
count: 1620, time elapsed: 0.16991829872131348, current time: 7:12:18
count: 1621, time elapsed: 0.16541695594787598, current time: 7:12:19
count: 1622, time elapsed: 0.17206406593322754, current time: 7:12:19
count: 1623, time elapsed: 0.1715240478515625, current time: 7:12:19
count: 1624, time elapsed: 0.18260407447814941, current time: 7:12:19
count: 1625, time elapsed: 0.17432284355163574, current time: 7:12:19
count: 1626, time elap

count: 1732, time elapsed: 0.19371795654296875, current time: 7:12:38
count: 1733, time elapsed: 0.17037510871887207, current time: 7:12:38
count: 1734, time elapsed: 0.17367100715637207, current time: 7:12:38
count: 1735, time elapsed: 0.1574561595916748, current time: 7:12:38
count: 1736, time elapsed: 0.17107105255126953, current time: 7:12:39
count: 1737, time elapsed: 0.16915512084960938, current time: 7:12:39
count: 1738, time elapsed: 0.17894601821899414, current time: 7:12:39
count: 1739, time elapsed: 0.18193578720092773, current time: 7:12:39
count: 1740, time elapsed: 0.1693558692932129, current time: 7:12:39
count: 1741, time elapsed: 0.16578888893127441, current time: 7:12:39
count: 1742, time elapsed: 0.18237805366516113, current time: 7:12:40
count: 1743, time elapsed: 0.15578722953796387, current time: 7:12:40
count: 1744, time elapsed: 0.17885589599609375, current time: 7:12:40
count: 1745, time elapsed: 0.1675548553466797, current time: 7:12:40
count: 1746, time elaps

Rate limit reached. Sleeping for: 742


count: 1795, time elapsed: 0.18203425407409668, current time: 7:12:49
count: 1796, time elapsed: 747.2817389965057, current time: 7:25:16
count: 1797, time elapsed: 0.1767580509185791, current time: 7:25:16
count: 1798, time elapsed: 0.18965911865234375, current time: 7:25:17
count: 1799, time elapsed: 0.18723297119140625, current time: 7:25:17
count: 1800, time elapsed: 0.1808760166168213, current time: 7:25:17
count: 1801, time elapsed: 0.18085098266601562, current time: 7:25:17
count: 1802, time elapsed: 0.17388486862182617, current time: 7:25:17
count: 1803, time elapsed: 0.1715538501739502, current time: 7:25:17
count: 1804, time elapsed: 0.18264985084533691, current time: 7:25:18
count: 1805, time elapsed: 0.20826482772827148, current time: 7:25:18
count: 1806, time elapsed: 0.16235876083374023, current time: 7:25:18
count: 1807, time elapsed: 0.17551422119140625, current time: 7:25:18
count: 1808, time elapsed: 0.16676998138427734, current time: 7:25:18
count: 1809, time elapsed

count: 1914, time elapsed: 0.17865991592407227, current time: 7:25:37
count: 1915, time elapsed: 0.1627960205078125, current time: 7:25:37
count: 1916, time elapsed: 0.19804811477661133, current time: 7:25:38
count: 1917, time elapsed: 0.15924406051635742, current time: 7:25:38
count: 1918, time elapsed: 0.17779088020324707, current time: 7:25:38
count: 1919, time elapsed: 0.17874789237976074, current time: 7:25:38
count: 1920, time elapsed: 0.16788196563720703, current time: 7:25:38
count: 1921, time elapsed: 0.17015886306762695, current time: 7:25:39
count: 1922, time elapsed: 0.17274904251098633, current time: 7:25:39
count: 1923, time elapsed: 0.15694713592529297, current time: 7:25:39
count: 1924, time elapsed: 0.2017519474029541, current time: 7:25:39
count: 1925, time elapsed: 0.19332075119018555, current time: 7:25:39
count: 1926, time elapsed: 0.161423921585083, current time: 7:25:39
count: 1927, time elapsed: 0.16120696067810059, current time: 7:25:40
count: 1928, time elapse

count: 2032, time elapsed: 0.17225384712219238, current time: 7:25:58
count: 2033, time elapsed: 0.17599797248840332, current time: 7:25:58
count: 2034, time elapsed: 0.1931169033050537, current time: 7:25:58
count: 2035, time elapsed: 0.17323899269104004, current time: 7:25:59
count: 2036, time elapsed: 0.18654990196228027, current time: 7:25:59
count: 2037, time elapsed: 0.19088387489318848, current time: 7:25:59
count: 2038, time elapsed: 0.18507790565490723, current time: 7:25:59
count: 2039, time elapsed: 0.18239808082580566, current time: 7:25:59
count: 2040, time elapsed: 0.15967059135437012, current time: 7:25:59
count: 2041, time elapsed: 0.1970658302307129, current time: 7:26:0
count: 2042, time elapsed: 0.16406989097595215, current time: 7:26:0
count: 2043, time elapsed: 0.16477394104003906, current time: 7:26:0
count: 2044, time elapsed: 0.18007397651672363, current time: 7:26:0
count: 2045, time elapsed: 0.15897083282470703, current time: 7:26:0
count: 2046, time elapsed: 

count: 2152, time elapsed: 0.16472697257995605, current time: 7:26:19
count: 2153, time elapsed: 0.19032788276672363, current time: 7:26:20
count: 2154, time elapsed: 0.1660633087158203, current time: 7:26:20
count: 2155, time elapsed: 0.20095586776733398, current time: 7:26:20
count: 2156, time elapsed: 0.17760109901428223, current time: 7:26:20
count: 2157, time elapsed: 0.17643976211547852, current time: 7:26:20
count: 2158, time elapsed: 0.1613008975982666, current time: 7:26:20
count: 2159, time elapsed: 0.20997929573059082, current time: 7:26:21
count: 2160, time elapsed: 0.177994966506958, current time: 7:26:21
count: 2161, time elapsed: 0.17614102363586426, current time: 7:26:21
count: 2162, time elapsed: 0.16861915588378906, current time: 7:26:21
count: 2163, time elapsed: 0.17066693305969238, current time: 7:26:21
count: 2164, time elapsed: 0.2077341079711914, current time: 7:26:22
count: 2165, time elapsed: 0.16549301147460938, current time: 7:26:22
count: 2166, time elapsed

count: 2270, time elapsed: 0.17571115493774414, current time: 7:26:41
count: 2271, time elapsed: 0.21731090545654297, current time: 7:26:41
count: 2272, time elapsed: 0.16182494163513184, current time: 7:26:42
count: 2273, time elapsed: 0.17721295356750488, current time: 7:26:42
count: 2274, time elapsed: 0.1832749843597412, current time: 7:26:42
count: 2275, time elapsed: 0.17525506019592285, current time: 7:26:42
count: 2276, time elapsed: 0.17358112335205078, current time: 7:26:42
count: 2277, time elapsed: 0.17379999160766602, current time: 7:26:42
count: 2278, time elapsed: 0.15820693969726562, current time: 7:26:43
count: 2279, time elapsed: 0.17076802253723145, current time: 7:26:43
count: 2280, time elapsed: 0.17087793350219727, current time: 7:26:43
count: 2281, time elapsed: 0.17256784439086914, current time: 7:26:43
count: 2282, time elapsed: 0.1593151092529297, current time: 7:26:43
count: 2283, time elapsed: 0.17234110832214355, current time: 7:26:43
count: 2284, time elap

__Print first line of 'tweet_json.txt' to check that the above worked__

In [16]:
# print first line of 'tweet_json.txt' to check that the above worked
# with open('tweet_json.txt') as jsonFile:
#     line = jsonFile.readline()
#     tweet = json.loads(line)
#     pp.pprint(tweet)

**Add the tweet data to a dataframe**

In [49]:
# create a local dataframe for storing tweet data
tweetInfo = pd.DataFrame(columns = ['tweetID', 'retweet_count', 'favorite_count'])

# store tweet data to the dataframe
with open('tweet_json.txt') as jsonFile:
    count = 0
    start = time.time()
    for line in jsonFile:
        count = count + 1
        tweet = json.loads(line)
        tweetInfo = tweetInfo.append({
            'tweetID': tweet['id'],
            'retweet_count': tweet['retweet_count'],
            'favorite_count': tweet['favorite_count']
        }, ignore_index=True)
        end = time.time()
        if (np.remainder(count, 200) == 0):
            currTime = str(time.localtime().tm_hour) + ':' + str(time.localtime().tm_min) + ':' + str(time.localtime().tm_sec)
            print('count: ' + str(count) + ', time elapsed: ' + str(end - start) + ', current time: ' + currTime)
        
# # add a single tweet's data to the dataframe
# tweetInfo = tweetInfo.append({
#     'tweetID': tweet['id'],
#     'retweet_count': tweet['favorite_count'],
#     'favorite_count': tweet['retweet_count']
# },ignore_index=True)

#tweetInfo.head()

count: 200, time elapsed: 0.410053014755249, current time: 7:40:16
count: 400, time elapsed: 0.6894791126251221, current time: 7:40:16
count: 600, time elapsed: 1.0328001976013184, current time: 7:40:16
count: 800, time elapsed: 1.326280117034912, current time: 7:40:17
count: 1000, time elapsed: 1.616948127746582, current time: 7:40:17
count: 1200, time elapsed: 1.8989322185516357, current time: 7:40:17
count: 1400, time elapsed: 2.2031588554382324, current time: 7:40:18
count: 1600, time elapsed: 2.482089042663574, current time: 7:40:18
count: 1800, time elapsed: 2.802114963531494, current time: 7:40:18
count: 2000, time elapsed: 3.091928005218506, current time: 7:40:18
count: 2200, time elapsed: 3.3927671909332275, current time: 7:40:19


**Here are two simple changes to make, in order to repeat the above actions for every tweet:**
* Loop through all tweet ID's and retrieve / store their JSON info to a new line of 'tweet_json.txt'
* Loop through each line of 'tweet_json.txt', retrieve the tweet data of interest, and append it to the dataframe
* ___NOTE:___ watch out for deleted tweets and/or missing tweet data. Use try-except blocks as appropriate

In [51]:
#tweetInfo.head()
len(tweetInfo)

2335