In [1]:
# for data wrangling and sampling
import pandas as pd
import numpy as np
import random

# Set the random seed to assure the same answers are returned each time 
random.seed(42)

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

# for (potential) regression modeling of data
import statsmodels.api as sm;
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Read in a Twitter archive (manual / already provided)

In [2]:
df_archive = pd.read_csv("twitter-archive-enhanced.csv")
df_archive.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


### Download tweet image predictions (programmatically, from a url)

In [3]:
# import requests and os libraries to download files programmatically, then save them locally
import requests
import os

# get file from a url
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

# get the current working directory
folder_name = os.getcwd()

# get the filename
file_name = url.split('/')[-1]

# save the retrieved file to local storage
with open(os.path.join(folder_name,
                      file_name), mode='wb') as file:
    file.write(response.content)

# read in the downloaded file
df_images = pd.read_csv(file_name, sep='\t')
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


NOTE:
* The response variable is in bytes format, not text format.
* As such, the 'wb' flag is used when writing the file locally
* [Link to a StackOverflow post](https://stackoverflow.com/questions/2665866/what-does-wb-mean-in-this-code-using-python) on the subject<br><br>

Template code for future reference:

In [None]:
# template code to make a directory if it doesn't already exist
#folder_name = 'my_new_folder'
#if not os.path.exists(folder_name):
#    os.makedirs(folder_name)

# command to list the current directory's contents
#os.listdir()

### Create an API object to gather Twitter data

In [4]:
import tweepy

# get the API Access Token and Acces Token Secret
from twAPI_tokens_GoodDoggo import API_KEY, API_KEY_SECRET, API_TOKEN, API_TOKEN_SECRET

CONSUMER_KEY = API_KEY
CONSUMER_SECRET = API_KEY_SECRET
ACCESS_TOKEN = API_TOKEN
ACCESS_SECRET = API_TOKEN_SECRET

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

api = tweepy.API(auth)


# # code snippet for getting around the twitter rate limit:
# api = tweepy.API(auth, 
#                  wait_on_rate_limit=True,
#                  wait_on_rate_limit_notify=True)

### Test using the API to get tweet info for the first tweet

* ___Retrieve json data for the first tweet and write it to local storage___
* [StackOverflow article](https://stackoverflow.com/questions/28384588/twitter-api-get-tweets-with-specific-id) on getting JSON data for a specific tweet
* [StackAbuse article](https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/) on reading and writing JSON to a file in Python

In [5]:
# test getting the text of a tweet
#tweet = api.get_status(df_archive['tweet_id'][0])

# data pretty printer - https://docs.python.org/2/library/pprint.html
import pprint as pp

# for json I/O and parsing
import json

# retrieve json data for the first tweet and write it to local storage
tweet_id = df_archive['tweet_id'][0]
with open('tweet_json.txt', mode = 'w') as textFile:
    status = api.get_status(tweet_id, tweet_mode='extended')
    jsonStr = json.dumps(status._json)
    textFile.write(jsonStr + '\n')
    
# # determine multiple tweet_id's and loop through them, retrieving and writing their json data to 'tweet_json.txt' 
# with open('tweet_json.txt', mode = 'w') as file:
#     for tweet_id in tweet_ids:
#         try:
#             status = api.get_status(tweet_id)
#             json_str = json.dumps(status._json)
#         except:
#            # tweet was deleted
#         file.write(json_str + '\n')

__Print first line of 'tweet_json.txt' to check that the above worked__

In [6]:
# print first line of 'tweet_json.txt' to check that the above worked
with open('tweet_json.txt') as jsonFile:
    line = jsonFile.readline()
    tweet = json.loads(line)
    pp.pprint(tweet)

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 01 16:23:56 +0000 2017',
 'display_text_range': [0, 85],
 'entities': {'hashtags': [],
              'media': [{'display_url': 'pic.twitter.com/MgUWQ76dJU',
                         'expanded_url': 'https://twitter.com/dog_rates/status/892420643555336193/photo/1',
                         'id': 892420639486877696,
                         'id_str': '892420639486877696',
                         'indices': [86, 109],
                         'media_url': 'http://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'media_url_https': 'https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg',
                         'sizes': {'large': {'h': 528,
                                             'resize': 'fit',
                                             'w': 540},
                                   'medium': {'h': 528,
                                              'resize': 'fit',
                                    

**Test adding the tweet's data to a dataframe**

In [7]:
# test adding the tweet's data to a dataframe

# create a local dataframe for storing tweet data
tweetInfo = pd.DataFrame(columns = ['tweetID', 'retweet_count', 'favorite_count'])

# add tweet data to the dataframe
tweetInfo = tweetInfo.append({
    'tweetID': tweet['id'],
    'retweet_count': tweet['favorite_count'],
    'favorite_count': tweet['retweet_count']
},ignore_index=True)

tweetInfo.head()

Unnamed: 0,tweetID,retweet_count,favorite_count
0,892420643555336193,37464,8159


**Here are two simple changes to make, in order to repeat the above actions for every tweet:**
* Loop through all tweet ID's and retrieve / store their JSON info to a new line of 'tweet_json.txt'
* Loop through each line of 'tweet_json.txt', retrieve the tweet data of interest, and append it to the dataframe
* ___NOTE:___ watch out for deleted tweets and/or missing tweet data. Use try-except blocks as appropriate