In [1]:
import pandas as pd
import os
import requests
import numpy as np
import json

# Gathering

Here I will read in the data from three sources.

- The Twitter archive of the tweets (`twitter-archive-enhanced.csv`)
- A Twitter API call on each tweet to get its favorite count and retweet count
- The image predictions file, downloaded from the internet using the requests library

In [2]:
import tweepy

# Read the four Twitter API credentials from a local config file so they never
# appear in the notebook itself. Expected layout, one value per line:
#   1. consumer key   2. consumer secret   3. access token   4. access secret
with open('config_file.txt') as file:
    lines = file.readlines()

if len(lines) < 4:
    raise ValueError('config_file.txt must contain four lines: consumer key, '
                     'consumer secret, access token, access secret')

# strip() rather than strip('\n'): a stray space or carriage return left on a
# line would otherwise become part of the credential.
consumer_key = lines[0].strip()
consumer_secret = lines[1].strip()
access_token = lines[2].strip()
access_secret = lines[3].strip()

# Set up the API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# With wait_on_rate_limit the client sleeps when the rate limit is reached
# (see the "Rate limit reached. Sleeping for: ..." messages in the crawl
# output) instead of failing the request.
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [3]:
# Load the WeRateDogs Twitter archive (one row per tweet) from the local CSV
archive_csv = 'twitter-archive-enhanced.csv'
tweets = pd.read_csv(archive_csv)

In [4]:
# Grab the image predictions
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error so that an error page
# is never saved to disk as if it were the predictions data.
images = requests.get(url, timeout = 30)
images.raise_for_status()

In [5]:
# Save the downloaded image predictions to disk. The content is tab-separated
# even though the file name ends in .csv; the name is kept because a later
# cell reads 'images.csv'.
# Mode 'w' (not 'x') lets the cell be re-run: 'x' raises FileExistsError as
# soon as the file exists. The explicit utf-8 encoding matches the default
# that pandas.read_csv uses when the file is read back.
with open('images.csv', 'w', encoding = 'utf-8') as file:
    file.write(images.text)

In [5]:
# Parse the tab-separated image predictions saved in 'images.csv'
images_df = pd.read_csv('images.csv', delimiter = '\t')

In [7]:
# Path of the text file holding the API results: the crawl cell writes one
# JSON-encoded tweet per line to it, and a later cell reads it back.
file_name = 'tweet_json.txt'

In [7]:
# Query the Twitter API for every tweet in the archive.
#   - file_name receives one JSON document per line for each tweet fetched.
#   - 'erros.csv' (spelling kept: a later cell reads this exact name) receives
#     a 'tweet_id,deleted' row for each tweet whose lookup failed.
errors_path = 'erros.csv'

# The crawl is slow (the API client sleeps whenever the rate limit is hit), so
# a completed earlier run is reused instead of repeated. This also lets the
# notebook be re-run top to bottom without a FileExistsError.
if not (os.path.exists(file_name) and os.path.exists(errors_path)):
    # Write to temporary files and rename them only after the loop finishes,
    # so an interrupted crawl is never mistaken for a complete one.
    tmp_json = file_name + '.tmp'
    tmp_errors = errors_path + '.tmp'

    # Both files are managed by the with-statement and are therefore closed
    # even if the loop is interrupted.
    with open(tmp_errors, 'w') as errors, open(tmp_json, 'w') as outfile:
        errors.write('tweet_id,deleted\n')

        for tweet in tweets['tweet_id']:
            try:
                tweet_json = api.get_status(tweet)._json
                # Serialize first so a failure cannot leave a partial line.
                outfile.write(json.dumps(tweet_json) + '\n')
            except Exception:
                # Best effort: any failed lookup is logged as a deleted tweet.
                # NOTE(review): this also labels non-deletion failures (e.g.
                # network errors) as deleted -- confirm that is acceptable.
                print(tweet)
                errors.write(str(tweet) + ',y\n')

    os.replace(tmp_json, file_name)
    os.replace(tmp_errors, errors_path)
            


888202515573088257
873697596434513921
869988702071779329
867421006826221569
866816280283807744
861769973181624320
842892208864923648
827228250799742977
802247111496568832
775096608509886464
Rate limit reached. Sleeping for: 443
Rate limit reached. Sleeping for: 437


In [8]:
def _tweet_record(raw_line):
    """Turn one JSON line of the tweet file into a flat record.

    Keeps the tweet id (as the string 'id_str'), the favorite count and the
    retweet count, and marks the tweet as not deleted ('n').
    """
    status = json.loads(raw_line)
    return {'tweet_id': status['id_str'],
            'favorite_count': status['favorite_count'],
            'retweet_count': status['retweet_count'],
            'deleted' : 'n'}


# One record per line of the file written by the crawl cell
with open(file_name) as infile:
    tweet_list = [_tweet_record(raw_line) for raw_line in infile]

json_columns = ['tweet_id', 'favorite_count', 'retweet_count', 'deleted']
json_df = pd.DataFrame(tweet_list, columns = json_columns)

In [9]:
# Load the log of tweets whose API lookup failed (columns: tweet_id, deleted)
errors_csv = 'erros.csv'
error_df = pd.read_csv(errors_csv)

In [10]:
# Preview the first rows of the failed-lookup log
error_df.head()

Unnamed: 0,tweet_id,deleted
0,888202515573088257,y
1,873697596434513921,y
2,869988702071779329,y
3,867421006826221569,y
4,866816280283807744,y


In [12]:
# Append the failed lookups to the API results so every archive tweet has a
# row; the appended rows have no favorite/retweet counts.
#   - tweet_id is cast to str first: json_df stores ids as strings ('id_str')
#     while read_csv parses the ids in error_df as integers, and concatenating
#     them unchanged would leave a column mixing both types.
#   - drop_duplicates keeps the cell idempotent: json_df is overwritten here,
#     so re-running the cell would otherwise append the same rows again.
json_df = (
    pd.concat([json_df, error_df.astype({'tweet_id': str})], ignore_index = True)
      .drop_duplicates(subset = 'tweet_id')
      .reset_index(drop = True)
)

# Assessing

## Tidiness

- There are three data frames (tweets, json_df, images_df); only two are needed
- Dog categories are spread across separate columns

## Quality

#### 'tweets' dataframe

- Some of the tweets have been deleted
- Datatypes are incorrect (tweet_id, timestamp, dog types)

#### 'json_df' dataframe

- Datatypes are incorrect (favorite, retweet count)

#### 'images_df'

- Some tweets missing




In [22]:
# Inspect column dtypes and non-null counts of the image predictions
images_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
