In [1]:
import pandas as pd
import os
import requests
import numpy as np
import json

# Gathering

Here I will read in the data from three sources.

- The twitter archive of the tweets
- A twitter api call on the tweets to get favorite count and retweet count
- The image file from the internet using the requests library

In [2]:
import tweepy

# Read in the config file with your personal keys, secrets, and tokens
with open('config_file.txt') as file:
    lines = file.readlines()
    consumer_key = lines[0].strip('\n')
    consumer_secret = lines[1].strip('\n')
    access_token = lines[2].strip('\n')
    access_secret = lines[3].strip('\n')
    
# Set up the API
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [3]:
# Read in twitter archive
tweets = pd.read_csv('twitter-archive-enhanced.csv')

In [4]:
# Grab the image predictions
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
images = requests.get(url)

In [5]:
# Create tsv of images
with open('images.csv', 'x') as file:
    file.write(images.text)

FileExistsError: [Errno 17] File exists: 'images.csv'

In [6]:
images_df = pd.read_csv('images.csv', sep = '\t')

In [7]:
file_name = 'tweet_json.txt'

In [7]:
# Create errors file
errors = open('erros.csv', 'x')
errors.write('tweet_id,deleted\n')

with open(file_name, 'x') as outfile:
    for tweet in tweets['tweet_id']:
        try:
            tweet_json = api.get_status(tweet)._json
            json.dump(tweet_json, outfile)
            outfile.write('\n')
        except Exception as e:
            print(tweet)
            errors.write(str(tweet) + ',y\n')
            
errors.close()
            


888202515573088257
873697596434513921
869988702071779329
867421006826221569
866816280283807744
861769973181624320
842892208864923648
827228250799742977
802247111496568832
775096608509886464
Rate limit reached. Sleeping for: 443
Rate limit reached. Sleeping for: 437


In [8]:
tweet_list = []

with open(file_name) as infile:
    for line in infile:
        data = json.loads(line)
        tweet_id = data['id_str']
        favorites = data['favorite_count']
        retweets = data['retweet_count']
        tweet_list.append({'tweet_id': tweet_id,
                           'favorite_count': favorites,
                           'retweet_count': retweets,
                           'deleted' : 'n'})
        
json_df = pd.DataFrame(tweet_list, columns = ['tweet_id', 'favorite_count', 'retweet_count', 'deleted'])

In [9]:
error_df = pd.read_csv('erros.csv')

In [10]:
error_df.head()

Unnamed: 0,tweet_id,deleted
0,888202515573088257,y
1,873697596434513921,y
2,869988702071779329,y
3,867421006826221569,y
4,866816280283807744,y


In [11]:
json_df = pd.concat([json_df, error_df], ignore_index = True)

# Assessing

## Tidiness

- Have three data frames (tweets, json_df, images_df). Only need two
- Dog categories are seperate columns

## Quality

#### 'tweets' dataframe

- Some of the tweets have been deleted
- Datatypes are incorrect (tweet_id, timestamp, dog types)
- Some numerators are 0
- Some denominators are not 10

#### 'json_df' dataframe

- Datatypes are incorrect (favorite, retweet count)

#### 'images_df'

- Some tweets missing
- Dog names have various formats
- Some aren't dogs




In [31]:
images_df[images_df.p1_dog == False]


Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
6,666051853826850816,https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg,1,box_turtle,0.933012,False,mud_turtle,4.588540e-02,False,terrapin,1.788530e-02,False
8,666057090499244032,https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg,1,shopping_cart,0.962465,False,shopping_basket,1.459380e-02,False,golden_retriever,7.958960e-03,True
17,666104133288665088,https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg,1,hen,0.965932,False,cock,3.391940e-02,False,partridge,5.206580e-05,False
18,666268910803644416,https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg,1,desktop_computer,0.086502,False,desk,8.554740e-02,False,bookcase,7.947970e-02,False
21,666293911632134144,https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg,1,three-toed_sloth,0.914671,False,otter,1.525000e-02,False,great_grey_owl,1.320720e-02,False
22,666337882303524864,https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg,1,ox,0.416669,False,Newfoundland,2.784070e-01,True,groenendael,1.026430e-01,True
25,666362758909284353,https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg,1,guinea_pig,0.996496,False,skunk,2.402450e-03,False,hamster,4.608630e-04,False
29,666411507551481857,https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg,1,coho,0.404640,False,barracouta,2.714850e-01,False,gar,1.899450e-01,False
33,666430724426358785,https://pbs.twimg.com/media/CT-jNYqW4AAPi2M.jpg,1,llama,0.505184,False,Irish_terrier,1.041090e-01,True,dingo,6.207120e-02,False
43,666776908487630848,https://pbs.twimg.com/media/CUDeDoWUYAAD-EM.jpg,1,seat_belt,0.375057,False,miniature_pinscher,1.671750e-01,True,Chihuahua,8.695060e-02,True


# Cleaning

### 'tweets' dataframe

- Change datatypes to correct ones (tweet_id to string, timestamp to datetime) to start

In [36]:
tweets.tweet_id = tweets.tweet_id.astype(str)

In [38]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null object
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(2), o

In [40]:
tweets.timestamp = pd.to_datetime(tweets.timestamp, infer_datetime_format = True)

In [42]:
tweets.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


## Tidiness

- Merge json_df and tweets dataframe on tweet_id

In [66]:
tweets_df = pd.merge(tweets, json_df, on='tweet_id')

In [44]:
tweets_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,deleted,favorite_count,retweet_count
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,,n,39152.0,8707.0
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,,n,33532.0,6388.0
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,,n,25276.0,4244.0
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,,n,42510.0,8805.0
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,,n,40672.0,9588.0


- Convert dog type columns (pupper, floofer, etc) into one categorical column

In [45]:
melt1 = pd.melt(tweets_df[['tweet_id', 'doggo', 'floofer', 'pupper', 'puppo']], id_vars = 'tweet_id', 
                value_vars = ['doggo', 'floofer', 'pupper', 'puppo'], var_name = 'dog_type')

In [57]:
melt1 = melt1[melt1.value != 'None']

In [62]:
melt1.drop('value', axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [67]:
tweets_df = tweets_df.merge(melt1, how = 'left', on = 'tweet_id')

In [68]:
tweets_df.head(20)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,...,rating_denominator,name,doggo,floofer,pupper,puppo,deleted,favorite_count,retweet_count,dog_type
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,...,10,Phineas,,,,,n,39152.0,8707.0,
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,...,10,Tilly,,,,,n,33532.0,6388.0,
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,...,10,Archie,,,,,n,25276.0,4244.0,
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,...,10,Darla,,,,,n,42510.0,8805.0,
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,...,10,Franklin,,,,,n,40672.0,9588.0,
5,891087950875897856,,,2017-07-29 00:08:17,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a majestic great white breaching ...,,,,https://twitter.com/dog_rates/status/891087950...,...,10,,,,,,n,20402.0,3180.0,
6,890971913173991426,,,2017-07-28 16:27:12,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jax. He enjoys ice cream so much he gets ...,,,,"https://gofundme.com/ydvmve-surgery-for-jax,ht...",...,10,Jax,,,,,n,11958.0,2118.0,
7,890729181411237888,,,2017-07-28 00:22:40,"<a href=""http://twitter.com/download/iphone"" r...",When you watch your owner call another dog a g...,,,,https://twitter.com/dog_rates/status/890729181...,...,10,,,,,,n,66163.0,19296.0,
8,890609185150312448,,,2017-07-27 16:25:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Zoey. She doesn't want to be one of th...,,,,https://twitter.com/dog_rates/status/890609185...,...,10,Zoey,,,,,n,28002.0,4341.0,
9,890240255349198849,,,2017-07-26 15:59:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,...,10,Cassie,doggo,,,,n,32217.0,7575.0,doggo


In [69]:
tweets_df.drop(['doggo','floofer','pupper','puppo'], axis = 1, inplace = True)

In [70]:
tweets_df.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,deleted,favorite_count,retweet_count,dog_type
0,892420643555336193,,,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,n,39152.0,8707.0,
1,892177421306343426,,,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,n,33532.0,6388.0,
2,891815181378084864,,,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,n,25276.0,4244.0,
3,891689557279858688,,,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,n,42510.0,8805.0,
4,891327558926688256,,,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,n,40672.0,9588.0,
