# Wrangling and Analyzing WeRateDogs Twitter Dataset

## 1. Gathering Data

- Importing relevant modules

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import os
import requests
import tweepy
import json
from decouple import config

%matplotlib inline

### 1.1. WeRateDogs Twitter archive

In [3]:
# Load the enhanced WeRateDogs archive from the CSV file in the working directory.
archive_csv = 'twitter-archive-enhanced.csv'
twitter_archive = pd.read_csv(filepath_or_buffer=archive_csv)

# A bare trailing expression lets the notebook render the frame.
twitter_archive

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,


### 1.2. The tweet image predictions
- Downloading a file of image predictions, produced by running every image in the Twitter archive through a neural network that can classify dog breeds

In [18]:
# Request the image-predictions TSV file over HTTP.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Stop here on a 4xx/5xx reply so that an error page is never saved to disk
# as if it were the TSV payload.
response.raise_for_status()

# Display the status code as the cell output.
response.status_code

200

In [7]:
# The local file name is whatever follows the last '/' in the URL.
file_name = url.rpartition('/')[2]

# Write the downloaded bytes only when no local copy is present yet.
already_downloaded = os.path.isfile(file_name)
if not already_downloaded:
    with open(file_name, mode='wb') as tsv_file:
        tsv_file.write(response.content)

In [8]:
# Parse the tab-separated predictions file; read_table uses a tab
# delimiter by default.
image_predictions = pd.read_table('image-predictions.tsv')

# Render the frame as the cell output.
image_predictions

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


### 1.3. Querying Twitter Data
- Using the tweet IDs to store each tweet's entire set of JSON data in the `tweet_json.txt` file
- Each tweet's JSON data should be written to its own line

In [9]:
# Look up each Twitter API credential by name through decouple's config(),
# so no secret value is written into the notebook itself.
(
    api_key,
    api_key_secret,
    bearer_token,
    access_token,
    access_token_secret,
) = (
    config(setting_name)
    for setting_name in (
        'API_KEY',
        'API_KEY_SECRET',
        'BEARER_TOKEN',
        'ACCESS_TOKEN',
        'ACCESS_TOKEN_SECRET',
    )
)

In [10]:
# Build an OAuth 1.0a user-context handler from the credentials loaded above.
auth = tweepy.OAuth1UserHandler(
    consumer_key=api_key,
    consumer_secret=api_key_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)

# The notebook later requests one status per archived tweet ID (thousands of
# calls). With wait_on_rate_limit=True tweepy pauses until the rate-limit
# window resets instead of raising, so those requests are not lost to
# rate-limit errors.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [12]:
# Collect every tweet ID from the archive into a plain Python list.
# tolist() replaces the manual append loop, whose loop variable `id`
# shadowed the built-in id() at notebook scope.
tweet_ids = twitter_archive.tweet_id.tolist()

# Display how many IDs will be queried.
len(tweet_ids)

2356

### Writing and Reading Twitter JSON

In [12]:
# Query the API once per tweet ID and store each tweet's JSON on its own line
# of tweet_json.txt. IDs that cannot be fetched are reported and skipped.
with open('tweet_json.txt', 'w') as outfile:
    # enumerate() yields the 1-based position without searching the list;
    # list.index() would rescan it for every ID and return the first match
    # for a duplicated ID.
    for ranking, tweet_id in enumerate(tweet_ids, start=1):
        # Progress indicator: position of the ID about to be queried.
        print(ranking)
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
        except tweepy.TweepyException as e:
            # Report the failing position and ID, then move on to the next one.
            print(f"{ranking}_{tweet_id}: {e}")
            continue
        # One JSON document per line, so the file can be read back line by line.
        json.dump(tweet._json, outfile)
        outfile.write('\n')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
20_888202515573088257: 404 Not Found
144 - No status found with that ID.
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
96_873697596434513921: 404 Not Found
144 - No status found with that ID.
97
98
99
100
101
102
102_872668790621863937: 404 Not Found
144 - No status found with that ID.
103
104
105
105_872261713294495745: 404 Not Found
144 - No status found with that ID.
106
107
108
109
110
111
112
113
114
115
116
117
118
119
119_869988702071779329: 404 Not Found
144 - No status found with that ID.
120
121
122
123
124
125
126
127
128
129
130
131
132
133
133_866816280283807744: 404 Not Found
144 - No status found with that ID.
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
156_861769973181624320: 404 Not Found
144 

- Reading the `.txt` file into a pandas dataframe

In [13]:
# Each line of tweet_json.txt holds one tweet's JSON document. Parse every
# line and keep only the fields needed for the analysis.
with open('tweet_json.txt', 'r') as json_file:
    api_data = [
        {
            'tweet_id': status['id'],
            'retweet_count': status['retweet_count'],
            'likes': status['favorite_count'],
            'tweet_text': status['full_text'],
        }
        for status in map(json.loads, json_file)
    ]

# Build the frame once from the list of records and preview the first rows.
df3 = pd.DataFrame(api_data)
df3.head()

Unnamed: 0,tweet_id,retweet_count,likes,tweet_text
0,892420643555336193,7015,33836,This is Phineas. He's a mystical boy. Only eve...
1,892177421306343426,5302,29351,This is Tilly. She's just checking pup on you....
2,891815181378084864,3483,22071,This is Archie. He is a rare Norwegian Pouncin...
3,891689557279858688,7229,36962,This is Darla. She commenced a snooze mid meal...
4,891327558926688256,7768,35299,This is Franklin. He would like you to stop ca...


## 2. Assessing Data