In [1]:
import pandas as pd
import numpy as np
import requests
import tweepy
from tweepy import OAuthHandler
import json
import os
from timeit import default_timer as timer
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Gathering

In [2]:
# Twitter API credentials are read from environment variables so that no
# secret is stored in the notebook itself.
_env = os.environ
ck = _env.get('CONSUMER_KEY')            # API key
csk = _env.get('CONSUMER_SECRET')        # API secret key
oat = _env.get('ACCESS_TOKEN')           # access token
oats = _env.get('ACCESS_TOKEN_SECRET')   # access token secret

# Build the OAuth handler, attach the access token pair, and create the API
# client that waits when the rate limit is hit (wait_on_rate_limit=True).
auth = OAuthHandler(ck, csk)
auth.set_access_token(oat, oats)
api = tweepy.API(auth, wait_on_rate_limit=True)
print(api)

<tweepy.api.API object at 0x0AD86F70>


In [3]:
# Load the enhanced Twitter archive and preview its first two rows.
archive = pd.read_csv(filepath_or_buffer="twitter-archive-enhanced.csv")
archive.head(n=2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


Imported image prediction data

In [4]:
# Download the image-prediction file and store it next to the notebook.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# Fail loudly on an HTTP error instead of silently saving an error page
# under the .tsv file name.
response.raise_for_status()
with open("image-predictions.tsv", mode='wb') as file:
    file.write(response.content)


In [5]:
# The predictions file is tab-separated; load it and summarise its columns.
image_predictions = pd.read_csv("image-predictions.tsv", sep='\t')
image_predictions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 119.6+ KB


Retrieving Retweet Count and Favourite Count

# One-off call: fetch a single status in extended mode and show its full text.
# NOTE(review): looks like a manual check of the API call used in the loop below - confirm it is still needed.
api.get_status(1078824826243088384, tweet_mode='extended')._json['full_text']

# Query the API once per tweet id in the archive and write each tweet's JSON
# on its own line of tweet_json.txt. Ids whose lookup fails are collected in
# missing_tweets (tweet id -> raised exception). The elapsed time is printed.
start = timer()
missing_tweets = {}
with open('tweet_json.txt', mode='w') as file:
    for tweet_id in archive.tweet_id:
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')._json
        except tweepy.TweepError as e:
            # Lookup failed: report which id it was, remember it, and move on.
            print("Fail:", tweet_id)
            missing_tweets[tweet_id] = e
            continue
        # Only the API call is guarded; a file-write problem should surface.
        json.dump(tweet, file)
        file.write('\n')

end = timer()
print(end - start)

# Persist the ids that could not be fetched, one id per line.
with open('missing_tweets.txt', mode='w') as file:
    file.writelines(str(failed_id) + '\n' for failed_id in missing_tweets)
    

Reading tweet_json

In [6]:
# Read tweet_json.txt (one JSON document per line) and keep only the id,
# favorite count and retweet count of every tweet.
with open('tweet_json.txt', mode='r') as file:
    df_list = [
        {'tweet_id': record['id'],
         'favorite_count': record['favorite_count'],
         'retweet_count': record['retweet_count']}
        for record in map(json.loads, file)
    ]

api_tweets = pd.DataFrame(df_list, columns=['tweet_id', 'favorite_count', 'retweet_count'])
api_tweets.head()

Unnamed: 0,tweet_id,favorite_count,retweet_count
0,892420643555336193,38043,8320
1,892177421306343426,32665,6148
2,891815181378084864,24579,4068
3,891689557279858688,41397,8470
4,891327558926688256,39572,9163


## Assessing

In [7]:
# Visual assessment: first rows of the archive table.
archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


In [8]:
# Programmatic assessment: column dtypes and non-null counts of the archive.
archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [9]:
# Look up a single tweet in the archive by its id.
archive.loc[archive['tweet_id'].eq(888202515573088257)]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
19,888202515573088257,,,2017-07-21 01:02:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: This is Canela. She attempted s...,8.87474e+17,4196984000.0,2017-07-19 00:47:34 +0000,https://twitter.com/dog_rates/status/887473957...,13,10,Canela,,,,


In [10]:
# Check whether the same tweet id is present in the data gathered from the API.
api_tweets.loc[api_tweets['tweet_id'].eq(888202515573088257)]

NameError: name 'df' is not defined

In [None]:
# Column dtypes and non-null counts of the archive (same call as earlier in this section).
archive.info()

In [None]:
# Summary statistics of the numeric archive columns.
archive.describe()

In [None]:
# Tweets whose rating has a zero denominator or a zero numerator.
# Both sides of the OR must be boolean masks; the original OR-ed the mask with
# the raw numerator column, and its trailing ';' hid the result.
archive[(archive["rating_denominator"] == 0) | (archive["rating_numerator"] == 0)]

In [None]:
# Number of rows whose pupper / doggo / puppo column differs from the string 'None'.
tuple(len(archive[archive[stage] != 'None']) for stage in ("pupper", "doggo", "puppo"))

In [None]:
# Random sample of rows for visual assessment. The seed makes the sample
# reproducible, and the trailing ';' is dropped so the rows are displayed.
archive.sample(25, random_state=42)

In [None]:
# Tweets whose text does not start with "This".
# str.startswith yields a boolean Series that filters rows; str.extract
# returns a DataFrame by default, which does not work as a row mask.
archive[~archive.text.str.startswith('This')]

In [None]:
# Retweets: tweets whose text starts with "RT".
# str.startswith yields a boolean Series that filters rows; str.extract
# returns a DataFrame by default, which does not work as a row mask.
archive[archive.text.str.startswith('RT')]

### Image_predictions

In [None]:
# Visual assessment: first rows of the image predictions table.
image_predictions.head()

In [None]:
# Random sample of predictions for visual assessment; seeded so a re-run
# shows the same rows.
image_predictions.sample(20, random_state=42)

In [None]:
# Rows whose top prediction (p1) ends with "retriever".
# str.endswith yields a boolean Series that filters rows; str.extract
# returns a DataFrame by default, which does not work as a row mask.
image_predictions[image_predictions["p1"].str.endswith('retriever')]

In [None]:
# Column dtypes and non-null counts of the image predictions table.
image_predictions.info();

In [None]:
# Summary statistics of the numeric prediction columns. The trailing ';' is
# dropped because it suppressed the only output of this cell.
image_predictions.describe()

In [None]:
# Rows where none of the three predictions is flagged as a dog.
any_dog = image_predictions[["p1_dog", "p2_dog", "p3_dog"]].any(axis=1)
image_predictions[~any_dog]

In [None]:
# Rows whose img_num is anything other than 1.
image_predictions.loc[image_predictions["img_num"].ne(1)]

### Api_tweets

In [None]:
# Column dtypes and non-null counts of the API data.
api_tweets.info()

In [None]:
# First rows of the API data.
api_tweets.head()

In [None]:
# Summary statistics of the numeric columns in the API data.
api_tweets.describe()

### Quality

#### archive table
- timestamp is not datetime
- archive contains tweets that were deleted and therefore would not have full info on those tweets
- min denominator rating of 0, id 835246439529840640, this was actually a corrective tweet
- Drop columns we have no intentions of using 
- Drop RT's
- Replace NaN with None in in_reply_to_user_id
- 891087950875897856 Missing dog name, (Marlo)
- 885518971528720385 Missing dog name, (Howard)
- Dog stage columns should be 1 for whether a dog of that stage is present in the tweet and 0 otherwise


#### image_predictions
- upper and lower case references to object names (p1,p2,p3)
- drop columns we don't intend to use


### Tidiness

#### image_predictions
- Data should be in one table
- Duplicated columns such as text and source across the archive and image_predictions tables

## Cleaning

In [None]:
# Clean on copies so the three gathered tables stay available unchanged.
archive2, image_predictions2, api_tweets2 = (
    table.copy() for table in (archive, image_predictions, api_tweets)
)

### Missing Values

**Define**

Use fillna to fill NaN values in in_reply_to_user_id with None

**Code**

In [None]:
# Replace missing in_reply_to_user_id values with the string 'None'.
# The filled column is assigned back explicitly: fillna(inplace=True) on a
# selected column is not guaranteed to write through to the parent frame.
# The column is cast to object first because it will hold numbers and strings.
archive2["in_reply_to_user_id"] = archive2["in_reply_to_user_id"].astype(object).fillna('None')

**Testing**

In [None]:
# Check the non-null count of in_reply_to_user_id after the fill.
archive2.info()

**Define**

Replace None in Tweet "891087950875897856" name with Marlo

**Code**

In [None]:
# Current value of the name column for this tweet, selected in a single .loc step.
archive2.loc[archive2["tweet_id"].eq(891087950875897856), "name"]

In [None]:
# Set the missing dog name, selecting the row by tweet id rather than by a
# hard-coded row label that would point elsewhere if the rows were reordered.
archive2.loc[archive2["tweet_id"] == 891087950875897856, "name"] = "Marlo"

**Testing**

In [None]:
# Show the row for this tweet id to confirm the name was updated.
archive2.loc[archive2["tweet_id"].eq(891087950875897856)]

**Define**

Replace None in Tweet "885518971528720385" name with Howard

In [None]:
# Current value of the name column for this tweet, selected in a single .loc step.
archive2.loc[archive2["tweet_id"].eq(885518971528720385), "name"]

**Code**

In [None]:
# Set the missing dog name, selecting the row by tweet id rather than by a
# hard-coded row label that would point elsewhere if the rows were reordered.
archive2.loc[archive2["tweet_id"] == 885518971528720385, "name"] = "Howard"

**Testing**

In [None]:
# Show the row for this tweet id to confirm the name was updated.
archive2.loc[archive2["tweet_id"].eq(885518971528720385)]

### Tidiness

**Define**

Join all 3 tables on tweet_id

**Code**

In [None]:
# Inner-join the three working copies on tweet_id, so only tweets present in
# all three tables are kept. The image predictions copy (image_predictions2)
# is used for consistency with the other two tables.
master = (
    archive2
    .merge(image_predictions2, on='tweet_id', how='inner')
    .merge(api_tweets2, on='tweet_id', how='inner')
)

**Test**

In [None]:
# First rows of the merged table.
master.head()

In [None]:
# Columns, dtypes and non-null counts of the merged table.
master.info()

In [None]:
# Count how often each column name occurs; a count above 1 would mean a duplicated column.
master.columns.value_counts()

So we also fixed the issue of duplicated columns by merging

## Quality

**Define**

Use pandas drop to remove unwanted columns

**Code**

In [None]:
# Remove the columns that are not used in the analysis.
unwanted_columns = [
    'retweeted_status_timestamp',
    'retweeted_status_id',
    'retweeted_status_user_id',
    'expanded_urls',
    'in_reply_to_status_id',
    'jpg_url',
    'source',
]
master = master.drop(columns=unwanted_columns)

**Test**

In [None]:
# Remaining columns after the drop.
master.columns

**Define**

Use RegEx to find and drop all retweets

**Code**

In [None]:
# Row count before the retweets are removed.
master.info()

In [None]:
# Drop retweets, i.e. tweets whose text starts with "RT".
# str.startswith yields a boolean Series that filters rows. str.extract
# returns a DataFrame by default, and indexing with that mask blanks out
# values instead of removing the retweet rows.
master = master[~master.text.str.startswith('RT')]

**Test**

In [None]:
# Row count after the retweets are removed.
master.info()

In [None]:
# Should display no rows: any tweet still starting with "RT" is a retweet
# that was not removed. str.startswith gives the boolean row mask needed here.
master[master.text.str.startswith('RT')]

**Define**

Fix the tweet with a denominator of 0 by inputting the correct denominator

**Test**

In [None]:
# Look for the tweet that had a zero denominator in the merged table.
master.loc[master.tweet_id.eq(835246439529840640)]

In [None]:
# Summary statistics of the numeric columns, including the min of rating_denominator.
master.describe()

**Define**

Truncate the trailing " +0000" offset and apply the datetime.strptime function

**Code**

In [None]:
# Inspect the raw timestamp values before converting them.
master.timestamp

In [None]:
# Remove the trailing " +0000" offset from every timestamp string.
# Matching the suffix explicitly (instead of slicing off the last six
# characters) makes the cell safe to re-run: a second run changes nothing.
master.timestamp = master.timestamp.str.replace(r' \+0000$', '', regex=True)

In [None]:
# Number of timestamps that are exactly 19 characters long ("YYYY-MM-DD HH:MM:SS").
has_expected_length = master.timestamp.str.len() == 19
int(has_expected_length.sum())

In [None]:
# Convert the timestamp strings to datetimes with the vectorised
# pd.to_datetime instead of a per-row strptime call. The explicit format
# keeps parsing strict, and re-running on already-converted values is harmless.
master.timestamp = pd.to_datetime(master.timestamp, format="%Y-%m-%d %H:%M:%S")

**Testing**

In [None]:
# Confirm the dtype of the timestamp column after the conversion.
master.info()

**Define**

Use the .eq and mul functions to assign 1 for presence of dog stage in the photo and 0 otherwise

**Code**

In [None]:
#https://stackoverflow.com/questions/40901770/is-there-a-simple-way-to-change-a-column-of-yes-no-to-1-0-in-a-pandas-dataframe
# Each stage column becomes 1 where it holds its own stage name and 0 otherwise.
for stage in ('doggo', 'puppo', 'pupper', 'floofer'):
    master[stage] = master[stage].eq(stage).mul(1)

**Test**

In [None]:
# Inspect the four dog-stage columns after the 1/0 encoding.
master[['doggo','puppo','floofer','pupper']]

**Define**

Use str.lower to put p1,p2,p3 to lower case

**Code**

In [None]:
# Lower-case the three prediction label columns so their casing is consistent.
for prediction_col in ('p1', 'p2', 'p3'):
    master[prediction_col] = master[prediction_col].str.lower()

**Test**

In [None]:
# Inspect the prediction labels after lower-casing.
master[['p1','p2','p3']]

In [None]:
# First rows of the cleaned table.
master.head()

In [None]:
# Columns, dtypes and non-null counts of the cleaned table.
master.info()

### Storage

In [None]:
# Store the cleaned table as CSV: no index column, and datetimes written
# as "YYYY-MM-DD HH:MM:SS".
master.to_csv(
    'twitter_archive_master.csv',
    index=False,
    encoding='utf-8',
    date_format='%Y-%m-%d %H:%M:%S',
)

In [None]:
# Summary of the table that was just written to disk.
master.info()

In [None]:
# Display the cleaned table.
master

In [None]:
# Reload the stored master file for analysis. No date parsing is requested
# here, so timestamp is converted separately before the hourly plots.
df = pd.read_csv("twitter_archive_master.csv")


In [None]:
# Columns, dtypes and non-null counts of the reloaded table.
df.info()

In [None]:
# First rows of the reloaded table.
df.head()

## Analysis and Visualization

In [None]:
# Summary statistics used for the insights below (e.g. max favorite_count, mean retweet_count).
df.describe()

In [None]:
# Tweet(s) with the highest favorite count. The maximum is computed from the
# data instead of being hard-coded, so the cell still works after the counts
# are gathered again.
df[df.favorite_count == df.favorite_count.max()]

In [None]:
# Column totals of the doggo, puppo, pupper and floofer columns, in that order.
tuple(sum(df[stage]) for stage in ('doggo', 'puppo', 'pupper', 'floofer'))

**Insights**

 Tweet 744234799360020481 has received the most likes - 164354

Considering tweets up to August 1st 2017, each tweet containing a photo has received 2653.84 retweets on average

Puppers are the most recorded dog stage

**Visualization**

In [None]:
# Convert the timestamp strings read from the CSV to datetimes with the
# vectorised pd.to_datetime instead of a per-row strptime call. The explicit
# format keeps parsing strict, and re-running on converted values is harmless.
df.timestamp = pd.to_datetime(df.timestamp, format="%Y-%m-%d %H:%M:%S")

In [None]:
# Overwrite timestamp with the hour of day taken from each datetime; the
# grouping cells that follow group on this column.
# NOTE(review): running this cell a second time would presumably fail, because the column no longer holds datetimes.
df.timestamp = df.timestamp.dt.hour

In [None]:
# Per-hour totals of the numeric columns. numeric_only=True keeps the text
# columns (tweet text, names, prediction labels) out of the sum.
by_hour = df.groupby("timestamp").sum(numeric_only=True)


In [None]:
# Number of tweets per value of the timestamp column.
no_of_tweets = df.groupby("timestamp").size()

In [None]:
# Per-hour averages of the numeric columns. numeric_only=True is required on
# current pandas, where averaging the text columns raises a TypeError.
means = df.groupby("timestamp").mean(numeric_only=True)
means["favorite_count"]

In [None]:
# Two side-by-side bar charts per hour of day: number of tweets (left) and
# total likes (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.bar(by_hour.index, no_of_tweets, color='#1DA1F2')
# The right panel is titled 'Likes', so it plots favorite_count; the original
# plotted retweet_count under that title.
ax2.bar(by_hour.index, by_hour['favorite_count'], color='blue', alpha=0.2)

# Shared minimal styling: only the bottom spine, no tick marks, no y labels.
for ax, title in ((ax1, 'Tweets'), (ax2, 'Likes')):
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)
    ax.spines['bottom'].set_linewidth(0.6)
    # tick_params expects booleans; the strings 'on'/'off' are no longer
    # interpreted by matplotlib, so 'off' would leave the ticks visible.
    ax.tick_params(left=False, bottom=False, labelleft=False)
    ax.set_title(title)
    ax.set_xlabel('Hour of day')

plt.show()

Here we see that the distribution of likes generally follows the distribution of tweets. This shows that WeRateDogs follows a strict posting schedule between 3:00 pm and 5:00 am. Perhaps this is the schedule that they believe captures peak times.