
# Twitter data analysis

## Table of Contents
<ul>
<li><a href="#intro">Introduction</a></li>
<li><a href="#gathering">Data gathering</a></li>
<li><a href="#Assessing">Assessing</a></li>
<li><a href="#Cleaning">Cleaning</a></li>
<li><a href="#conclusions">Conclusions</a></li>
</ul>

In [1]:
# imports 
import pandas as pd
import numpy as np
import requests
import tweepy
import json
import os

<a id='intro'></a>
## Introduction

> In this project, I analyze tweets from the [@WeRateDogs](https://twitter.com/dog_rates) Twitter account. WeRateDogs is a Twitter account with more than eight million followers that rates people's dogs and adds a humorous comment about each dog.
<br>
I will gather the data using the Twitter API.

<a id='gathering'></a>
## Data gathering

In [2]:
# Load the enhanced tweet archive from disk; tweet_id is used as the row index
df_csv = pd.read_csv(
    'twitter-archive-enhanced.csv',
    index_col='tweet_id',
)
# Preview the first record to confirm the file was parsed
df_csv.head(1)

Unnamed: 0_level_0,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


In [3]:
# Download image_predictions.tsv file
# A timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(
    'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv',
    timeout=30,
)
# Stop on an HTTP error (4xx/5xx) before touching the file, so an error page is
# never saved as if it were the TSV data and an existing good copy is not overwritten.
r.raise_for_status()
with open('image_predictions.tsv', mode='wb') as file:
    file.write(r.content)

# Read the downloaded file; columns are tab-separated and tweet_id is used as the row index
df_tsv = pd.read_csv('image_predictions.tsv', sep='\t', index_col='tweet_id')
df_tsv.head(2)

Unnamed: 0_level_0,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True


In [5]:
# Connect to the Twitter API.
# Authentication details are kept in auth.json (outside the notebook) for security reasons.

# Read the credentials inside a context manager so the file handle is closed afterwards.
with open('auth.json') as auth_file:
    credentials = json.load(auth_file)

consumer_key = credentials['consumer_key']
consumer_secret = credentials['consumer_secret']
access_token = credentials['access_token']
access_secret = credentials['access_secret']

# `auth` now only ever refers to the OAuth handler (it is no longer reused for the raw dict).
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit makes the client sleep until the rate-limit window resets instead of
# raising, which matters when status lookups are issued one by one for many tweets.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [39]:
# Inspect a single tweet response to see which fields it carries
tweet = api.get_status(666029285002620928)
# Iterating the raw JSON dict yields its top-level field names, one per line
for key in tweet._json:
    print(key)
# Show the value of one of those fields via attribute access on the status object
print(tweet.possibly_sensitive)

created_at
id
id_str
text
truncated
entities
extended_entities
source
in_reply_to_status_id
in_reply_to_status_id_str
in_reply_to_user_id
in_reply_to_user_id_str
in_reply_to_screen_name
user
geo
coordinates
place
contributors
is_quote_status
retweet_count
favorite_count
favorited
retweeted
possibly_sensitive
possibly_sensitive_appealable
lang
False


In [None]:
# Read tweets with tweet id
# For every tweet id in the archive, look up the tweet through the API and record its
# favorite and retweet counts. Lookups that fail are kept as rows flagged with
# tweet_status_ok=False so they can be identified later.

df_list = []
for tweet_id in df_csv.index:
    try:
        tweet = api.get_status(tweet_id)
        favorite_count = tweet.favorite_count
        retweet_count = tweet.retweet_count
        tweet_status_ok = True
    except Exception:
        # The lookup failed (e.g. the tweet could not be retrieved). Record missing
        # values explicitly; otherwise the counts from the previous iteration would be
        # silently reused for this tweet (or a NameError raised on the first one).
        # `Exception` rather than a bare `except` so Ctrl-C still interrupts the loop.
        favorite_count = np.nan
        retweet_count = np.nan
        tweet_status_ok = False

    df_list.append({'tweet_id': tweet_id,
                    'favorite_count': favorite_count,
                    'retweet_count': retweet_count,
                    'tweet_status_ok': tweet_status_ok})

# Build the frame once from the collected records, with a fixed column order
df_api = pd.DataFrame(df_list, columns=['tweet_id', 'favorite_count', 'retweet_count', 'tweet_status_ok'])

<a id='Assessing'></a>
## Assessing

<a id='Cleaning'></a>
## Cleaning

<a id='conclusions'></a>
## Conclusions