In [1]:
import pandas as pd
import numpy as np
import os
import requests
import tweepy
import json
import re

### 收集数据
#### WeRateDog的twitter档案

In [2]:
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')


#### twitter图像预测数据

In [3]:
# download the file of image predictions from the URL

url = 'https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv'
response = requests.get(url)

with open(url.split('/')[-1], mode='wb') as file:
        file.write(response.content)
        

In [4]:
# upload the date from the file
image_predictions = pd.read_csv(url.split('/')[-1], sep='\t')


#### twitter附加数据

In [None]:
# get twitter API
consumer_key = 'YOUR CONSUMER KEY'
consumer_secret = 'YOUR CONSUMER SECRET'
access_token = 'YOUR ACCESS TOKEN'
access_secret = 'YOUR ACCESS SECRET'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


In [None]:
# find the twitter content
for tweet_id in twitter_archive['tweet_id']:
    try:
        # download the content of the specific id
        tweet = api.get_status(tweet_id, tweet_mode='extended')
        # append data into txt
        with open('tweet_json.txt', 'w') as f:
            f.write(json.dump(tweet, f, ensure_ascii=False))
    # not best practice to catch all exceptions        
    except Exception as e:
        print('%: The content of % has been deleted'.format(e, tweet_id))
        

In [5]:
# read txt data to build a list of dictionary
json_list = []

with open('tweet_json.txt', 'r') as f:
    for line in f.readlines():
        df = json.loads(line)

        json_list.append(df)


In [6]:
# create DataFrame from list of dictionary
df_list = []
for item in json_list:
    tweet_ID = item['id']
    retweet_count = item['retweet_count']
    favorite_count = item['favorite_count']
    
    df_list.append({'tweet_id': tweet_ID,
                   'retweet_count': retweet_count,
                   'favorite_count': favorite_count})
    
tweet_extra_data = pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])


### 数据评估

通过目测评估和编程评估的方式对数据进行质量及整洁度的评估

#### 质量 quality  

*twitter_archive table*
* 'tweet_id' is an int not a string
* The columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp' have too much missing data 
* The dataset doesn't need retweet contents
* 'timestamp' is a string not a datatype and contains '+0000'
* 'source' includes useless infomation such like http link
* 'expanded_urls' has null value or duplicated urls in a row
* Min and max numbers of the 'rating_numerator' and 'rating_denominator' are not correct
* Many None or a/an in 'name'
* 'doggo', 'floofer', 'pupper', 'puppo' contain too many None
* Delete the data after 8/1/2017

*image_predictions table*
1. 'tweet_id' and 'img_num' is an int not a string
2. The duplicates in 'jpg_url' column


#### 整洁度 tidiness
* Dog stage is in four different columns (doggo, floofer, pupper, puppo)
* Three tables can be merged


In [None]:
twitter_archive.sample(30)

In [None]:
twitter_archive.info()

In [None]:
twitter_archive.describe()

In [None]:
twitter_archive.sample(5)

In [None]:
twitter_archive['name'].value_counts()

In [None]:
image_predictions.info()

In [None]:
image_predictions.describe()

In [None]:
image_predictions.sample(10)

In [None]:
image_predictions[image_predictions.jpg_url.duplicated()]

In [None]:
tweet_extra_data.info()

In [None]:
tweet_extra_data.describe()

In [None]:
tweet_extra_data.sample(10)

### 数据清理


In [7]:
twitter_archive_clean = twitter_archive.copy()
image_predictions_clean = image_predictions.copy()
tweet_extra_data_clean = tweet_extra_data.copy()

#### 缺失数据

`twitter_archive`: 
* The columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp' have too much missing data. 
* The dataset doesn't need retweet contents

###### define

* Delete the columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'
* Delete the rows including the 'in_reply_to_status_id' and 'retweeted_status_user_id'

###### code

In [8]:
# Delete the rows including the 'in_reply_to_status_id' and 'retweeted_status_user_id'

twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['in_reply_to_status_id'].isnull()]
twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['retweeted_status_user_id'].isnull()]

In [9]:
# Delete the columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'

twitter_archive_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 
                            'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1, inplace=True)

###### test

In [None]:
twitter_archive_clean.sample(5)

##### Tidiness


`twitter_archive`:
* Dog stage is in four different columns (doggo, floofer, pupper, puppo)

###### define

* Find the stage from the text
* Delete the 'doggo', 'floofer', 'pupper', 'puppo' columns

###### code

In [10]:
# Find the stage from the text
twitter_archive_clean['stage'] = twitter_archive_clean.text.str.lower().str.findall('(doggo|floofer|pupper|puppo)')

# Transfer 'stage' list to string
twitter_archive_clean['stage'] = twitter_archive_clean.stage.str.join(',')


In [11]:
# Delete the 'doggo', 'floofer', 'pupper', 'puppo' columns
twitter_archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1, inplace=True)

##### test

In [None]:
twitter_archive_clean.sample(5)

In [None]:
twitter_archive_clean['stage'].value_counts()

In [None]:
twitter_archive_clean.to_csv('test.csv')

##### redefine
* Check the text again and find the reasons
* If the stages are the same then keep it.
* If there are two or more stages in a text which there are two or more dogs so that delete it.

##### code

In [12]:
# change the stage status
twitter_archive_clean['stage'].replace({'pupper,pupper':'pupper', 'doggo,doggo': 'doggo', 
                                        'pupper,pupper,pupper': 'pupper'}, inplace=True)

In [13]:
# delete the rows which stage contains two or more stages.
twitter_archive_clean = twitter_archive_clean[(~twitter_archive_clean.stage.str.contains(','))]

##### test

In [None]:
twitter_archive_clean['stage'].value_counts()

##### merge three tables at the final step

#### Quality

#### `twitter_archive` & `image_predictions` & `tweet_extra_data`:
* 'tweet_id' is an int not a string

###### define

* Cover the 'tweet_id' column's data from float to a string

###### code

In [14]:
# Cover the 'tweet_id' column's data from float to a string
twitter_archive_clean['tweet_id'] = twitter_archive_clean['tweet_id'].astype(str)
image_predictions_clean['tweet_id'] = image_predictions_clean['tweet_id'].astype(str)
tweet_extra_data_clean['tweet_id'] = tweet_extra_data_clean['tweet_id'].astype(str)

##### test

In [None]:
twitter_archive_clean.info()

In [None]:
image_predictions_clean.info()


#### `twitter_archive`:
* 'timestamp' is a string not a datatype


##### define
* Slice the 'timestamp' without +0000
* Cover the 'timestamp' column's data from string to datatime

##### code

In [15]:
# Slice the 'timestamp' without +0000
# Cover the 'timestamp' column's data from string to datatime
twitter_archive_clean.timestamp = pd.to_datetime(twitter_archive_clean.timestamp[:-6])

##### test

In [None]:
twitter_archive_clean.head()

In [None]:
twitter_archive_clean.info()

#### `twitter_archive`:
* 'source' includes useless infomation such like http link

##### define
* Just cut out the info of where the source comes, such like Twitter for iPhone, Twitter Web Client and Vine - Make a Scene

##### code

In [16]:
twitter_archive_clean['source'] = twitter_archive_clean.source.str.extract('(>(.+)<)', expand=True)
twitter_archive_clean['source'] = twitter_archive_clean.source.str.extract('([A-Z]\D+[a-z])', expand=True)

##### test

In [None]:
twitter_archive_clean.sample()

#### `twitter_archive`:
* 'expanded_urls' has null value or duplicated urls in a row

##### define
* Keep the rows which is not null using `notnull()`

##### code

In [17]:
# Keep the rows which is not null using notnull()

twitter_archive_clean[twitter_archive_clean['expanded_urls'].isnull()]

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,stage
375,828361771580813312,2017-02-05 21:56:51,Twitter Web Client,Beebop and Doobert should start a band 12/10 w...,,12,10,,
707,785515384317313025,2016-10-10 16:20:36,Twitter for iPhone,"Today, 10/10, should be National Dog Rates Day",,10,10,,
1445,696518437233913856,2016-02-08 02:18:30,Twitter for iPhone,Oh my god 10/10 for every little hot dog pupper,,10,10,,pupper


In [18]:
twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['expanded_urls'].notnull()]

#####  测试

In [None]:
twitter_archive_clean['expanded_urls'].value_counts()

In [None]:
twitter_archive_clean.sample(5)

#### `twitter_archive`:
* Min and max numbers of the 'rating_numerator' and 'rating_denominator' are not correct

##### define
* Reselect the rating numbers from the texts
* Check the outlier
* If the denominator is not 10, double check the text or delete the row

##### code

In [19]:
# Reselect the rating numbers from the texts

twitter_archive_clean['rating'] = twitter_archive_clean.text.str.extract('(\d+\/\d+)',expand=True)

twitter_archive_clean['rating'].value_counts()

12/10      479
10/10      433
11/10      412
13/10      284
9/10       152
8/10        98
7/10        51
14/10       38
5/10        33
6/10        32
3/10        19
4/10        15
2/10         9
1/10         4
88/80        1
27/10        1
7/11         1
75/10        1
50/50        1
1776/10      1
121/110      1
1/2          1
204/170      1
26/10        1
165/150      1
99/90        1
4/20         1
0/10         1
9/11         1
24/7         1
84/70        1
44/40        1
144/120      1
80/80        1
420/10       1
60/50        1
45/50        1
Name: rating, dtype: int64

In [20]:
# list all rows contains outliers

twitter_archive_clean[
    twitter_archive_clean['rating'].isin([
        '26/10','88/80','9/11','84/70','27/10','204/170','165/150', '75/10',
    '60/50', '0/10', '1/2', '99/90', '1776/10', '80/80', '121/110', '7/11',
    '44/40', '24/7', '420/10', '50/50', '45/50', '144/120', '4/20'
    ])][['text', 'rating', 'rating_numerator', 'rating_denominator']]

Unnamed: 0,text,rating,rating_numerator,rating_denominator
315,When you're so blinded by your systematic plag...,0/10,0,10
433,The floofs have been released I repeat the flo...,84/70,84,70
516,Meet Sam. She smiles 24/7 &amp; secretly aspir...,24/7,24,7
695,"This is Logan, the Chow who lived. He solemnly...",75/10,75,10
763,This is Sophie. She's a Jubilant Bush Pupper. ...,27/10,27,10
902,Why does this never happen at my front door......,165/150,165,150
979,This is Atticus. He's quite simply America af....,1776/10,1776,10
1068,"After so many requests, this is Bretagne. She ...",9/11,9,11
1120,Say hello to this unbelievably well behaved sq...,204/170,204,170
1165,Happy 4/20 from the squad! 13/10 for all https...,4/20,4,20


In [21]:
# check the outlier
## most large numbers are for the group of dogs so it can be converted to xx/10, such 84/70, 165/150, 204/170...
## replace the correct rating to the old ones
twitter_archive_clean['rating'].replace({'84/70': '12/10', '165/150': '11/10', '204/170': '12/10','99/90': '11/10', 
                                         '80/80': '10/10', '45/50': '9/10', '60/50': '12/10', '44/40': '11/10', 
                                         '121/110': '11/10', '144/120': '12/10', '88/80': '11/10'}, inplace=True)

In [22]:
## 1776/10, 420/10 are correct so keep it
## 26/10, 75/10, 27/10 are only keep the float of the numerators
twitter_archive_clean['rating'].replace({'26/10': '11.26/10', '75/10': '9.75/10', '27/10': '11.27/10'}, inplace=True)

In [23]:
## 24/7, 9/11, 4/20, 50/50, 7/11, 1/2 slice the wrong number, delete it after split 'rating'
## split 'rating' and conver the original 'rating_numerator' and 'rating_denominator'
twitter_archive_clean['rating_numerator'],twitter_archive_clean['rating_denominator'] = twitter_archive_clean['rating'].str.split('/').str

In [24]:
## delete the rows when rating_denominator is not 10
twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['rating_denominator'] == '10']

In [25]:
## delete the original 'rating_numerator' and 'rating_denominator'
twitter_archive_clean.drop(['rating'], axis=1, inplace=True)

##### test

In [None]:
twitter_archive_clean.sample(5)

In [None]:
twitter_archive_clean['rating_numerator'].value_counts()

In [None]:
twitter_archive_clean['rating_denominator'].value_counts()

#### `twitter_archive`:
* Many None or a/an in 'name'

##### define
* Reselect the name from the text
* If the value is None, replacing with NaN

##### code

In [26]:
# check the text and find name usually following This is, named, Meet, Say hello to, name is, Here we have, Here is
# reselect the name from the text
twitter_archive_clean['name'] = twitter_archive_clean.text.str.extract(
    '(?:This is|named|Meet|Say hello to|name is|Here we have|Here is)\s([A-Z][a-z+]*)',expand=True)

##### test

In [None]:
twitter_archive_clean['name'].value_counts()

In [None]:
twitter_archive_clean.head(5)

#### `image_predictions`:
* The duplicates in 'jpg_url' column

##### define
* Delete the duplicates

##### code

In [27]:
# Delete the duplicates and only keep the first one
image_predictions_clean.drop_duplicates(['jpg_url'], inplace=True)

##### test

In [None]:
image_predictions_clean['jpg_url'].value_counts()

####  Tidiness
#### `twitter_archive` & `image_predictions` & `tweet_extra_data`:
* Three tables can be merged
* No need the data after 8/1/2017

##### define
* Merge three tables based on 'tweet_id'
* Delete the data after '2017-08-02 00:00:00'
* Drop unnecessary columns and change the heading names

##### code

In [28]:
# Merge three tables based on 'tweet_id'
# Only keep the data having pictures which means to inner join
wrd_clean = pd.merge(twitter_archive_clean, image_predictions_clean, on='tweet_id', how='inner')

wrd_clean = pd.merge(wrd_clean, tweet_extra_data_clean, on='tweet_id', how='left')

In [29]:
# Delete the data after '2017-08-02 00:00:00'
wrd_clean = wrd_clean.query("timestamp < '2017-08-02 00:00:00'")

In [30]:
wrd_clean.head()

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,stage,jpg_url,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweet_count,favorite_count
0,892420643555336193,2017-08-01 16:23:56,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,...,0.097049,False,bagel,0.085851,False,banana,0.07611,False,8842,39492
1,892177421306343426,2017-08-01 00:17:27,Twitter for iPhone,This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,...,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True,6480,33786
2,891815181378084864,2017-07-31 00:18:03,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,...,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True,4301,25445
3,891689557279858688,2017-07-30 15:58:51,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,...,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False,8925,42863
4,891327558926688256,2017-07-29 16:00:24,Twitter for iPhone,This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,...,0.555712,True,English_springer,0.22577,True,German_short-haired_pointer,0.175219,True,9721,41016


In [31]:
# Delete 'text' and 'name'
wrd_clean.drop(['text'], axis=1, inplace=True)


##### test 

In [None]:
wrd_clean.head()

In [None]:
wrd_clean.info()

#### 数据存储

In [32]:
wrd_clean.to_csv('twitter_archive_master.csv', index=False)

## 数据分析及可视化

In [None]:
twitter_master = pd.read_csv('twitter_archive_master.csv')

In [None]:
twitter_master.head()

In [None]:
df_favorite = twitter_master[['favorite_count','rating_numerator']]
#评分存在异常值，这里取评分低于30分的值
df_favorite = twitter_master[twitter_master['rating_numerator']<40]


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline


plt.scatter(df_favorite['rating_numerator'],df_favorite['favorite_count'])
plt.xlabel('rating_numerator')
plt.ylabel('favorite_count')
plt.title('scatter')
plt.show()

In [None]:
df_stage = twitter_master[['stage','rating_numerator']]


In [None]:
df_stage['stage'].value_counts()

In [None]:
stage_list = []
for rating in df_stage['rating_numerator']:
    

In [None]:
plt.boxplot(df_stage)
plt.show()