In [5]:
import pandas as pd
import os
import requests
import tweepy
import json
import re

### 收集数据
#### WeRateDog的twitter档案

In [3]:
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')


#### twitter图像预测数据

In [8]:
# download the file of image predictions from the URL

url = 'https://raw.githubusercontent.com/udacity/new-dand-advanced-china/master/%E6%95%B0%E6%8D%AE%E6%B8%85%E6%B4%97/WeRateDogs%E9%A1%B9%E7%9B%AE/image-predictions.tsv'
response = requests.get(url)

with open(url.split('/')[-1], mode='wb') as file:
        file.write(response.content)
        

In [9]:
# upload the date from the file
image_predictions = pd.read_csv(url.split('/')[-1], sep='\t')


#### twitter附加数据

In [None]:
# get twitter API
consumer_key = 'YOUR CONSUMER KEY'
consumer_secret = 'YOUR CONSUMER SECRET'
access_token = 'YOUR ACCESS TOKEN'
access_secret = 'YOUR ACCESS SECRET'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


In [None]:
# find the twitter content
for tweet_id in twitter_archive['tweet_id']:
    try:
        # download the content of the specific id
        tweet = api.get_status(tweet_id, tweet_mode='extended')
        # append data into txt
        with open('tweet_json.txt', 'w') as f:
            f.write(json.dump(tweet, f, ensure_ascii=False))
    # not best practice to catch all exceptions        
    except Exception as e:
        print('%: The content of % has been deleted'.format(e, tweet_id))
        

In [11]:
# read txt data to build a list of dictionary
json_list = []

with open('tweet_json.txt', 'r') as f:
    for line in f.readlines():
        df = json.loads(line)

        json_list.append(df)


In [12]:
# create DataFrame from list of dictionary
df_list = []
for item in json_list:
    tweet_ID = item['id']
    retweet_count = item['retweet_count']
    favorite_count = item['favorite_count']
    
    df_list.append({'tweet_id': tweet_ID,
                   'retweet_count': retweet_count,
                   'favorite_count': favorite_count})
    
tweet_extra_data = pd.DataFrame(df_list, columns = ['tweet_id', 'retweet_count', 'favorite_count'])


### 数据评估

通过目测评估和编程评估的方式对数据进行质量及整洁度的评估

#### 质量 quality  

*twitter_archive table*
* 'tweet_id' is an int not a string
* The columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp' have too much missing data 
* The dataset doesn't need retweet contents
* 'timestamp' is a string not a datatype and contains '+0000'
* 'source' includes useless infomation such like http link
* 'expanded_urls' has null value or duplicated urls in a row
* Min and max numbers of the 'rating_numerator' and 'rating_denominator' are not correct
* Many None or a/an in 'name'
* 'doggo', 'floofer', 'pupper', 'puppo' contain too many None
* Delete the data after 8/1/2017

*image_predictions table*
1. 'tweet_id' and 'img_num' is an int not a string
2. The duplicates in 'jpg_url' column


#### 整洁度 tidiness
* Dog stage is in four different columns (doggo, floofer, pupper, puppo)
* Three tables can be merged


In [None]:
twitter_archive.sample(30)

In [18]:
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [None]:
twitter_archive.describe()

In [None]:
twitter_archive.sample(5)

In [4]:
twitter_archive['name'].value_counts()

None         745
a             55
Charlie       12
Cooper        11
Oliver        11
Lucy          11
Penny         10
Tucker        10
Lola          10
Winston        9
Bo             9
the            8
Sadie          8
Toby           7
an             7
Daisy          7
Buddy          7
Bailey         7
Jax            6
Scout          6
Koda           6
Dave           6
Rusty          6
Jack           6
Milo           6
Bella          6
Oscar          6
Stanley        6
Leo            6
Finn           5
            ... 
Maxwell        1
Bookstore      1
Grizzwald      1
Gilbert        1
Pavlov         1
Glenn          1
his            1
General        1
Rilo           1
Moofasa        1
Cuddles        1
Glacier        1
Edgar          1
Duddles        1
Buckley        1
Mona           1
Bodie          1
Boots          1
Mauve          1
Terrenth       1
Swagger        1
Stewie         1
Corey          1
Ester          1
Ralphie        1
Travis         1
Ralpher        1
Taco          

In [None]:
image_predictions.info()

In [None]:
image_predictions.describe()

In [None]:
image_predictions.sample(10)

In [None]:
image_predictions[image_predictions.jpg_url.duplicated()]

In [None]:
tweet_extra_data.info()

In [None]:
tweet_extra_data.describe()

In [None]:
tweet_extra_data.sample(10)

### 数据清理


In [63]:
twitter_archive_clean = twitter_archive.copy()
image_predictions_clean = image_predictions.copy()
tweet_extra_data_clean = tweet_extra_data.copy()

#### 缺失数据

`twitter_archive`: 
* The columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp' have too much missing data. 
* The dataset doesn't need retweet contents

###### 定义

* Delete the columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'
* Delete the rows including the 'in_reply_to_status_id' and 'retweeted_status_user_id'

###### 代码

In [64]:
# Delete the rows including the 'in_reply_to_status_id' and 'retweeted_status_user_id'

twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['in_reply_to_status_id'].isnull()]
twitter_archive_clean = twitter_archive_clean[twitter_archive_clean['retweeted_status_user_id'].isnull()]

In [65]:
# Delete the columns 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp'

twitter_archive_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 
                            'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1, inplace=True)

###### 测试

In [66]:
twitter_archive_clean.sample(5)

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
428,821149554670182400,2017-01-17 00:18:04 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Luca. He got caught howling. H*ckin em...,https://twitter.com/dog_rates/status/821149554...,12,10,Luca,,,,
284,838921590096166913,2017-03-07 01:17:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Arlo. He's officially the king of snow...,https://twitter.com/dog_rates/status/838921590...,13,10,Arlo,,,,
1861,675483430902214656,2015-12-12 01:12:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Rare shielded battle dog here. Very happy abou...,https://twitter.com/dog_rates/status/675483430...,5,10,,,,,
1742,679462823135686656,2015-12-23 00:45:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Hurley. He's the curly one. He hugs every...,https://twitter.com/dog_rates/status/679462823...,11,10,Hurley,,,,
813,771102124360998913,2016-08-31 21:47:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Charlie. He works for @TODAYshow. Supe...,https://twitter.com/dog_rates/status/771102124...,12,10,Charlie,,,,


##### 整洁度


`twitter_archive`:
* Dog stage is in four different columns (doggo, floofer, pupper, puppo)

###### 定义

* Find the stage from the text
* Delete the 'doggo', 'floofer', 'pupper', 'puppo' columns

###### 代码

In [67]:
# Find the stage from the text
twitter_archive_clean['stage'] = twitter_archive_clean.text.str.lower().str.findall('(doggo|floofer|pupper|puppo)')

# Transfer 'stage' list to string
twitter_archive_clean['stage'] = twitter_archive_clean.stage.str.join('')

# Delete the 'doggo', 'floofer', 'pupper', 'puppo' columns
twitter_archive_clean.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1, inplace=True)

##### 测试

In [68]:
twitter_archive_clean.sample(5)

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,stage
1383,700847567345688576,2016-02-20 01:00:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Crouton. He's a Galapagos Boonwiddle. Has...,https://twitter.com/dog_rates/status/700847567...,10,10,Crouton,
2354,666029285002620928,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,https://twitter.com/dog_rates/status/666029285...,7,10,a,
1936,673956914389192708,2015-12-07 20:07:04 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is one esteemed pupper. Just graduated co...,https://twitter.com/dog_rates/status/673956914...,10,10,one,pupper
1212,715342466308784130,2016-03-31 00:58:29 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Oscar. He's a world renowned snowball ...,https://twitter.com/dog_rates/status/715342466...,10,10,Oscar,
196,854732716440526848,2017-04-19 16:25:34 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Marlee. She fetched a flower and immed...,https://twitter.com/dog_rates/status/854732716...,12,10,Marlee,


##### merge three tables at the final step

#### 质量

`twitter_archive` & `image_predictions`:
* 'tweet_id' is an int not a string

###### 定义

* Cover the 'tweet_id' column's data from float to a string

###### 代码

In [69]:
# Cover the 'tweet_id' column's data from float to a string
twitter_archive_clean['tweet_id'] = twitter_archive_clean['tweet_id'].astype(str)
image_predictions_clean['tweet_id'] = image_predictions_clean['tweet_id'].astype(str)


##### 测试

In [70]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id              2097 non-null object
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
stage                 2097 non-null object
dtypes: int64(2), object(7)
memory usage: 163.8+ KB


In [71]:
image_predictions_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null object
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB


`twitter_archive`:
* 'timestamp' is a string not a datatype


##### 定义
* Slice the 'timestamp' without +0000
* Cover the 'timestamp' column's data from string to datatime

##### 代码

In [72]:
# Slice the 'timestamp' without +0000
# Cover the 'timestamp' column's data from string to datatime
twitter_archive_clean.timestamp = pd.to_datetime(twitter_archive_clean.timestamp[:-6])

##### 测试

In [73]:
twitter_archive_clean.head()

Unnamed: 0,tweet_id,timestamp,source,text,expanded_urls,rating_numerator,rating_denominator,name,stage
0,892420643555336193,2017-08-01 16:23:56,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,
1,892177421306343426,2017-08-01 00:17:27,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,
2,891815181378084864,2017-07-31 00:18:03,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,
3,891689557279858688,2017-07-30 15:58:51,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,
4,891327558926688256,2017-07-29 16:00:24,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,


In [74]:
twitter_archive_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id              2097 non-null object
timestamp             2091 non-null datetime64[ns]
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
stage                 2097 non-null object
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 163.8+ KB


`twitter_archive`:
* 'source' includes useless infomation such like http link

##### 定义
* Just cut out the info of where the source comes, such like Twitter for iPhone, Twitter Web Client and Vine - Make a Scene

##### 代码

In [None]:
twitter_archive_clean['source'] = twitter_archive_clean.source