In [22]:
# for data wrangling and sampling
import pandas as pd
import numpy as np
import random

import requests       # to download files programmatically
import os             # to save/open files and for terminal-like commands to navigate local machine
import tweepy
import pprint as pp   # data pretty printer - https://docs.python.org/2/library/pprint.html
import json           # for json I/O and parsing
import time           # for timing code and dealing with Twitter's rate limit

# Set the random seed to assure the same answers are returned each time 
random.seed(42)

# for plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb

# for (potential) regression modeling of data
import statsmodels.api as sm;
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Read in a Twitter archive (manual / already provided)

In [23]:
df_archive = pd.read_csv("twitter-archive-enhanced.csv")
df_archive.head(1)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,


### Download tweet image predictions (programmatically, from a url)

In [24]:
# get file from a url
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

# get the current working directory
folder_name = os.getcwd()

# get the filename
file_name = url.split('/')[-1]

# save the retrieved file to local storage
with open(os.path.join(folder_name,
                      file_name), mode='wb') as file:
    file.write(response.content)

# read in the downloaded file
df_images = pd.read_csv(file_name, sep='\t')
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


NOTE:
* The response variable is in bytes format, not text format.
* As such, the 'wb' flag is used when writing the file locally
* [Link to a StackOverflow post](https://stackoverflow.com/questions/2665866/what-does-wb-mean-in-this-code-using-python) on the subject<br><br>

Template code for future reference:

In [None]:
# template code to make a directory if it doesn't already exist
#folder_name = 'my_new_folder'
#if not os.path.exists(folder_name):
#    os.makedirs(folder_name)

# command to list the current directory's contents
#os.listdir()

### Create an API object to gather Twitter data

In [25]:
# get the API Access Token and Acces Token Secret
from twAPI_tokens_GoodDoggo import API_KEY, API_KEY_SECRET, API_TOKEN, API_TOKEN_SECRET

CONSUMER_KEY = API_KEY
CONSUMER_SECRET = API_KEY_SECRET
ACCESS_TOKEN = API_TOKEN
ACCESS_SECRET = API_TOKEN_SECRET

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

# api = tweepy.API(auth)

# code snippet for getting around the twitter rate limit:
api = tweepy.API(auth, 
                 wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

#### Get a list of tweet IDs:

In [26]:
# Check if there are any repeated tweets in the archive
numUniqueValues = df_archive.tweet_id.nunique()
print('Number of tweets: ' + str(len(df_archive)))
print('Number of repeated tweets: ' + str(len(df_archive) - numUniqueValues))

# Create list of tweet IDs
tweet_id_list = df_archive.tweet_id.tolist()

Number of tweets: 2356
Number of repeated tweets: 0


### Use the API to get info for each tweet

* ___Retrieve json data for the first tweet and write it to local storage___
* [StackOverflow article](https://stackoverflow.com/questions/28384588/twitter-api-get-tweets-with-specific-id) on getting JSON data for a specific tweet
* [StackAbuse article](https://stackabuse.com/reading-and-writing-json-to-a-file-in-python/) on reading and writing JSON to a file in Python

In [104]:
print('- Tweet retrieval (for 2356 tweets) took 30 minutes to complete, due to Twitter\'s rate limit.\n' +
      '- As a result, it was performed once, then commented out to allow restarting the kernel / debugging\n' + 
      'the rest of the analysis.')

# loop through multiple tweet_id's, retrieving and writing their json data to 'tweet_json.txt' 
# with open('tweet_json.txt', mode = 'w') as textFile:
#     count = 0
#     for tweet_id in tweet_id_list:
#         count = count + 1
#         start = time.time()
#         try:
#             status = api.get_status(tweet_id)
#             jsonStr = json.dumps(status._json)
#         except:
#             continue     # tweet no longer exists
#         textFile.write(jsonStr + '\n')
#         end = time.time()
#         currTime = str(time.localtime().tm_hour) + ':' + str(time.localtime().tm_min) + ':' + str(time.localtime().tm_sec)
#         print('count: ' + str(count) + ', time elapsed: ' + str(end - start) + ', current time: ' + currTime)

- Tweet retrieval (for 2356 tweets) took 30 minutes to complete, due to Twitter's rate limit.
- As a result, it was performed once, then commented out to allow restarting the kernel / debugging
the rest of the analysis.


__Print first line of 'tweet_json.txt' to check that the above worked__

In [105]:
# print first line of 'tweet_json.txt' to check that the above worked
# with open('tweet_json.txt') as jsonFile:
#     line = jsonFile.readline()
#     tweet = json.loads(line)
#     pp.pprint(tweet)

**Add the tweet data to a dataframe**

In [106]:
# create a local dataframe for storing tweet data
df_tweetInfo = pd.DataFrame(columns = ['tweet_id', 'retweet_count', 'favorite_count'])

# store tweet data to the dataframe
with open('tweet_json.txt') as jsonFile:
    count = 0
    start = time.time()
    for line in jsonFile:
        count = count + 1
        tweet = json.loads(line)
        df_tweetInfo = df_tweetInfo.append({
            'tweet_id': tweet['id'],
            'retweet_count': tweet['retweet_count'],
            'favorite_count': tweet['favorite_count']
        }, ignore_index=True)
        end = time.time()
        if (np.remainder(count, 200) == 0):
            currTime = str(time.localtime().tm_hour) + ':' + str(time.localtime().tm_min) + ':' + str(time.localtime().tm_sec)
            print('count: ' + str(count) + ', time elapsed: ' + str(end - start) + ', current time: ' + currTime)
        
# # add a single tweet's data to the dataframe
# df_tweetInfo = df_tweetInfo.append({
#     'tweetID': tweet['id'],
#     'retweet_count': tweet['favorite_count'],
#     'favorite_count': tweet['retweet_count']
# },ignore_index=True)

#tweetInfo.head()

count: 200, time elapsed: 0.2774209976196289, current time: 13:13:31
count: 400, time elapsed: 0.5635969638824463, current time: 13:13:31
count: 600, time elapsed: 0.8698160648345947, current time: 13:13:31
count: 800, time elapsed: 1.1846928596496582, current time: 13:13:32
count: 1000, time elapsed: 1.535491943359375, current time: 13:13:32
count: 1200, time elapsed: 1.8185300827026367, current time: 13:13:32
count: 1400, time elapsed: 2.1488301753997803, current time: 13:13:33
count: 1600, time elapsed: 2.507376194000244, current time: 13:13:33
count: 1800, time elapsed: 2.8268630504608154, current time: 13:13:33
count: 2000, time elapsed: 3.1494250297546387, current time: 13:13:34
count: 2200, time elapsed: 3.4665720462799072, current time: 13:13:34


**Here are two simple changes to make, in order to repeat the above actions for every tweet:**
* Loop through all tweet ID's and retrieve / store their JSON info to a new line of 'tweet_json.txt'
* Loop through each line of 'tweet_json.txt', retrieve the tweet data of interest, and append it to the dataframe
* ___NOTE:___ watch out for deleted tweets and/or missing tweet data. Use try-except blocks as appropriate

In [107]:
df_tweetInfo.head()
#len(df_tweetInfo)

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,8159,37468
1,892177421306343426,6039,32198
2,891815181378084864,3997,24271
3,891689557279858688,8304,40780
4,891327558926688256,9007,39001


## Assess
Assess the data for Quality and Tidiness. Per Udacity course notes, Quality and Tidiness are defined as follows:

**Quality** issues refers to problems with content, such as missing, duplicate, or incorrect data. Low quality data is sometimes referred to as 'dirty' data. Quality issues generally fall into one of four categories or 'dimensions':
* **Completeness**
  * Have all ___records that should have been obtained___ actually been obtained?
  * Are there any ___missing records___?
  * Are ___specific rows, columns or cells missing___?
  
* **Validity:**
  * Perhaps the records exist, but they're ___not valid___?
    * i.e., they ___don't conform to a defined schema___.
    * A schema is a defined set of rules for data. 
    * These rules can be real-world constraints (e.g. negative height is impossible) and table-specific constraints (e.g. unique key constraints in tables).
    
* **Accuracy:**
  * Inaccurate data:
    * is ___wrong data that is valid___. 
    * ___adheres to the defined schema, but is still incorrect___
    * Example: a patient's weight that is 5 lbs too heavy because the scale was faulty.
    
* **Consistency:**
  * Inconsistent data is both valid and accurate, but ___there are multiple correct ways of referring to the same thing___.
  * Consistency means the data has a **standard format**. For instance, columns that represent the same data across tables and/or within tables is desired.

**Tidiness** refers to the data's structure. Untidy data has structural issues that can slow down or prevent easy analysis. Untidy data is sometimes referred to as 'messy' data. Traits of tidy data include:
* Each variable forms a column.
* Each observation forms a row.
* Each type of observational unit forms a table.

## Quality
* Assess the data for issues with content, such as missing, duplicate, or incorrect data.
  * Start by briefly viewing the data to get a sense of it.
  * Then assess the data with respect to completeness, validity, accuracy, and consistency

In [136]:
df_tweetInfo.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,8159,37468
1,892177421306343426,6039,32198
2,891815181378084864,3997,24271
3,891689557279858688,8304,40780
4,891327558926688256,9007,39001


In [137]:
df_archive.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


In [138]:
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


### Completeness
* Have all ___records that should have been obtained___ actually been obtained?
* Are there any ___missing records___?
* Are ___specific rows, columns or cells missing___?

In [111]:
print('# of records in df_tweetInfo (i.e., JSON data retrieved via API): ' + str(len(df_tweetInfo)))
print('# of records in df_archive (i.e., weRateDogs Tweet archive):      ' + str(len(df_archive)))
print('# of records in df_images (i.e., image analysis):                 ' + str(len(df_images)))
#tweetInfo.head()

# of records in df_tweetInfo (i.e., JSON data retrieved via API): 2335
# of records in df_archive (i.e., weRateDogs Tweet archive):      2356
# of records in df_images (i.e., image analysis):                 2075


<span style="color:blue">The dataframes have a different number of records.</span>
* The slight difference between df_tweetInfo and df_archive is probably due to tweets that have been deleted
* The difference betweeen df_archive and df_images is probably due to not all 

In [134]:
print('# of tweet_id\'s in df_images that are also in df_archive: ' + 
     str(len(df_images.tweet_id.isin(df_archive.tweet_id))))
print('# of tweet_id\'s in df_images that are also in df_tweetInfo: ' + 
     str(len(df_images.tweet_id.isin(df_tweetInfo.tweet_id))))

# of tweet_id's in df_images that are also in df_archive: 2075
# of tweet_id's in df_images that are also in df_tweetInfo: 2075


<span style="color:blue">
    Since all tweet_id's in df_images are also in df_archive and df_tweetInfo, the appropriate set to use is the intersection of the three df's.
</span>

### Validity
* Perhaps the records exist, but they're ___not valid___?
  * i.e., they ___don't conform to a defined schema___.
  * A schema is a defined set of rules for data. 
  * These rules can be real-world constraints (e.g. negative height is impossible) and table-specific constraints (e.g. unique key constraints in tables).

In [141]:
df_archive.describe()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,retweeted_status_id,retweeted_status_user_id,rating_numerator,rating_denominator
count,2356.0,78.0,78.0,181.0,181.0,2356.0,2356.0
mean,7.427716e+17,7.455079e+17,2.014171e+16,7.7204e+17,1.241698e+16,13.126486,10.455433
std,6.856705e+16,7.582492e+16,1.252797e+17,6.236928e+16,9.599254e+16,45.876648,6.745237
min,6.660209e+17,6.658147e+17,11856340.0,6.661041e+17,783214.0,0.0,0.0
25%,6.783989e+17,6.757419e+17,308637400.0,7.186315e+17,4196984000.0,10.0,10.0
50%,7.196279e+17,7.038708e+17,4196984000.0,7.804657e+17,4196984000.0,11.0,10.0
75%,7.993373e+17,8.257804e+17,4196984000.0,8.203146e+17,4196984000.0,12.0,10.0
max,8.924206e+17,8.862664e+17,8.405479e+17,8.87474e+17,7.874618e+17,1776.0,170.0


<span style="color:blue">
    
<span style="color:black"> Investigate / clean the following issues: </span>
* In **df_archive**, the maximum value for rating_numerator is unrealistically high.
* In **df_archive**, the maximum value for rating_denominator is unrealistically high.
* In **df_archive**, the minimum value for rating_numerator probably should not be zero.
* In **df_archive**, the minimum value for rating_denominator should not be zero.
</span>

#### Check other dataframes for any obvious validity issues

In [142]:
df_images.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [144]:
df_tweetInfo.describe()

Unnamed: 0,tweet_id,retweet_count,favorite_count
count,2335,2335,2335
unique,2335,1712,1979
top,667495797102141441,88,0
freq,1,5,165


In [146]:
df_archive.nunique()

tweet_id                      2356
in_reply_to_status_id           77
in_reply_to_user_id             31
timestamp                     2356
source                           4
text                          2356
retweeted_status_id            181
retweeted_status_user_id        25
retweeted_status_timestamp     181
expanded_urls                 2218
rating_numerator                40
rating_denominator              18
name                           957
doggo                            2
floofer                          2
pupper                           2
puppo                            2
dtype: int64

In [208]:
df_archive.groupby('rating_numerator').rating_numerator.count()

rating_numerator
0         2
1         9
2         9
3        19
4        17
5        37
6        32
7        55
8       102
9       158
10      461
11      464
12      558
13      351
14       54
15        2
17        1
20        1
24        1
26        1
27        1
44        1
45        1
50        1
60        1
75        2
80        1
84        1
88        1
99        1
121       1
143       1
144       1
165       1
182       1
204       1
420       2
666       1
960       1
1776      1
Name: rating_numerator, dtype: int64

<span style="color:blue">

In **df_archive**, some **rating_numerator** values are quite large. Investigate whether this is an issue / consider removing numerators over a certain threshold.

</span>

In [193]:
df_archive.head(2)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,


In [205]:
searchString = 'NaN'
df_archive.query("in_reply_to_status_id != 'NaN'").head(2)

# # example syntax
# #df_images.query('p1_conf > 0.2').head()
# searchString = 'German_shepherd'
# df_images.query("p1 != @searchString").head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
30,886267009285017600,8.862664e+17,2281182000.0,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,,,,12,10,,,,,
55,881633300179243008,8.81607e+17,47384430.0,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,


In [206]:
df_tweetInfo.nunique()

tweet_id          2335
retweet_count     1712
favorite_count    1979
dtype: int64

In [147]:
df_images.nunique()

tweet_id    2075
jpg_url     2009
img_num        4
p1           378
p1_conf     2006
p1_dog         2
p2           405
p2_conf     2004
p2_dog         2
p3           408
p3_conf     2006
p3_dog         2
dtype: int64

<span style="color:blue">

In **df_images**, the number of jpg_urls does not match the number of tweet_id's. Investigate whether this is an issue. If so, correct it.

</span>

### Accuracy
* Inaccurate data:
  * is ___wrong data that is valid___. 
  * ___adheres to the defined schema, but is still incorrect___
  * Example: a patient's weight that is 5 lbs too heavy because the scale was faulty.

In [240]:
df_archive.head(3)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,


In [241]:
df_images.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


In [242]:
df_tweetInfo.head()

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,8159,37468
1,892177421306343426,6039,32198
2,891815181378084864,3997,24271
3,891689557279858688,8304,40780
4,891327558926688256,9007,39001


#### There do not appear to be any accuracy issues

<span style="color:red">

## Issues to check for: 

* **Accuracy**
  * There maybe some entries that are not pictures of dogs (typically rated less than 10?)
  * Some names in the name columns can't possibly be dog names. (a, an, the, etc...)

* **Completeness**
  * Missing data for dog breed predictions:
  * Only want original ratings. Make sure we don't have retweets.
</span>

<span style="color:red">

### Preliminary code to check accuracy of **doggo** column in df_archive
* This mini-analysis is incomplete, and probably needs revisited.

</span>

In [302]:
# determine how many entries have "doggo" in the tweet
df_temp = df_archive[df_archive['text'].str.contains("doggo")]
#df_temp.doggo.count()

# of those entries, determine how many have 'None' in the 'doggo' column
df_temp2 = df_temp[df_temp['doggo'].str.contains("None")]
#df_temp2.count()
#df_temp2.text[0:5].apply(lambda entry: print(entry))
df_temp2.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
83,876537666061221889,,,2017-06-18 20:30:39 +0000,"<a href=""http://twitter.com/download/iphone"" r...",I can say with the pupmost confidence that the...,,,,https://twitter.com/mpstowerham/status/8761629...,14,10,,,,,
268,841439858740625411,,,2017-03-14 00:04:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have some incredible doggos for #K9Vet...,,,,https://twitter.com/dog_rates/status/841439858...,14,10,,,,,
302,836648853927522308,,,2017-02-28 18:46:45 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @SchafeBacon2016: @dog_rates Slightly distu...,8.366481e+17,7.124572e+17,2017-02-28 18:43:57 +0000,https://twitter.com/SchafeBacon2016/status/836...,11,10,,,,,
475,816062466425819140,,,2017-01-02 23:23:48 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @dog_rates: Meet Jack. He's one of the rare...,8.159907e+17,4196984000.0,2017-01-02 18:38:42 +0000,https://www.gofundme.com/surgeryforjacktheminp...,11,10,Jack,,,,
477,815990720817401858,,,2017-01-02 18:38:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Jack. He's one of the rare doggos that do...,,,,https://www.gofundme.com/surgeryforjacktheminp...,11,10,Jack,,,,


In [301]:
# ----- LEFTOVER CODE SNIPPETS -----
#f = lambda x: print(x)
#df_temp2.count()
#for row2 in df_temp2:
#    print(row2)
#df_temp3['doggo'] = df_temp2['doggo'].apply(lambda x: x > 0)
#df_temp2
#     # filter to all entries where the 'doggo' column == 'None'
#     # print
# df_temp = df_archive[df_archive['text'].str.contains("doggo")]
# #df_archive[df_archive['text'].str.contains("doggo")].count()
# #df_archive.groupby('doggo').count()
# df_archive.groupby('doggo').count()
# #df_temp.head()
# #df_temp[df]

#df_archive.groupby('rating_numerator').rating_numerator.count()
#df_archive[df_archive['text'].str.contains("doggo")]
#df_archive[df_archive['text'].str.contains("doggo")].iloc[2].text

### Consistency
* Inconsistent data is both valid and accurate, but _there are multiple __correct__ ways of referring to the same thing_.
* Consistency, i.e., a standard format, in columns that represent the same data across tables and/or within tables is desired.

In [303]:
print(df_tweetInfo.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 3 columns):
tweet_id          2335 non-null object
retweet_count     2335 non-null object
favorite_count    2335 non-null object
dtypes: object(3)
memory usage: 54.8+ KB
None


In [None]:
# - Doggo, floofer, pupper, and puppo are categories.
# - timestamp and retweeted_status_timestamp should be of data type datetime.

In [307]:
df_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

<span style="color:blue">

* In **df_archive**, 'timestamp' and 'retweeted_status_timestamp' should have type 'datetime'.
* In **df_archive**, the following columns should have type 'int64':
   * 'in_reply_to_status_id'
   * 'in_reply_to_user_id',
   * 'retweeted_status_id'
   * 'retweeted_status_user_id'

</span>

In [308]:
df_images.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [305]:
df_tweetInfo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2335 entries, 0 to 2334
Data columns (total 3 columns):
tweet_id          2335 non-null object
retweet_count     2335 non-null object
favorite_count    2335 non-null object
dtypes: object(3)
memory usage: 54.8+ KB


<span style="color:blue">

In **df_tweetInfo**, 'tweet_id' should have type 'int64' for consistency across the dataframes.

</span>