In [33]:
import pandas as pd
import numpy as np
import requests
import os
from random import sample
import re

# Gather

Code for downloading Udacity's Dog Prediction Data

In [12]:
# Download step, intentionally left disabled: both lines point at the same image-predictions.tsv URL.
# Uncomment this cell and the two cells that follow to fetch a fresh copy.
#url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
#r = requests.get('https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv')

In [16]:
# make a directory if one does not already exist
# (disabled along with the rest of the download step; 'dog_predictions' is the folder the predictions file is read from)
#folder_name = 'dog_predictions'
#if not os.path.exists(folder_name):
    #os.makedirs(folder_name)

In [23]:
# Disabled: would write the response bytes to <folder_name>/<last path segment of url>,
# i.e. dog_predictions/image-predictions.tsv.
#with open(os.path.join(folder_name, 
                           #url.split('/')[-1]), mode='wb') as file:
             #file.write(r.content)

Initializing All Relevant Datasets

In [4]:
# Load the tweet archive CSV from the working directory.
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
# Bare name with a trailing ';' -- evaluates the frame but suppresses its display.
twitter_archive;

In [5]:
# Load the tab-separated image predictions from the dog_predictions folder.
predicts = pd.read_csv('dog_predictions/image-predictions.tsv', sep='\t')
# Bare name with a trailing ';' -- evaluates the frame but suppresses its display.
predicts;

In [6]:
# Load the previously saved per-tweet counts.
# NOTE(review): presumably gathered from the Twitter API in a separate step -- confirm provenance.
twitter_api = pd.read_csv('twitter_api_data.csv')
# Bare name with a trailing ';' -- evaluates the frame but suppresses its display.
twitter_api;

# Assess

### Documented Issues

`twitter_archive`
#### Quality
- timestamp is a string and not datetime
- text column contains irrelevant material (a trailing URL)
- missing values in expanded_urls
- There are 42 instances where categorical variables are found in the text, but are not accurately accounted for in the categorical columns
- There are 109 instances where the name column is not accurate, (ex: index 542 name is considered "incredibly" since text before contains "incredible"), and an incorrect name is in place.
    - I recognize that it is an oversight that I cannot test whether or not a name is missed because I do not yet have knowledge of a language processing library.
- Missing values in "in_reply_to_status_id" "in_reply_to_user_id" "retweeted_status_id" "retweeted_status_user_id" "retweeted_status_timestamp"
- Ratings may contain floats. The text needs to be checked again
- change id to string

#### Tidiness
- text column contains a source variable for the tweet
- dog "ages/types" (floofer, pupper etc.) should be a single categorical column

`predicts`
#### Quality
- prediction dog breeds have inconsistent casing
- column titles should be full names
- change id to string
- extract predictions for images where predictions are both dogs and above 70% confidence
    - If our confidence level is too low, then our statements become less meaningful. However, because I am not sure how to test the accuracy of the predictions, I will choose a lowish confidence level since I am aware that many of the pictures will contain dogs.

#### Tidiness
 

`twitter_api`
#### Quality
- change id to string (should have done this when extracting)

#### Tidiness
- tables need to be reorganized
    - 1 for source metadata (urls

In [35]:
# Show full cell contents (tweet text, URLs) instead of truncating wide columns.
# pandas >= 1.0 spells "no limit" as None and current releases reject -1;
# releases before 1.0 only accept an int, so fall back to the legacy -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

### twitter_archive

#### Visual Assessment

In [11]:
# Random 5-row sample for visual assessment; the trailing ';' suppresses the output to save space.
twitter_archive.sample(5);

In [7]:
# Distinct values of the source column; each one is an HTML anchor tag naming the posting client.
twitter_archive.source.unique()

array(['<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
       '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
       '<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>',
       '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>'],
      dtype=object)

In [59]:
# Count fully duplicated rows (all columns equal to an earlier row); 0 means there are none.
# Summing the boolean mask avoids building a Python list, and int() keeps the displayed value a plain integer.
int(twitter_archive.duplicated().sum())

0

In [20]:
# Column dtypes and non-null counts, used for visual assessment.
twitter_archive.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [14]:
# Summary statistics for the numeric columns, to inspect the max numerator/denominator values.
# The trailing ';' suppresses the output.
twitter_archive.describe();

#### Function(s) used for programmatic assessment

In [15]:
def category_accuracy(df, columns_list):
    '''Find rows whose text mentions a category that the category columns do not record.

    Each whitespace-separated word of the row's text is checked against every name in
    columns_list (case-sensitive substring match, so "puppers" counts as "pupper").
    Every hit must be confirmed by some cell among the row's columns_list columns
    holding exactly that category name; a row with fewer confirmations than hits is flagged.

    Parameters:
        df: dataframe with a column called "text" plus every column named in columns_list.
        columns_list: category names; each is both a column header and the value
            that column holds when the category applies.

    Returns:
        list of index labels of the flagged rows, in dataframe order.

    Rows whose text is missing (not a string) cannot mention anything and are skipped.'''
    offending_rows = []
    for index, row in df.iterrows():
        recorded = [row[column] for column in columns_list]
        text = row.text
        # A missing text is a float NaN; calling .split() on it would raise AttributeError.
        if not isinstance(text, str):
            continue
        # One entry per (word, category) hit; repeats are kept on purpose because
        # the confirmation count below is compared against the number of hits.
        mentioned = [value
                     for word in text.split()
                     for value in columns_list
                     if value in word and word != 'None']
        if not mentioned:
            continue
        confirmed = sum(1 for cell in recorded for value in mentioned if value == cell)
        # Fewer confirmations than hits means at least one mentioned category is unrecorded.
        if confirmed < len(mentioned):
            offending_rows.append(index)
    return offending_rows

### Parsing Text for Accuracy

In [16]:
# Flag the rows where a dog stage appears in the text but is not recorded in its column.
offending_categorical_rows = category_accuracy(
    df=twitter_archive,
    columns_list=['doggo', 'floofer', 'pupper', 'puppo'],
)
print(
    'The number of instances where doggo, floofer, pupper, and puppo is found in the text, but does not have the correct value is: ',
    len(offending_categorical_rows),
)

The number of instances where doggo, floofer, pupper, and puppo is found in the text, but does not have the correct value is:  42


In [17]:
# Index labels of the flagged rows, listed for manual inspection.
print('The list of offending rows: ', offending_categorical_rows)

The list of offending rows:  [54, 83, 85, 106, 134, 172, 228, 268, 274, 296, 302, 475, 477, 545, 798, 934, 946, 987, 993, 1027, 1093, 1120, 1220, 1228, 1254, 1265, 1351, 1516, 1634, 1635, 1636, 1643, 1710, 1712, 1743, 1826, 1843, 1847, 1862, 1900, 1928, 2141]


In [18]:
#investigate offending rows. Rows had issues. Code has been commented out to save space.
#(the loop variable source is offending_categorical_rows; offending_rows exists only inside category_accuracy)
#for i in offending_categorical_rows:
    #display(twitter_archive[twitter_archive.index == i])

### Checking Names Column

Since names are capitalized, names that are lowercase will be flagged as they will likely not be actual names.

In [397]:
#demonstrating regular names
# The type check comes first: a missing name is a float NaN, and indexing it (name[0])
# raises TypeError before any null check can run. name[:1] also tolerates an empty string.
regular_names = [
    name for name in twitter_archive['name']
    if isinstance(name, str) and name != 'None' and name[:1].isupper()
]
sample(regular_names, 20)

['Lilli',
 'Tayzie',
 'Corey',
 'Freddery',
 'Lola',
 'Earl',
 'Rorie',
 'Odie',
 'Gromit',
 'Elliot',
 'Gilbert',
 'Chompsky',
 'Alice',
 'Patch',
 'Zeke',
 'Riley',
 'Leo',
 'Marley',
 'Schnitzel',
 'Jack']

In [412]:
#flagging lowercase names and index
# The type check comes first: a missing name is a float NaN, and indexing it (name[0])
# raises TypeError before any null check can run. name[:1] also tolerates an empty string.
# Only the name column is needed, so iterate it directly instead of building a row per tweet.
incorrect_names = [
    (name, index)
    for index, name in twitter_archive['name'].items()
    if isinstance(name, str) and name != 'None' and name[:1].islower()
]
incorrect_names
#len(incorrect_names) returns 109 instances

[('such', 22),
 ('a', 56),
 ('quite', 118),
 ('quite', 169),
 ('quite', 193),
 ('not', 335),
 ('one', 369),
 ('incredibly', 542),
 ('a', 649),
 ('mad', 682),
 ('an', 759),
 ('very', 773),
 ('a', 801),
 ('very', 819),
 ('just', 822),
 ('my', 852),
 ('one', 924),
 ('not', 988),
 ('his', 992),
 ('one', 993),
 ('a', 1002),
 ('a', 1004),
 ('a', 1017),
 ('an', 1025),
 ('very', 1031),
 ('actually', 1040),
 ('a', 1049),
 ('just', 1063),
 ('getting', 1071),
 ('mad', 1095),
 ('very', 1097),
 ('this', 1120),
 ('unacceptable', 1121),
 ('all', 1138),
 ('a', 1193),
 ('old', 1206),
 ('a', 1207),
 ('infuriating', 1259),
 ('a', 1340),
 ('a', 1351),
 ('a', 1361),
 ('an', 1362),
 ('a', 1368),
 ('a', 1382),
 ('very', 1385),
 ('getting', 1435),
 ('just', 1457),
 ('a', 1499),
 ('the', 1527),
 ('the', 1603),
 ('actually', 1693),
 ('by', 1724),
 ('a', 1737),
 ('officially', 1747),
 ('a', 1785),
 ('the', 1797),
 ('the', 1815),
 ('a', 1853),
 ('a', 1854),
 ('a', 1877),
 ('a', 1878),
 ('life', 1916),
 ('a', 1923

## predicts

In [10]:
# Random 5-row sample of the prediction table for visual assessment.
predicts.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
254,670733412878163972,https://pbs.twimg.com/media/CU7seitWwAArlVy.jpg,1,dhole,0.350416,False,hare,0.236661,False,wood_rabbit,0.091133,False
1229,745433870967832576,https://pbs.twimg.com/media/ClhQJUUWAAEVpBX.jpg,1,barrow,0.999962,False,basset,1.4e-05,True,wok,6e-06,False
1918,855459453768019968,https://pbs.twimg.com/media/C98z1ZAXsAEIFFn.jpg,2,Blenheim_spaniel,0.389513,True,Pekinese,0.18822,True,Japanese_spaniel,0.082628,True
202,669683899023405056,https://pbs.twimg.com/media/CUsx8q_WUAA-m4k.jpg,1,Pomeranian,0.998275,True,Chihuahua,0.000605,True,Pekinese,0.000516,True
472,675145476954566656,https://pbs.twimg.com/media/CV6ZOPqWsAA20Uj.jpg,1,Labrador_retriever,0.458746,True,Great_Dane,0.235504,True,Staffordshire_bullterrier,0.116864,True


In [8]:
# Column dtypes and non-null counts of the prediction table.
predicts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


## twitter_api

In [16]:
# Random 5-row sample of the favorite/retweet counts for visual assessment.
twitter_api.sample(5)

Unnamed: 0,id,favorite_count,retweet_count
1399,699691744225525762,10518,4735
1733,679777920601223168,3148,1135
1373,701981390485725185,3531,1012
1115,732726085725589504,3618,908
2224,668291999406125056,239,29


In [9]:
# Column dtypes and non-null counts; note that both count columns are reported as object, not numeric.
twitter_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 4 columns):
id                2356 non-null int64
favorite_count    2356 non-null object
retweet_count     2356 non-null object
expanded_url      2191 non-null object
dtypes: int64(1), object(3)
memory usage: 73.7+ KB


# Clean

Creating Cleaning Copies

In [68]:
# Clean a copy so the frame loaded from disk stays unchanged for comparison.
twitter_archive_clean = twitter_archive.copy()

In [66]:
# Clean a copy so the frame loaded from disk stays unchanged for comparison.
predicts_clean = predicts.copy()

In [67]:
# Clean a copy so the frame loaded from disk stays unchanged for comparison.
twitter_api_clean = twitter_api.copy()

### Twitter Archive

#### Quality
- timestamp is a string and not datetime
- text column contains irrelevant material (a trailing URL)
- missing values in expanded_urls
- There are 42 instances where categorical variables are found in the text, but are not accurately accounted for in the categorical columns
- There are 109 instances where the name column is not accurate, (ex: index 542 name is considered "incredibly" since text before contains "incredible"), and an incorrect name is in place.
    - I recognize that it is an oversight that I cannot test whether or not a name is missed because I do not yet have knowledge of a language processing library.
- Missing values in "in_reply_to_status_id" "in_reply_to_user_id" "retweeted_status_id" "retweeted_status_user_id" "retweeted_status_timestamp"
- Ratings may contain floats. The text needs to be checked again
- change id to string

#### Tidiness
- text column contains a source variable for the tweet
- dog "ages/types" (floofer, pupper etc.) should be a single categorical column

#### Define 
Use pd.to_datetime to convert timestamp from string to datetime

#### Code

In [69]:
# Parse the timestamp strings into datetimes. Read from the cleaning copy rather than the raw
# frame so this step depends on, and modifies, only twitter_archive_clean.
twitter_archive_clean['timestamp'] = pd.to_datetime(twitter_archive_clean['timestamp'])

#### Test

In [32]:
#timestamp was successfully changed to datetime, commented for space
#twitter_archive_clean.info();
# Assert the dtype so the check runs on every execution without printing anything.
assert pd.api.types.is_datetime64_any_dtype(twitter_archive_clean['timestamp'])

#### Define
Use str.replace with a URL regex to remove the unwanted URLs from the text column

#### Code

In [70]:
# Example tweet text before cleaning; note the URL at the end.
twitter_archive.text[0]

"This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU"

In [57]:
# Try the URL-stripping regex on a single tweet before applying it to the whole column.
# Regex source: https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string
#   (http|ftp|https)://                     the scheme followed by ://
#   ([\w_-]+(?:(?:\.[\w_-]+)+))             the host: dot-separated runs of word characters
#   ([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?   optional path/query made of word characters and symbols
text = re.sub(
    pattern=r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
    repl='',
    string=twitter_archive.text[0],
    flags=re.MULTILINE,
)
text

"This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 "

In [76]:
# Strip every http/https/ftp URL from the tweet text.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string by default,
# which would silently leave every URL in place.
twitter_archive_clean['text'] = twitter_archive_clean['text'].str.replace(
    r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?',
    '',
    regex=True,
)

#### Test

In [78]:
# Visual check that the URLs were removed; the trailing ';' suppresses the output to save space.
twitter_archive_clean.text;

In [82]:
# Random 5-row sample of the cleaned frame for a spot check; the trailing ';' suppresses the output.
twitter_archive_clean.sample(5);

# Analysis

(jotting ideas down early so as to not forget)
### Motivating Questions:
- Do higher "ratings" correlate with higher number of retweets? (Must define ratings. Std might be useful)
- Investigate which is more "popular": Cute or Funny. Cute defined by Kindchenschema and funny defined by Benign Violation.
- Within the categories of cuteness and funniness, are more extreme examples more popular? Measured by retweets.
- What are observed characteristics of the top 3 most popular tweets? Is there a theme?

Although the ratings individually do not seem to make much sense due to the numerator exceeding the denominator, perhaps the ratings can be better understood as a decimal score, with higher scores indicating higher approval.