In [171]:
#import statements

import tweepy
from tweepy import OAuthHandler
import numpy as np
import pandas as pd
import requests
import json
from timeit import default_timer as timer

# Gathering Data

### Gathering Data from file directly from download

In [30]:
#loading twitter-archive-enhanced.csv from download

df_csv = pd.read_csv('twitter-archive-enhanced.csv')

### Gathering data programmitically from link provided

In [4]:
#getting url from website provided by Udacity and using requests library to programatically load the file

url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
r = requests.get(url)

In [5]:
#writing contents of file provided from website as image-predictions.tsv

with open(url.split('/')[-1], mode='wb') as file:
    file.write(r.content)

In [101]:
df_tsv = pd.read_csv('image-predictions.tsv', sep='\t')

### Gathering Data from Twitter API

In [7]:
# consumer and access keys/tokens

consumer_key = 'HIDDEN'
consumer_secret = 'HIDDEN'
access_token = 'HIDDEN'
access_secret = 'HIDDEN'

In [8]:
#authorization using consumer key/secret

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)

In [9]:
#authorization with access token/secret

auth.set_access_token(access_token, access_secret)

In [10]:
#setting API variable
api = tweepy.API(auth, wait_on_rate_limit=True)

In [11]:
#grabbing tweet_id values in df_csv dataframe and assigning that in the tweet_ids list

tweet_ids = df_csv.tweet_id.values

In [14]:
#code from twitter-api.py to access each tweet and append each line in a .txt file, this block will be commented 
#out due to the time it takes to run this block

count = 0
fails_dict = {}
start = timer()

#Save each tweet's returned JSON as a new line in a .txt file

with open('tweet_json.txt', 'w') as outfile:
    #This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
            pass
end = timer()
print(end - start)
print(fails_dict)

In [12]:
#creating empty tweets_list then running for loop to reach each line in json file then append it to list.  
#For each iteration, it starts with an empty dictionary, line_dict, then adds the values from each line that
#is loaded from the json file in terms of tweet_id, retweet_count, and favorite_count.
#This code is from the Knowledge forum from Myles C, which he used to help a student create the tweet_json.txt file

tweets_list = []
with open('tweet_json.txt') as file:
    for line in file:
        line_dict = {}
        json_line = json.loads(line)
        line_dict['tweet_id'] = json_line['id_str']
        line_dict['retweet_count'] = json_line['retweet_count']
        line_dict['favorite_count'] = json_line['favorite_count']
        tweets_list.append(line_dict)
        
df_twitter = pd.DataFrame(tweets_list)

# Assessing Data

In [15]:
#assessing df_csv

df_csv

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,666049248165822465,,,2015-11-16 00:24:50 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here we have a 1949 1st generation vulpix. Enj...,,,,https://twitter.com/dog_rates/status/666049248...,5,10,,,,,
2352,666044226329800704,,,2015-11-16 00:04:52 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a purebred Piers Morgan. Loves to Netf...,,,,https://twitter.com/dog_rates/status/666044226...,6,10,a,,,,
2353,666033412701032449,,,2015-11-15 23:21:54 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here is a very happy pup. Big fan of well-main...,,,,https://twitter.com/dog_rates/status/666033412...,9,10,a,,,,
2354,666029285002620928,,,2015-11-15 23:05:30 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is a western brown Mitsubishi terrier. Up...,,,,https://twitter.com/dog_rates/status/666029285...,7,10,a,,,,


In [18]:
df_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   tweet_id                    2356 non-null   int64  
 1   in_reply_to_status_id       78 non-null     float64
 2   in_reply_to_user_id         78 non-null     float64
 3   timestamp                   2356 non-null   object 
 4   source                      2356 non-null   object 
 5   text                        2356 non-null   object 
 6   retweeted_status_id         181 non-null    float64
 7   retweeted_status_user_id    181 non-null    float64
 8   retweeted_status_timestamp  181 non-null    object 
 9   expanded_urls               2297 non-null   object 
 10  rating_numerator            2356 non-null   int64  
 11  rating_denominator          2356 non-null   int64  
 12  name                        2356 non-null   object 
 13  doggo                       2356 

In [23]:
df_csv.doggo.value_counts()

None     2259
doggo      97
Name: doggo, dtype: int64

In [29]:
df_csv[df_csv.expanded_urls.isnull()]

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
30,886267009285017600,8.862664e+17,2281182000.0,2017-07-15 16:51:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@NonWhiteHat @MayhewMayhem omg hello tanner yo...,,,,,12,10,,,,,
55,881633300179243008,8.81607e+17,47384430.0,2017-07-02 21:58:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@roushfenway These are good dogs but 17/10 is ...,,,,,17,10,,,,,
64,879674319642796034,8.795538e+17,3105441000.0,2017-06-27 12:14:36 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@RealKentMurphy 14/10 confirmed,,,,,14,10,,,,,
113,870726314365509632,8.707262e+17,16487760.0,2017-06-02 19:38:25 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@ComplicitOwl @ShopWeRateDogs &gt;10/10 is res...,,,,,10,10,,,,,
148,863427515083354112,8.634256e+17,77596200.0,2017-05-13 16:15:35 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Jack_Septic_Eye I'd need a few more pics to p...,,,,,12,10,,,,,
179,857214891891077121,8.571567e+17,180671000.0,2017-04-26 12:48:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@Marc_IRL pixelated af 12/10,,,,,12,10,,,,,
185,856330835276025856,,,2017-04-24 02:15:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",RT @Jenna_Marbles: @dog_rates Thanks for ratin...,8.563302e+17,66699013.0,2017-04-24 02:13:14 +0000,,14,10,,,,,
186,856288084350160898,8.56286e+17,279281000.0,2017-04-23 23:26:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@xianmcguire @Jenna_Marbles Kardashians wouldn...,,,,,14,10,,,,,
188,855862651834028034,8.558616e+17,194351800.0,2017-04-22 19:15:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@dhmontgomery We also gave snoop dogg a 420/10...,,,,,420,10,,,,,
189,855860136149123072,8.558585e+17,13615720.0,2017-04-22 19:05:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",@s8n You tried very hard to portray this good ...,,,,,666,10,,,,,


In [27]:
df_csv.query('doggo == "doggo"')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
9,890240255349198849,,,2017-07-26 15:59:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Cassie. She is a college pup. Studying...,,,,https://twitter.com/dog_rates/status/890240255...,14,10,Cassie,doggo,,,
43,884162670584377345,,,2017-07-09 21:29:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Yogi. He doesn't have any important dog m...,,,,https://twitter.com/dog_rates/status/884162670...,12,10,Yogi,doggo,,,
99,872967104147763200,,,2017-06-09 00:02:31 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a very large dog. He has a date later. ...,,,,https://twitter.com/dog_rates/status/872967104...,12,10,,doggo,,,
108,871515927908634625,,,2017-06-04 23:56:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Napolean. He's a Raggedy East Nicaragu...,,,,https://twitter.com/dog_rates/status/871515927...,12,10,Napolean,doggo,,,
110,871102520638267392,,,2017-06-03 20:33:19 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Never doubt a doggo 14/10 https://t.co/AbBLh2FZCH,,,,https://twitter.com/animalcog/status/871075758...,14,10,,doggo,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,732375214819057664,,,2016-05-17 01:00:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Kyle (pronounced 'Mitch'). He strives ...,,,,https://twitter.com/dog_rates/status/732375214...,11,10,Kyle,doggo,,,
1141,727644517743104000,,,2016-05-03 23:42:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Here's a doggo struggling to cope with the win...,,,,https://twitter.com/dog_rates/status/727644517...,13,10,,doggo,,,
1156,724771698126512129,,,2016-04-26 01:26:53 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Nothin better than a doggo and a sunset. 11/10...,,,,https://twitter.com/dog_rates/status/724771698...,11,10,,doggo,,,
1176,719991154352222208,,,2016-04-12 20:50:42 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This doggo was initially thrilled when she saw...,,,,https://twitter.com/dog_rates/status/719991154...,10,10,,doggo,,,


In [25]:
df_csv.floofer.value_counts()

None       2346
floofer      10
Name: floofer, dtype: int64

In [26]:
df_csv.query('floofer == "floofer"')

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
46,883360690899218434,,,2017-07-07 16:22:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Grizzwald. He may be the floofiest floofe...,,,,https://twitter.com/dog_rates/status/883360690...,13,10,Grizzwald,,floofer,,
200,854010172552949760,,,2017-04-17 16:34:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...","At first I thought this was a shy doggo, but i...",,,,https://twitter.com/dog_rates/status/854010172...,11,10,,doggo,floofer,,
582,800388270626521089,,,2016-11-20 17:20:08 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Doc. He takes time out of every day to...,,,,https://twitter.com/dog_rates/status/800388270...,12,10,Doc,,floofer,,
774,776218204058357768,,,2016-09-15 00:36:55 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Atlas rolled around in some chalk and now he's...,,,,https://twitter.com/dog_rates/status/776218204...,13,10,,,floofer,,
984,749317047558017024,,,2016-07-02 19:01:20 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Blu. He's a wild bush Floofer. I wish ...,,,,https://twitter.com/dog_rates/status/749317047...,12,10,Blu,,floofer,,
1022,746542875601690625,,,2016-06-25 03:17:46 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",Here's a golden floofer helping with the groce...,,,,https://vine.co/v/5uZYwqmuDeT,11,10,,,floofer,,
1091,737445876994609152,,,2016-05-31 00:49:32 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Just wanted to share this super rare Rainbow F...,,,,https://twitter.com/dog_rates/status/737445876...,13,10,,,floofer,,
1110,733822306246479872,,,2016-05-21 00:50:46 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Moose. He's a Polynesian Floofer. Dapp...,,,,https://twitter.com/dog_rates/status/733822306...,10,10,Moose,,floofer,,
1534,689993469801164801,,,2016-01-21 02:10:37 +0000,"<a href=""http://vine.co"" rel=""nofollow"">Vine -...",Here we are witnessing a rare High Stepping Al...,,,,https://vine.co/v/ienexVMZgi5,12,10,,,floofer,,
1614,685307451701334016,,,2016-01-08 03:50:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Say hello to Petrick. He's an Altostratus Floo...,,,,https://twitter.com/dog_rates/status/685307451...,11,10,Petrick,,floofer,,


In [22]:
df_csv.source.value_counts()

<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64

In [16]:
#assessing df_tsv

df_tsv

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.072010,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True
...,...,...,...,...,...,...,...,...,...,...,...,...
2070,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,basset,0.555712,True,English_springer,0.225770,True,German_short-haired_pointer,0.175219,True
2071,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,paper_towel,0.170278,False,Labrador_retriever,0.168086,True,spatula,0.040836,False
2072,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,malamute,0.078253,True,kelpie,0.031379,True
2073,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,papillon,0.068957,True


In [19]:
df_tsv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   int64  
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [17]:
#assessing df_twitter

df_twitter

Unnamed: 0,tweet_id,retweet_count,favorite_count
0,892420643555336193,7455,35297
1,892177421306343426,5535,30564
2,891815181378084864,3660,22998
3,891689557279858688,7618,38577
4,891327558926688256,8227,36859
...,...,...,...
2326,666049248165822465,39,95
2327,666044226329800704,124,263
2328,666033412701032449,39,108
2329,666029285002620928,41,118


In [20]:
df_twitter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2331 entries, 0 to 2330
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tweet_id        2331 non-null   object
 1   retweet_count   2331 non-null   int64 
 2   favorite_count  2331 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 54.8+ KB


### Observations

#### Quality Issues

- Convert timestamp in df_csv column to date_time format
- Convert tweet_id column in df_tsv to string values
- Convert tweet_id column in df_csv to string values
- Names in p1, p2, and p3 columns in df_tsv to be capitalized
- Replace underscores with spaces in p1, p2, and p3 columns 
- Convert items in source column in df_csv dataframe from html tag to something more readable
- Change rating numerator column to rating_out_of_10 column
- Changing values in floofer column to 'Yes' or 'No' 
- Change floofer column to categorical data type
- Change classification column after it is created to categorical data type

#### Tidiness Issues

- Removing unnecessary columns from df_csv dataset (keeping tweet_id, source, timestamp, text, rating_out_of_10,
    name, classification, and floofer), which should fix the majority of the missing values
    
- Combining doggo, pupper, and puppo into one column: classification

- Merging dataframes to address uneven amount of user_ids

# Cleaning Data

In [103]:
#copying each dataframe to test cleaning process

df_csv_clean = df_csv
df_tsv_clean = df_tsv
df_twitter_clean = df_twitter

### Define -- Tidiness Issues / Missing Values

- Combine doggo, pupper, and pupo into one column, classification by adding a new column into df_csv dataframe: classification
    Next create a for loop that goes through the doggo, pupper, and puppo columns and adds that value into the classification
    column if it has the doggo, pupper or puppo name.  After the loop is done, then remove the preceding three columns.
- Remove unnecessary columns from df_csv_dataset by assigning the df_csv_clean dataframe the following columns:
    keeping tweet_id, source, timestamp, text, rating_numerator, name, classification, and floofer

### Code -- Tidiness Issues / Missing Values

In [39]:
#adding classification column to dataframe by having it equal to the puppo column
df_csv_clean['classification'] = df_csv_clean['puppo']
df_csv_clean.sample()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo,classification
533,807621403335917568,,,2016-12-10 16:22:02 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Ollie Vue. He was a 3 legged pupper on...,,,,https://twitter.com/dog_rates/status/807621403...,14,10,Ollie,,,pupper,,


In [51]:
# for loop to place values in doggo, pupper, and puppo columns into classification column if the respective columns equal
# that value, else it will equal "unknown"

for x in range(df_csv_clean.shape[0]):
    if df_csv_clean['doggo'][x] == 'doggo':
        df_csv_clean['classification'][x] = df_csv_clean['doggo'][x]
    elif df_csv_clean['puppo'][x] == 'puppo':
        df_csv_clean['classification'][x] = df_csv_clean['puppo'][x]
    elif df_csv_clean['pupper'][x] == 'pupper':
        df_csv_clean['classification'][x] = df_csv_clean['pupper'][x]
    else:
        df_csv_clean['classification'][x] = 'unknown'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_csv_clean['classification'][x] = 'unknown'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_csv_clean['classification'][x] = df_csv_clean['doggo'][x]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_csv_clean['classification'][x] = df_csv_clean['puppo'][x]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [52]:
#testing df_csv_classification column using value_counts() method to ensure classification column has the pupper,
#doggo, and puppo values

df_csv_clean.classification.value_counts()

unknown    1985
pupper      245
doggo        97
puppo        29
Name: classification, dtype: int64

In [57]:
#dropping the puppo, pupper, and doggo columns

df_csv_clean.drop('puppo', axis = 1, inplace = True)
df_csv_clean.drop('pupper', axis = 1, inplace = True)
df_csv_clean.drop('doggo', axis = 1, inplace = True)

In [62]:
#looking at columns list at remaining columns

df_csv_clean.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'floofer', 'classification'],
      dtype='object')

In [63]:
#Keeping tweet_id, timestamp, source, text, rating_numerator, name, floofer, and classification columns

df_csv_clean = df_csv_clean[['tweet_id', 'timestamp', 'source', 'text', 'rating_numerator', 'name', 'floofer', 'classification']]

### Test -- Tidiness Issues / Missing Values

In [64]:
#testing resultant dataframe

df_csv_clean.sample()

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,name,floofer,classification
227,848324959059550208,2017-04-02 00:03:26 +0000,"<a href=""http://twitter.com/download/iphone"" r...",Meet Odin. He's supposed to be giving directio...,12,Odin,,unknown


### Define -- Quality Issues

- Convert timestamp in df_csv column to date_time format using pd.to_datetime()
- Convert tweet_id column in df_tsv to string values using astype(str)
- Convert tweet_id column in df_csv to string values using astype(str)
- Names in p1, p2, and p3 columns in df_tsv to be more consistent naming using str.title() to capitalize all items especially
    dog breeds
- Replace underscores with spaces in p1, p2, and p3 columns by using str.replace()
- Convert items in source column in df_csv dataframe from html tag to something more readable using str.split() and a for
    loop to filter out the values in the html tags
- Change rating numerator column to rating_out_of_10 column by using the rename method
- Changing values in floofer column to 'Yes' or 'No' by running a for loop in that column and changing floofer to yes
    and none to no
- Change floofer column to categorical data type using astype()
- Change classification column after it is created to categorical data type by using astype()

### Code -- Quality Issues

In [71]:
#converting timestamp in df_csv_clean column to date_time format using pd.to_datetime()

df_csv_clean.timestamp = pd.to_datetime(df_csv_clean.timestamp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [75]:
#testing conversion

df_csv_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   tweet_id          2356 non-null   int64              
 1   timestamp         2356 non-null   datetime64[ns, UTC]
 2   source            2356 non-null   object             
 3   text              2356 non-null   object             
 4   rating_numerator  2356 non-null   int64              
 5   name              2356 non-null   object             
 6   floofer           2356 non-null   object             
 7   classification    2356 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(2), object(5)
memory usage: 147.4+ KB


In [104]:
#converting tweet_id column in df_tsv_clean to string value

df_tsv_clean.tweet_id = df_tsv_clean.astype(str)

In [105]:
#testing conversion

df_tsv_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   tweet_id  2075 non-null   object 
 1   jpg_url   2075 non-null   object 
 2   img_num   2075 non-null   int64  
 3   p1        2075 non-null   object 
 4   p1_conf   2075 non-null   float64
 5   p1_dog    2075 non-null   bool   
 6   p2        2075 non-null   object 
 7   p2_conf   2075 non-null   float64
 8   p2_dog    2075 non-null   bool   
 9   p3        2075 non-null   object 
 10  p3_conf   2075 non-null   float64
 11  p3_dog    2075 non-null   bool   
dtypes: bool(3), float64(3), int64(1), object(5)
memory usage: 152.1+ KB


In [158]:
#converting tweet_id column in df_tsv_clean to string value

df_csv_clean.tweet_id = df_csv_clean.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [159]:
#testing conversion

df_csv_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   tweet_id          2356 non-null   object             
 1   timestamp         2356 non-null   datetime64[ns, UTC]
 2   source            2356 non-null   object             
 3   text              2356 non-null   object             
 4   rating_out_of_10  2356 non-null   int64              
 5   name              2356 non-null   object             
 6   floofer           2356 non-null   object             
 7   classification    2356 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(1), object(6)
memory usage: 147.4+ KB


In [106]:
#Converting values in p1, p2, and p3 columns in df_tsv_clean to capital names using str.title()

df_tsv_clean.p1 = df_tsv_clean.p1.str.title()
df_tsv_clean.p2 = df_tsv_clean.p2.str.title()
df_tsv_clean.p3 = df_tsv_clean.p3.str.title()

In [107]:
#Replacing underscores with spaces in p1, p2, and p3 columns using str.replace()

df_tsv_clean.p1 = df_tsv_clean.p1.str.replace('_', ' ')
df_tsv_clean.p2 = df_tsv_clean.p2.str.replace('_', ' ')
df_tsv_clean.p3 = df_tsv_clean.p3.str.replace('_', ' ')

In [108]:
df_tsv_clean.sample(10)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1389,766423258543644672,https://pbs.twimg.com/media/CqLh4yJWcAAHomv.jpg,2,Keeshond,0.995823,True,Pomeranian,0.003897,True,Norwegian Elkhound,0.000253,True
364,672884426393653248,https://pbs.twimg.com/media/CVaQ0M4UsAAki3t.jpg,1,Tusker,0.12241,False,Warthog,0.11987,False,Water Buffalo,0.105856,False
704,684926975086034944,https://pbs.twimg.com/media/CYFZXdiU0AAc_kw.jpg,1,Labrador Retriever,0.769412,True,Golden Retriever,0.144893,True,Lion,0.02144,False
89,667534815156183040,https://pbs.twimg.com/media/CUOPYI5UcAAj_nO.jpg,1,Pembroke,0.435254,True,Cardigan,0.307407,True,Cocker Spaniel,0.033158,True
1093,719704490224398336,https://pbs.twimg.com/media/CfznaXuUsAAH-py.jpg,1,Home Theater,0.059033,False,Window Shade,0.038299,False,Bathtub,0.035528,False
1571,794983741416415232,https://pbs.twimg.com/media/CvT6IV6WEAQhhV5.jpg,3,Schipperke,0.363272,True,Kelpie,0.197021,True,Norwegian Elkhound,0.151024,True
141,668567822092664832,https://pbs.twimg.com/media/CUc64knWoAkZt70.jpg,1,Shih-Tzu,0.985649,True,Lhasa,0.007078,True,Pekinese,0.003053,True
1291,751538714308972544,https://pbs.twimg.com/media/Cm4AeG8XEAAulD2.jpg,2,Labrador Retriever,0.516257,True,Golden Retriever,0.210839,True,Dingo,0.162022,False
1935,859924526012018688,https://pbs.twimg.com/media/C-8QypZXcAAekaF.jpg,1,French Bulldog,0.254587,True,Staffordshire Bullterrier,0.192558,True,Hog,0.10027,False
1777,828381636999917570,https://pbs.twimg.com/media/C38Asz1WEAAvzj3.jpg,1,Bedlington Terrier,0.392535,True,Labrador Retriever,0.089022,True,Clumber,0.0818,True


In [135]:
#convert values in df_csv_clean source column using str.split() and for loop

df_csv_clean.source = df_csv_clean.source.str.split(pat='>', n=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [139]:
#for loop iterating through each value in source column and assigning it to the second half of the split string as well as
#trimming off the </a> tag

for x in range(df_csv_clean.shape[0]):
    df_csv_clean.source[x] = df_csv_clean.source[x][1][:-4]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_csv_clean.source[x] = df_csv_clean.source[x][1][:-4]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [142]:
#testing from a sample of 5

df_csv_clean.sample(5)

Unnamed: 0,tweet_id,timestamp,source,text,rating_numerator,name,floofer,classification
856,764259802650378240,2016-08-13 00:38:30+00:00,Twitter for iPhone,This is Kota and her son Benedict. She doesn't...,10,Kota,,unknown
2190,668960084974809088,2015-11-24 01:11:27+00:00,Twitter for iPhone,Meet Jaycob. He got scared of the vacuum. Hide...,10,Jaycob,,unknown
2046,671520732782923777,2015-12-01 02:46:33+00:00,Twitter for iPhone,Meet Alejandro. He's an extremely seductive pu...,10,Alejandro,,unknown
282,839239871831150596,2017-03-07 22:22:32+00:00,Twitter for iPhone,This is Odie. He's big. 13/10 would attempt to...,13,Odie,,unknown
1059,741743634094141440,2016-06-11 21:27:17+00:00,Twitter for iPhone,Meet Aqua. She's a sandy pupper. Not sure how ...,11,Aqua,,pupper


In [None]:
#Changing rating_numerator column to rating_out_of_10 column using rename function

In [145]:
df_csv_clean.rename(columns = {'rating_numerator' : 'rating_out_of_10'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [146]:
df_csv_clean.sample()

Unnamed: 0,tweet_id,timestamp,source,text,rating_out_of_10,name,floofer,classification
310,835309094223372289,2017-02-25 02:03:02+00:00,Twitter for iPhone,RT @dog_rates: So this just changed my life. 1...,13,,,unknown


In [147]:
#Changing values in floofer column to 'Yes' or 'No' by running a for loop in that column and changing floofer to yes 
#and none to no

for x in range(df_csv_clean.shape[0]):
    if df_csv_clean.floofer[x] == 'floofer':
        df_csv_clean.floofer[x] = 'Yes'
    else:
        df_csv_clean.floofer[x] = 'No'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_csv_clean.floofer[x] = 'No'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_csv_clean.floofer[x] = 'Yes'


In [150]:
df_csv_clean.sample(5)

Unnamed: 0,tweet_id,timestamp,source,text,rating_out_of_10,name,floofer,classification
1732,679828447187857408,2015-12-24 00:58:27+00:00,Twitter for iPhone,Everybody look at this beautiful pupper 13/10 ...,13,,No,pupper
878,760656994973933572,2016-08-03 02:02:14+00:00,Twitter for iPhone,This is Rose. Her face is stuck like that. 11/...,11,Rose,No,unknown
639,793256262322548741,2016-11-01 01:00:05+00:00,Twitter for iPhone,Oh h*ck look at this spookling right here. Fri...,12,,No,unknown
645,793165685325201412,2016-10-31 19:00:10+00:00,Twitter for iPhone,This is Benji. He's Air Bud. It's a low effort...,12,Benji,No,unknown
544,805932879469572096,2016-12-06 00:32:26+00:00,Twitter for iPhone,This is Major. He put on a tie for his first r...,12,Major,No,unknown


In [169]:
#Changing classification column in df_csv_clean column to category data type

df_csv_clean.classification = df_csv_clean.classification.astype('category')

In [170]:
df_csv_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2059 entries, 0 to 2058
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   tweet_id          2059 non-null   object             
 1   timestamp         2059 non-null   datetime64[ns, UTC]
 2   source            2059 non-null   object             
 3   text              2059 non-null   object             
 4   rating_out_of_10  2059 non-null   int64              
 5   name              2059 non-null   object             
 6   floofer           2059 non-null   object             
 7   classification    2059 non-null   category           
 8   jpg_url           2059 non-null   object             
 9   img_num           2059 non-null   int64              
 10  p1                2059 non-null   object             
 11  p1_conf           2059 non-null   float64            
 12  p1_dog            2059 non-null   bool               
 13  p2 

In [152]:
#Merging df_tsv_clean with df_twitter_clean

df_tsv_clean = df_tsv_clean.merge(df_twitter_clean, on = 'tweet_id')

In [166]:
df_tsv_clean

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweet_count,favorite_count
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh Springer Spaniel,0.465074,True,Collie,0.156665,True,Shetland Sheepdog,0.061428,True,447,2355
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,Redbone,0.506826,True,Miniature Pinscher,0.074192,True,Rhodesian Ridgeback,0.072010,True,41,118
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German Shepherd,0.596461,True,Malinois,0.138584,True,Bloodhound,0.116197,True,39,108
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian Ridgeback,0.408143,True,Redbone,0.360687,True,Miniature Pinscher,0.222752,True,124,263
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,Miniature Pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True,39,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,891327558926688256,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,Basset,0.555712,True,English Springer,0.225770,True,German Short-Haired Pointer,0.175219,True,8227,36859
2055,891689557279858688,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,Paper Towel,0.170278,False,Labrador Retriever,0.168086,True,Spatula,0.040836,False,7618,38577
2056,891815181378084864,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,Chihuahua,0.716012,True,Malamute,0.078253,True,Kelpie,0.031379,True,3660,22998
2057,892177421306343426,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,Chihuahua,0.323581,True,Pekinese,0.090647,True,Papillon,0.068957,True,5535,30564


In [161]:
#Merging df_csv_clean with df_tsv_clean

df_csv_clean = df_csv_clean.merge(df_tsv_clean, on = 'tweet_id')

In [165]:
#Looking at cleaned dataframe

df_csv_clean.head()

Unnamed: 0,tweet_id,timestamp,source,text,rating_out_of_10,name,floofer,classification,jpg_url,img_num,...,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog,retweet_count,favorite_count
0,892420643555336193,2017-08-01 16:23:56+00:00,Twitter for iPhone,This is Phineas. He's a mystical boy. Only eve...,13,Phineas,No,unknown,https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg,1,...,0.097049,False,Bagel,0.085851,False,Banana,0.07611,False,7455,35297
1,892177421306343426,2017-08-01 00:17:27+00:00,Twitter for iPhone,This is Tilly. She's just checking pup on you....,13,Tilly,No,unknown,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,1,...,0.323581,True,Pekinese,0.090647,True,Papillon,0.068957,True,5535,30564
2,891815181378084864,2017-07-31 00:18:03+00:00,Twitter for iPhone,This is Archie. He is a rare Norwegian Pouncin...,12,Archie,No,unknown,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,1,...,0.716012,True,Malamute,0.078253,True,Kelpie,0.031379,True,3660,22998
3,891689557279858688,2017-07-30 15:58:51+00:00,Twitter for iPhone,This is Darla. She commenced a snooze mid meal...,13,Darla,No,unknown,https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg,1,...,0.170278,False,Labrador Retriever,0.168086,True,Spatula,0.040836,False,7618,38577
4,891327558926688256,2017-07-29 16:00:24+00:00,Twitter for iPhone,This is Franklin. He would like you to stop ca...,12,Franklin,No,unknown,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,2,...,0.555712,True,English Springer,0.22577,True,German Short-Haired Pointer,0.175219,True,8227,36859


In [168]:
#checking datatypes of cleaned info

df_csv_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2059 entries, 0 to 2058
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   tweet_id          2059 non-null   object             
 1   timestamp         2059 non-null   datetime64[ns, UTC]
 2   source            2059 non-null   object             
 3   text              2059 non-null   object             
 4   rating_out_of_10  2059 non-null   int64              
 5   name              2059 non-null   object             
 6   floofer           2059 non-null   object             
 7   classification    2059 non-null   object             
 8   jpg_url           2059 non-null   object             
 9   img_num           2059 non-null   int64              
 10  p1                2059 non-null   object             
 11  p1_conf           2059 non-null   float64            
 12  p1_dog            2059 non-null   bool               
 13  p2 

In [167]:
#Saving Cleaned Dataframe as twitter_archive_master.csv

df_csv_clean.to_csv('twitter_archive_master.csv')