# Project: Wrangling and Analyze Data

## Data Gathering

1. Directly download the WeRateDogs Twitter archive data (twitter-archive-enhanced.csv)

In [1]:
# import modules; pandas, numpy, requests, os, tweepy, beautiful soup
import json 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import tweepy
import os
from bs4 import BeautifulSoup
%matplotlib inline

In [2]:
# reads the twitter-archive-enhanced.csv file into a dataframe assigned to df_archive
df_archive = pd.read_csv('twitter-archive-enhanced.csv')
df_archive.head()

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
0,892420643555336193,,,2017-08-01 16:23:56 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Phineas. He's a mystical boy. Only eve...,,,,https://twitter.com/dog_rates/status/892420643...,13,10,Phineas,,,,
1,892177421306343426,,,2017-08-01 00:17:27 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Tilly. She's just checking pup on you....,,,,https://twitter.com/dog_rates/status/892177421...,13,10,Tilly,,,,
2,891815181378084864,,,2017-07-31 00:18:03 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Archie. He is a rare Norwegian Pouncin...,,,,https://twitter.com/dog_rates/status/891815181...,12,10,Archie,,,,
3,891689557279858688,,,2017-07-30 15:58:51 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Darla. She commenced a snooze mid meal...,,,,https://twitter.com/dog_rates/status/891689557...,13,10,Darla,,,,
4,891327558926688256,,,2017-07-29 16:00:24 +0000,"<a href=""http://twitter.com/download/iphone"" r...",This is Franklin. He would like you to stop ca...,,,,https://twitter.com/dog_rates/status/891327558...,12,10,Franklin,,,,


###### 2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [3]:
# Create the download folder if it does not exist yet
folder_name = 'new-folder'
os.makedirs(folder_name, exist_ok=True)

# Request the image-predictions file; the timeout keeps the cell from hanging forever
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the .tsv file
response.raise_for_status()

# Write the response body to <folder_name>/<last segment of the URL>
with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)
    

In [4]:
# Read the image-predictions.tsv file into the dataframe df_image_pred.
# The download cell saves the file inside folder_name, so read it from there
# rather than from the working directory.
df_image_pred = pd.read_csv(os.path.join(folder_name, 'image-predictions.tsv'), sep='\t')
df_image_pred.head()

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True
1,666029285002620928,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,1,redbone,0.506826,True,miniature_pinscher,0.074192,True,Rhodesian_ridgeback,0.07201,True
2,666033412701032449,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,1,German_shepherd,0.596461,True,malinois,0.138584,True,bloodhound,0.116197,True
3,666044226329800704,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,1,Rhodesian_ridgeback,0.408143,True,redbone,0.360687,True,miniature_pinscher,0.222752,True
4,666049248165822465,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,1,miniature_pinscher,0.560311,True,Rottweiler,0.243682,True,Doberman,0.154629,True


###### 3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [None]:
import os
import json
from timeit import default_timer as timer

import tweepy
from tweepy import OAuthHandler

# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file.
# Credentials are read from environment variables so real keys are never saved in the
# notebook; the 'HIDDEN' placeholder is the fallback when a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'HIDDEN')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'HIDDEN')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'HIDDEN')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'HIDDEN')

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True)

# Tweepy 4 renamed TweepError to TweepyException; use whichever this install provides
tweepy_error = getattr(tweepy, 'TweepError', None) or tweepy.TweepyException

# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
# (df_archive is the DataFrame read from twitter-archive-enhanced.csv)
tweet_ids = df_archive.tweet_id.values

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
count = 0
fails_dict = {}  # tweet_id -> exception raised while fetching that tweet
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        count += 1
        print(str(count) + ": " + str(tweet_id))
        try:
            tweet = api.get_status(tweet_id, tweet_mode='extended')
            print("Success")
            json.dump(tweet._json, outfile)
            outfile.write('\n')
        except tweepy_error as e:
            # Record the failure and carry on with the next tweet id
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print(end - start)
print(fails_dict)

###### I could not get a Twitter developer account; therefore, I will use the tweet-json.txt file as an alternative.

###### The cell below opens the text file, parses each line as JSON, and then creates a dataframe from the parsed records.

In [None]:
# Load the JSON text file (one tweet object per line) into the dataframe df_tweets

tweets = []
with open('tweet-json.txt', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (for example a trailing newline); json.loads cannot parse them
        if not line.strip():
            continue
        data = json.loads(line)
        tweets.append(data)
df_tweets = pd.DataFrame(tweets)

In [None]:
#Read the tweet dataframe
df_tweets.head(3)

###### The columns id, retweet_count and favorite_count are extracted from the tweets dataframe and assigned to df_required_columns

In [None]:
# Keep only the tweet id and its two engagement counts in df_required_columns

required_columns = ['id', 'retweet_count', 'favorite_count']
df_required_columns = df_tweets[required_columns]
df_required_columns.head(3)

In [None]:
 df_required_columns.describe()

In [None]:
df_archive.info()

#### some rating_numerator values were wrongly extracted, wrong datatype for rating_numerator (should be float instead of int), the value 1776 seems like an outlier

In [None]:
df_archive.rating_numerator.unique()

In [None]:
# No capture group in the pattern: str.contains only tests for a match, and a
# capturing group makes pandas emit a "pattern has match groups" UserWarning
df_archive[df_archive.text.str.contains(r"\d+\.\d*\/\d+")][['text', 'rating_numerator']].values

###### Some ratings are retweets

In [None]:
#checking the retweeted_status_id for non-null values

df_archive[df_archive.retweeted_status_id.notna()].shape

###### Incorrect names for some dogs

In [None]:
# convert to list the name colum where the names are lower case

list(df_archive[df_archive.name.str.islower()].name)

#### Not all images are dog images

In [None]:
# Select the predictions whose top guess (p1) is flagged as not being a dog,
# i.e. the rows where p1_dog equals False

df_image_pred_sorted = df_image_pred.sort_values(by='tweet_id', ascending=False)
is_not_dog = df_image_pred_sorted['p1_dog'].eq(False)
not_dog = df_image_pred_sorted[is_not_dog]

In [None]:
not_dog.shape

###### Check if there is an entry for both doggo and floofer

In [None]:
#declaring a function to compare the occurence of more than one dog stages in different stage columns

def compare_dog_stages(a="", b="", df=None):
    """Return the rows that are flagged with both dog stage ``a`` and dog stage ``b``.

    A row is flagged with a stage when the column named after the stage holds
    the stage name itself (e.g. column 'doggo' contains 'doggo').

    Parameters
    ----------
    a, b : str
        Names of the two stage columns to compare.
    df : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df_archive``.

    Returns
    -------
    pandas.DataFrame
        The subset of rows in which both stages are set.
    """
    # df_archive is only looked up when no frame is passed explicitly
    frame = df_archive if df is None else df
    return frame[(frame[a] == a) & (frame[b] == b)]

# Rows flagged as both doggo and floofer; display the stored result
doggo_floofer = compare_dog_stages('doggo', 'floofer')
doggo_floofer

In [None]:
list(doggo_floofer.text)

In [None]:
doggo_floofer = compare_dog_stages('doggo', 'floofer')

###### Check if there is an entry for both doggo and pupper

In [None]:
doggo_pupper = compare_dog_stages('doggo', 'pupper')

In [None]:
doggo_pupper

In [None]:
compare_dog_stages('doggo', 'pupper').text.values

###### Check if there is an entry for both doggo and puppo

In [None]:
doggo_puppo = compare_dog_stages('doggo', 'puppo')

In [None]:
doggo_puppo

###### Check if there is an entry for both floofer and pupper

In [None]:
floofer_pupper = compare_dog_stages('floofer', 'pupper')

In [None]:
floofer_pupper

###### Check if there is an entry for both floofer and puppo

In [None]:
floofer_puppo = compare_dog_stages('floofer', 'puppo')

In [None]:
floofer_puppo

###### Check if there is an entry for both pupper and puppo

In [None]:
pupper_puppo = compare_dog_stages('pupper', 'puppo')

In [None]:
pupper_puppo

## Assessing Data


### Quality issues
   twitter-archive-enhanced.csv file:
1. Missing values in some columns, such as: in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp and expanded_urls.

2. Some of the text information was misinterpreted, resulting in a dog having two stage classifications.

3. some rating_numerator values were wrongly extracted, wrong datatype for rating_numerator (should be float instead of int), the value 1776 seems like an outlier
4. Inconsistent representation of date format and column name in this file and the extracted twitter api file


5. Incorrect name for some dogs.

6. Some ratings are retweets (about 181 rows).
    
    image_predictions.tsv

7. not all the images are dogs

8. Inconsistent representation of tweet ID columns for this file and the extracted twitter api file



### Tidiness issues
1. The data collected from the Twitter API should not be in a separate table. The selected columns should be included in the twitter-archive-enhanced table. The image-predictions table should also be merged into the twitter-archive-enhanced table.

2. The variables doggo, floofer, pupper and puppo should be combined into a single column named "dog_stage"

## Cleaning Data


In [None]:
# Make copies of original pieces of data
clean_archive = df_archive.copy()
clean_image_pred = df_image_pred.copy()
clean_required_columns = df_required_columns.copy()

In [None]:
clean_archive.head()

### Issue #1: missing rows for some columns such as: in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp and expanded_urls.

#### Define: drop the columns that have missing values

#### Code

In [None]:
clean_archive.drop(columns = ['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id', 
                              'retweeted_status_user_id', 'retweeted_status_timestamp', 'expanded_urls'], inplace = True)

#### Test

In [None]:
clean_archive.head()

### Issue #2:Some of the texts information are mis-interpreted resulting to a dog having two stage classifications

#### Define: Set the doggo value in the doggo column to none for the 'doggo_floofer' dataframe

#### Code

In [None]:
# function to check the occurrence of more than one dog stage in different stage columns
# of the clean_archive dataframe

def clean_compare_dog_stages(a="", b="", df=None):
    """Return the rows that are flagged with both dog stage ``a`` and dog stage ``b``.

    A row is flagged with a stage when the column named after the stage holds
    the stage name itself (e.g. column 'doggo' contains 'doggo').

    Parameters
    ----------
    a, b : str
        Names of the two stage columns to compare.
    df : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``clean_archive``.

    Returns
    -------
    pandas.DataFrame
        The subset of rows in which both stages are set.
    """
    # clean_archive is only looked up when no frame is passed explicitly
    frame = clean_archive if df is None else df
    return frame[(frame[a] == a) & (frame[b] == b)]

# Snapshot every pair of stages before any correction is applied
clean_doggo_floofer = clean_compare_dog_stages('doggo', 'floofer')
clean_doggo_pupper = clean_compare_dog_stages('doggo', 'pupper')
clean_doggo_puppo = clean_compare_dog_stages('doggo', 'puppo')
clean_floofer_pupper = clean_compare_dog_stages('floofer', 'pupper')
clean_floofer_puppo = clean_compare_dog_stages('floofer', 'puppo')
clean_pupper_puppo = clean_compare_dog_stages('pupper', 'puppo')

In [None]:
clean_doggo_floofer

In [None]:
list(clean_doggo_floofer.text)

In [None]:
clean_archive.loc[200, 'doggo'] = 'None'

In [None]:
clean_archive.loc[200].doggo

#### Test

In [None]:
clean_archive.loc[200]

In [None]:
list(clean_doggo_pupper.text)

Most of the texts in this dataframe indicate that two dogs were posted (a doggo and a pupper). Three texts were misinterpreted because both words, 'doggo' and 'pupper', appear in the text.

In [None]:
clean_doggo_pupper.index

The indexes of the mis-interpreted texts: 460, 705, 956,

In [None]:
# Set the pupper value of rows 460 and 705 to 'None'
for row_label in (460, 705):
    clean_archive.loc[row_label, 'pupper'] = 'None'

In [None]:
clean_archive.loc[460]

In [None]:
clean_archive.loc[705]

delete the other rows

In [None]:
# Drop every row that is still flagged as both doggo and pupper. Deriving the labels
# from the condition (the same one the Test cell checks) keeps this cell safe to re-run;
# a hard-coded label list raises KeyError once the rows are gone.
two_dog_rows = clean_archive[(clean_archive['doggo'] == 'doggo') & (clean_archive['pupper'] == 'pupper')].index
clean_archive.drop(index = two_dog_rows, inplace = True)

#### Test

In [None]:
clean_archive[(clean_archive['doggo'] == 'doggo') & (clean_archive['pupper'] == 'pupper')]

In [None]:
clean_doggo_puppo

In [None]:
list(clean_doggo_puppo.text)

This is just puppo not puppo and doggo

In [None]:
clean_doggo_puppo.index

In [None]:
clean_archive.loc[191, 'doggo'] = 'None'

#### Test

In [None]:
clean_archive.loc[191]

In [None]:
clean_floofer_pupper

In [None]:
clean_floofer_puppo

In [None]:
clean_pupper_puppo

### Issue #3: some rating_numerator values were wrongly extracted, wrong datatype for rating_numerator (should be float instead of int), the value 1776 seems like an outlier

#### Define: Convert the datatype of rating_numerator to float

In [None]:
clean_archive.rating_numerator = clean_archive.rating_numerator.astype(float)

#### Test

In [None]:
clean_archive.info()

#### Define: Manually extract the correct rating_numerator for the affected columns

#### Code

In [None]:
# No capture group in the pattern: str.contains only tests for a match, and a
# capturing group makes pandas emit a "pattern has match groups" UserWarning
clean_archive[clean_archive.text.str.contains(r"\d+\.\d*\/\d+")][['text', 'rating_numerator']]

In [None]:
# Rows whose text contains a decimal rating such as 13.5/10. The pattern has no
# capture group, which avoids the pandas "pattern has match groups" UserWarning.
wrong_rating = clean_archive[clean_archive.text.str.contains(r"\d+\.\d*\/\d+")][['text', 'rating_numerator']]

In [None]:
wrong_rating

In [None]:
wrong_rating.index

In [None]:
# Define a function to manually change the rating_numerator to the correct value

def correct_rating_numerator(index, numerator):
    """Overwrite the rating_numerator of one row of clean_archive.

    index     -- row label in clean_archive
    numerator -- value to store in the rating_numerator column
    """
    target_column = 'rating_numerator'
    clean_archive.loc[index, target_column] = numerator

In [None]:
# Row label -> corrected numerator, applied one at a time in this order
corrected_numerators = {
    38: 13.5,
    454: 9.75,
    502: 11.27,
    1161: 9.5,
    1177: 11.26,
}
for row_label, numerator in corrected_numerators.items():
    correct_rating_numerator(row_label, numerator)

#### Test

In [None]:
clean_archive.loc[502]

#### Define: Replace the value 1776 with 176, on the assumption that the doubled "7" is the result of an error.

#### Code

In [None]:
clean_archive.rating_numerator = clean_archive.rating_numerator.replace(1776, 176)

In [None]:
clean_archive[clean_archive.rating_numerator == 1776]

In [None]:
clean_archive[clean_archive.rating_numerator == 176]

#### Test

### Issue #4: Inconsistent representation of date format and column name in this file and the extracted twitter api file

#### Define: rename the "timestamp" column as "created_at" in order to match the column name in the extracted twitter api data

#### Code

In [None]:
clean_archive.rename(columns = {'timestamp' :'created_at'}, inplace = True)

#### Test

In [None]:
clean_archive.columns

### Issue #5: Incorrect name for some dogs

#### Define: Drop the rows having incorrect dog names

#### Code

In [None]:
# create a dataframe of lowercase name and assign it to the variable 'name_islower'

name_islower = clean_archive[clean_archive.name.str.islower()]  

In [None]:
#get the indexes of this dataframe
name_islower_index = name_islower.index

In [None]:
#remove this dataframe from the 'clean_archive' dataframe by dropping using index argument

clean_archive.drop(name_islower_index, inplace = True)

#### Test

In [None]:
clean_archive[clean_archive.name.str.islower()]

### Issue #6: some ratings are retweets; about 181 count ratings

#### Define: drop the rows that are retweets. Dropping the 'retweeted_status_id' column in Issue #1 removed only that column, not the retweet rows, so the rows are removed explicitly below.

##  Removing retweets 

In [None]:
# Identify retweets by their non-null retweeted_status_id. That column was dropped
# from clean_archive earlier, so look the tweet ids up in the untouched df_archive.
# Searching the text for 'RT' would also flag original tweets that merely contain
# those two letters (for example inside an upper-case word).
retweet_ids = df_archive.loc[df_archive.retweeted_status_id.notna(), 'tweet_id']
retweet_posts = clean_archive.tweet_id.isin(retweet_ids)

In [None]:
# Collect the labels of the retweet rows, then remove those rows in place
retweet_index = clean_archive.loc[retweet_posts].index
clean_archive.drop(index=retweet_index, inplace=True)

In [None]:
clean_archive.info()

### Issue #7: not all the images are dogs

#### Define: drop the rows whose images are not dog images

#### Code

In [None]:
not_dog_index = not_dog.index

In [None]:
clean_image_pred.drop(index = not_dog_index, inplace = True)

#### Test

In [None]:
clean_image_pred[clean_image_pred['p1_dog'] == False]

In [None]:
not_dog_id = not_dog.tweet_id

In [None]:
e = clean_archive.tweet_id.isin(not_dog_id)

In [None]:
clean_archive[e].shape

In [None]:
# Collect the labels of the rows whose tweet is a non-dog image, then remove them in place
e_index = clean_archive.loc[e].index
clean_archive.drop(index=e_index, inplace=True)

In [None]:
clean_archive.shape

In [None]:
# Recompute the mask on the current frame: `e` was built before the rows were dropped,
# so its index no longer matches clean_archive. The result should be empty.
clean_archive[clean_archive.tweet_id.isin(not_dog_id)]

### Issue #8:  Inconsistent representation of tweet ID columns for this file and the extracted twitter api file

#### Define: rename the tweet ID column of the extracted twitter api file to 'tweet_id'

#### Code

In [None]:
clean_required_columns.rename(columns = {'id' : 'tweet_id'}, inplace = True)

#### Test

In [None]:
clean_required_columns.columns

## Tidiness Issue

### Issue #1: The data collected from the Twitter API should not be in a separate table. The selected columns should be included in the twitter-archive-enhanced table

#### Define: Merge the dataframe collected via the Twitter API with the twitter archive enhanced dataframe

#### Code

In [None]:
# Left join: keep every archive row and attach retweet_count / favorite_count by tweet_id
clean_archive = pd.merge(clean_archive, clean_required_columns, how='left', on='tweet_id')

In [None]:
# Left join: keep every archive row and attach the image-prediction columns by tweet_id
clean_archive = pd.merge(clean_archive, clean_image_pred, how='left', on='tweet_id')

#### Test

In [None]:
clean_archive.head()

In [None]:
clean_archive.shape

### Issue #2: The variables: doggo, floofer, pupper, puppo should be in a category named "dog_stage"

#### Define: create new column called 'dog_stage' and concatenate the values of the doggo, floofer, pupper, puppo in it

#### Code

In [None]:
# For each stage column, keep only the cells whose value is the stage name itself
doggo_true = clean_archive.loc[clean_archive['doggo'] == 'doggo', 'doggo']
floofer_true = clean_archive.loc[clean_archive['floofer'] == 'floofer', 'floofer']
pupper_true = clean_archive.loc[clean_archive['pupper'] == 'pupper', 'pupper']
puppo_true = clean_archive.loc[clean_archive['puppo'] == 'puppo', 'puppo']
tweet_id = clean_archive['tweet_id']

In [None]:
# Stack the four per-stage series; the assignment aligns them to the rows by index label
stage_series = [doggo_true, floofer_true, pupper_true, puppo_true]
clean_archive['dog_stage'] = pd.concat(stage_series)

#### Test

In [None]:
clean_archive.head(5)

In [None]:
# drop the doggo, floofer, pupper, puppo columns
clean_archive.drop(columns = ['doggo', 'floofer', 'pupper', 'puppo'], inplace = True)

In [None]:
# Fill the missing dog_stage values with "None". Only this column is filled: a
# frame-wide fillna('None') would also write the string 'None' into any numeric column
# that has missing values after the left merges, turning it into an object column.
clean_archive['dog_stage'] = clean_archive['dog_stage'].fillna('None')

In [None]:
clean_archive.dog_stage.unique()

## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [None]:
clean_archive.to_csv('twitter_archive_master.csv', index = False)

## Analyzing and Visualizing Data


In [None]:
clean_archive.info()

### Insights:
1. The most common type of dog stages

2. The one tweet with most likes

3. Some properties of tweets with most likes

### Visualization

### 1: The most common type of dog stages

In [None]:

clean_archive.dog_stage.value_counts().plot(figsize = (5,5), kind = 'pie', 
                                    title = 'Chart showing the most ocured stage of dog');

#### Explanation

The graph above shows the classification of the dog_stages in terms of the value counts. There are four stages of dogs; viz, doggo, floofer, pupper and puppo. In this dataset, there are cases where there are no dog stage value, such observations are recorded as None. From the chart above, we can see that there are several observations where the dog stage is None; the value is 1319.
The pupper dog stage occurs most often, with a value count of 154, followed by doggo with a value count of 60, then puppo with a value count of 54, and lastly floofer with a value count of 12.

### 2: The one tweet with most likes

In [None]:
top_10_favorites = clean_archive.nlargest(10, 'favorite_count')
top_10_favorites

In [None]:
top_10_favorites.favorite_count.plot(kind = 'bar', title = 'Bar chart showing the top 10 favorite posts', 
                                     rot = 45, color = 'blue');

#### Explanation

The graph above is a bar chart showing the top 10 favorite posts. The row index of the dataframe is plotted against the favorite_count column. Row index 283 has the highest favorite count, followed by rows 681, 55, 365, etc.

### 3: Some properties of tweets with most likes

In [None]:
#top 100 highest favorite_count tweets

top_100_favorites = clean_archive.nlargest(100, 'favorite_count')
top_100_favorites.rating_denominator.value_counts()

In [None]:
top_100_favorites.rating_denominator.value_counts().plot(kind = 'bar',
                                                        title = 'Bar chart showing plot of rating_denominator against its value counts',
                                                        ylabel = 'value_counts', xlabel = 'rating_denominator', color = 'blue');

The plot above is a bar chart of the rating_denominator values of the top 100 tweets against their value counts. The chart indicates that only two rating denominators, 10 and 11, were observed for the top 100 tweets. Their value counts are 99 and 1 respectively.

In [None]:
top_100_favorites = clean_archive.nlargest(100, 'favorite_count')
top_100_favorites.rating_numerator.value_counts()

In [None]:
top_100_favorites.rating_numerator.value_counts().plot(kind = 'bar',
                                                      title = 'Bar chart showing plot of rating_numerator against its value counts',
                                                      ylabel = 'value_counts', xlabel = 'rating_numerator', color = 'blue');