In [None]:
# Import all packages and set plots to be embedded inline
import pandas as pd
import numpy as np
import json as json
import tweepy
from timeit import default_timer as timer
import matplotlib.pyplot as plt
import seaborn as sb

%matplotlib inline

In [None]:
# Setting up Plot style
plt.style.use({'figure.facecolor':'white'})

In [None]:
# Loading dataset
tweet_df = pd.read_csv('twitter-archive-enhanced-2.csv')
image_df = pd.read_csv('image-predictions-3.tsv', sep='\t')


In [None]:
# We make a copy of the loaded dataframe as a best practice.
tweet_av_df = tweet_df.copy()
image_pred_df = image_df.copy()

### Visual Assessment on Twitter archive content which contains basic tweet data ###
-----
For this project, the key data assessment requirements for the twitter archive data are that each rating is an original rating and that there is an image associated with it.
- "expanded_urls" is associated with image urls for a given tweet.
- It is observed that the "expanded_urls" column does have missing or no values
- "rating_numerator" gives us insights into the given dog rating.
- "tweet_id" is a unique identifier for each tweet about a dog.
- "timestamp" captures the date and time specifics when a direct message was posted to weratedogs
- 2356 records in twitter archive data set.



In [None]:
tweet_av_df

### Programmatic Assessment on Twitter archive content which contains basic tweet data ###

In [None]:
tweet_av_df.head()

In [None]:
tweet_av_df.tail()

In [None]:
tweet_av_df.sample(5)

In [None]:
tweet_av_df.info()

In [None]:
tweet_av_df.rating_numerator.describe()

In [None]:
tweet_av_df[['doggo', 'floofer', 'pupper', 'puppo']].apply(pd.Series.value_counts)

### **Define - Data Quality Issues** ###
----------
- Completeness: The following columns are incomplete - and have missing values. 
'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id','retweeted_status_user_id', 'retweeted_status_timestamp','expanded_urls'.

- Consistency: 'in_reply_to_user_id', 'retweeted_status_user_id' (status ids are sometimes populated with user_id).

- Validity: 'rating_denominator' has 0 values in it. A zero denominator makes the rating (numerator / denominator) undefined for those dogs.

- Erroneous data types: 'timestamp', 'retweeted_status_timestamp' has been set as object type.

- Erroneous data types: 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id','retweeted_status_user_id' should be int type.

- Accuracy: 'name' column has inaccurate values.








### **Code - Code to fix the data quality issues.** ###
--------

In [None]:
# Fixing the 'timestamp' data quality issue. Converting object type to datetime64 with UTC timezone 
tweet_av_df['timestamp'] = pd.to_datetime(tweet_av_df['timestamp'], utc=True, errors='coerce')

In [None]:
# Fixing 'expanded_urls' data quality issue. Delete rows that have null values since we only want original tweet ratings that have images.
tweet_av_df = tweet_av_df.dropna(axis=0, subset=['expanded_urls'])

In [None]:
# Fixing 'retweeted_status_id' data quality issue. Rows that have a value in retweeted_status_id are retweets,
# so we keep only the rows where it is null (original tweet ratings).
tweet_av_df = tweet_av_df.loc[tweet_av_df.retweeted_status_id.isna()]
# reset_index returns a new frame, so the result has to be assigned back; drop=True discards the
# old row labels instead of adding them as an extra 'index' column.
tweet_av_df = tweet_av_df.reset_index(drop=True)


In [None]:
# Continue with 'retweeted_status_id' - since we are interested in original tweets only, it is safe to drop this column: only null values are left in it.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
tweet_av_df = tweet_av_df.drop(columns=['retweeted_status_id'], errors='ignore')


In [None]:
# 'retweeted_status_user_id' column can also be dropped since only null values are left in it. Keeping the original question in mind.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
tweet_av_df = tweet_av_df.drop(columns=['retweeted_status_user_id'], errors='ignore')

In [None]:
# 'retweeted_status_timestamp' column can also be dropped since only null values are left in it. Keeping the original question in mind.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
tweet_av_df = tweet_av_df.drop(columns=['retweeted_status_timestamp'], errors='ignore')

In [None]:
# 'in_reply_to_status_id' column can be dropped, since we are not really looking at tweets that were in reply.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
tweet_av_df = tweet_av_df.drop(columns=['in_reply_to_status_id'], errors='ignore')

In [None]:
# 'in_reply_to_user_id' column can be dropped too, since we are not really looking at tweets that were in reply.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
tweet_av_df = tweet_av_df.drop(columns=['in_reply_to_user_id'], errors='ignore')

### **Test** ###
------

In [None]:
# Taking a peek at tweet dataframe after fixing the data quality issues.
tweet_av_df.info()


### Assessment Findings on Image Predictions data set ###
------
**Data Quality Issues**

- No data quality issues were found with this data set. The dataset is complete, consistent, valid with no accuracy issues.

### Gather Favourite and Retweet Count using Twitter api ###
-----

Before we proceed with the data tidying tasks, we will gather favourite and retweet count data using the Twitter API.

In [None]:

# A function to collect tweet json data using twitter api and save it to json file.
def collecttweetdata(tweet_ids, consumer_key=None, consumer_secret=None, outfile_path='tweet-json.txt'):
    """Fetch each tweet's JSON through the Twitter API and write one JSON object per line.

    Parameters
    ----------
    tweet_ids : iterable
        Tweet ids to look up, processed in iteration order.
    consumer_key, consumer_secret : str, optional
        Application credentials. When omitted they are read from the
        TWITTER_CONSUMER_KEY / TWITTER_CONSUMER_SECRET environment variables
        (falling back to an empty string), so secrets stay out of the notebook.
    outfile_path : str, optional
        File that is overwritten with the collected JSON lines.

    Returns
    -------
    None
        Progress, the elapsed time in seconds and a dict of
        ``{tweet_id: exception}`` for the failed lookups are printed.
    """
    import os

    if consumer_key is None:
        consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
    if consumer_secret is None:
        consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')

    # tweepy 4.x renamed TweepError to TweepyException; use whichever the installed version provides.
    tweepy_error = getattr(tweepy, 'TweepyException', None) or tweepy.TweepError

    auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)
    fails_dict = {}
    start = timer()
    with open(outfile_path, 'w') as outfile:
        for count, tweet_id in enumerate(tweet_ids, start=1):
            print(str(count) + ": " + str(tweet_id))
            try:
                tweetjson = api.get_status(tweet_id, tweet_mode="extended")
            except tweepy_error as e:
                # Best effort: remember the failure and carry on with the next tweet.
                print("Fail", e)
                fails_dict[tweet_id] = e
            else:
                print("Success")
                json.dump(tweetjson._json, outfile)
                outfile.write('\n')
    end = timer()
    print(end - start)
    print(fails_dict)

In [None]:
# Lets collect tweet data and save to tweets data json file.
tweet_ids = tweet_av_df.tweet_id.values
collecttweetdata(tweet_ids)

In [None]:
# Creating dataframe from the tweet json that will have the 'favorite_count' and 'retweet_count'
def is_json_key_present(json, key):
    """Tell whether ``json[key]`` can be looked up without raising ``KeyError``.

    Returns True when the subscript succeeds and False when it raises
    KeyError; any other exception propagates to the caller.
    """
    try:
        json[key]
    except KeyError:
        return False
    else:
        return True

column_names = ["tweet_id", "favorite_count", "retweet_count"]
# Keys read from each tweet JSON object, in the same order as column_names.
json_keys = ["id", "favorite_count", "retweet_count"]
list_vals = []
bad_line_count = 0
with open('tweet-json.txt', 'r') as jfile:
    for line in jfile:
        try:
            myjson = json.loads(line)
            has_all_keys = all(is_json_key_present(myjson, key) for key in json_keys)
        except (ValueError, TypeError):
            # ValueError (json.JSONDecodeError is a subclass): blank or malformed line.
            # TypeError: valid JSON that cannot be subscripted by key (e.g. a number or a list).
            # Only these are skipped, so KeyboardInterrupt and real bugs are no longer swallowed.
            bad_line_count += 1
            continue
        if has_all_keys:
            list_vals.append([myjson[key] for key in json_keys])
        else:
            print('Skipping row as there is no data')
if bad_line_count:
    print('Skipped', bad_line_count, 'line(s) that could not be parsed as a tweet JSON object')
tweet_data_df = pd.DataFrame(list_vals, columns=column_names)
print(tweet_data_df.shape, tweet_data_df.columns)


### **Define - Tidiness issues - 1** ###
----------

The following tidiness issues were identified.

- The retweet and favorite count belong to twitter data set - to form an observational unit (table).
- As each variable forms a column, the columns on twitter data set 'doggo', 'floofer', 'pupper', 'puppo' are identifying various stages of dog. We fix this by creating a single column 'growth_stage' that captures the dog stage.
- The image predictions data can also be combined with twitter data set to form an observational unit from where the predictions on each tweet can be analyzed.


### **Code - Code to fix the tidiness issues.** ###
--------

In [None]:
# Merge tweet archive data set with tweet json data set that has favorite and retweet count.
twitter_av_favs_df =  tweet_av_df.merge(tweet_data_df, on='tweet_id')

### **Test - the data quality issues.** ###
--------

In [None]:
twitter_av_favs_df.info()


### **Define - Tidiness issues - 2** ###
----------

The columns on twitter archive data set 'doggo', 'floofer', 'pupper', 'puppo' are identifying various stages of dog. We fix this wide form of data by creating a single column 'growth_stage' that captures the dog stage.

I created a function to melt the individual stage columns into a single column to identify the existing stage of the dog. When a tweet indicates multiple stages, the first match in the order floofer, puppo, pupper, doggo takes precedence; tweets with no stage are labelled "unknown".


### **Code - Code to fix the data quality issues.** ###
--------

In [None]:
# Custom function to derive the current stage of the dog for every tweet, keyed by tweet_id.
def dog_stages(twitter_avfavsdf):
    """Map each row's tweet_id to a single dog stage label.

    A stage column counts as set when its cell equals the column's own name.
    Columns are checked in the order floofer, puppo, pupper, doggo and the
    first one that is set wins; rows with none set are labelled "unknown".
    """
    precedence = ("floofer", "puppo", "pupper", "doggo")
    stage_by_tweet = {}
    for position in range(len(twitter_avfavsdf)):
        record = twitter_avfavsdf.iloc[position]
        # next() stops at the first matching column, so lower-priority columns are not read.
        stage_by_tweet[record.tweet_id] = next(
            (label for label in precedence if getattr(record, label) == label),
            "unknown",
        )
    return stage_by_tweet

stage_dictn = dog_stages(twitter_av_favs_df)
stage_df = pd.DataFrame(stage_dictn.items(), columns=['tweet_id', 'growth_stage'])

In [None]:
# Enriching the twitter data set that has favourite and retweet count with growth_stage column and creating a new data set.
twitter_av_favsstages_df =  twitter_av_favs_df.merge(stage_df, on='tweet_id')

In [None]:
# We are going to drop the individual stage columns.
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run.
twitter_av_favsstages_df = twitter_av_favsstages_df.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'], errors='ignore')

In [None]:
# Finally we will save this to our master data frame - 'twitter_archive_master.csv'.
twitter_av_favsstages_df.to_csv('twitter_archive_master.csv', index=False)

### **Test - the data quality issues.** ###
--------

In [None]:
# Lets get the shape and columns
print(twitter_av_favsstages_df.shape, twitter_av_favsstages_df.columns)

In [None]:
# Getting a sample of our dataset
twitter_av_favsstages_df.sample(3)

### Visualizations to help us get insights on WeRateDogs Twitter data ###

- What was the most favorited dog stage on average?

- What was the most retweeted dog stage on average?

- Which dog stage is the most highly rated on average?
 

In [None]:
twitter_av_master_df = pd.read_csv('twitter_archive_master.csv')

In [None]:
# On twitter which dog stage has more favorites on average?
fig, ax = plt.subplots(figsize=(16, 12))
base_color = sb.color_palette()[0]
splot = sb.barplot(
    data=twitter_av_master_df,
    x="growth_stage",
    y="favorite_count",
    order=['doggo', 'pupper', 'puppo', 'floofer'],
    ci=None,
    color=base_color,
    ax=ax,
)
# Label every bar with its height (no decimals), 10 points above the bar top.
for p in splot.patches:
    bar_top = p.get_height()
    bar_center = p.get_x() + p.get_width() / 2.
    splot.annotate(f"{bar_top:.0f}", (bar_center, bar_top), xytext=(0, 10), textcoords='offset points', ha='center', va='center')
ax.set(xlabel="Dog Stages", ylabel="Favorite Count", title="Favorite's by Dog Stages on Twitter")
plt.show()

In [None]:
# On twitter which dog stage has the most retweets on average?
fig, ax = plt.subplots(figsize=(16, 12))
base_color = sb.color_palette()[0]
splot = sb.barplot(
    data=twitter_av_master_df,
    x="growth_stage",
    y="retweet_count",
    order=['doggo', 'pupper', 'puppo', 'floofer'],
    ci=None,
    color=base_color,
    ax=ax,
)
# Label every bar with its height (no decimals), 10 points above the bar top.
for p in splot.patches:
    bar_top = p.get_height()
    bar_center = p.get_x() + p.get_width() / 2.
    splot.annotate(f"{bar_top:.0f}", (bar_center, bar_top), xytext=(0, 10), textcoords='offset points', ha='center', va='center')
ax.set(xlabel="Dog Stages", ylabel="Retweet Count", title="Retweet's by Dog Stages on Twitter")
plt.show()

In [None]:
# How are the dogs rated by their stages?
fig, ax = plt.subplots(figsize=(16, 12))
base_color = sb.color_palette()[0]

splot = sb.barplot(
    data=twitter_av_master_df,
    x="growth_stage",
    y="rating_numerator",
    order=['doggo', 'pupper', 'puppo', 'floofer'],
    ci=None,
    color=base_color,
    ax=ax,
)
# Label every bar with its height (no decimals), 10 points above the bar top.
for p in splot.patches:
    bar_top = p.get_height()
    bar_center = p.get_x() + p.get_width() / 2.
    splot.annotate(f"{bar_top:.0f}", (bar_center, bar_top), xytext=(0, 10), textcoords='offset points', ha='center', va='center')

ax.set(xlabel="Dog Stages", ylabel="Rating on scale of 10", title="Rating averages by Dog Stages on Twitter")
plt.show()

### Insights from WeRateDogs data set ###

- The puppo dog stage was the most favorited, with 21,631 favorites per tweet on average.

- The puppo dog stage was also the most retweeted, with 6,292 retweets per tweet on average.

- The three dog stages doggo, puppo and floofer were rated equally on average at 12, while pupper was lower by only 1 rating point.


