## Load Modules
- ttools has helper functions

In [1]:

%load_ext autoreload
%autoreload 2
import sys, codecs, json
import ttools
from twython import TwythonStreamer, Twython
from datetime import datetime
from time import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Get top100 [from pre-made json file]
First, load the dictionary of the 100 most-followed Twitter users and extract the user IDs for use with the API.

In [2]:
# Location of the pre-made JSON file describing the top-100 accounts.
top100file = './top100_id_dictionary.json'
# Layout is {user_id: [username, name]}; only the user ids matter for now.
top100 = ttools.json_to_dict(top100file)
# Cast every key to int so the ids can be used as numeric user ids.
top100ids = list(map(int, top100.keys()))

Read in and inspect the top100 tweet/timeline data!

In [3]:
# Load the combined tweet/timeline table for the top-100 accounts; %time reports how long the read takes.
# NOTE(review): the recorded .info() output lists tweet_id and user_id as object dtype, which suggests
# the CSV holds some non-numeric values in those columns - confirm before comparing ids against integers.
%time top100all = pd.read_csv('top100users_and_timelines.csv')

CPU times: user 1.04 s, sys: 107 ms, total: 1.15 s
Wall time: 1.15 s




In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
top100all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203263 entries, 0 to 203262
Data columns (total 18 columns):
tweet_id                 203263 non-null object
date                     203262 non-null object
user_id                  203262 non-null object
text                     203262 non-null object
text_noMentions          203215 non-null object
is_quote_status          203256 non-null object
is_reply_to_status       203256 non-null object
is_reply_to_user         203256 non-null object
numMentions              203256 non-null float64
user_verified            203256 non-null object
user_description_text    161567 non-null object
user_followers_count     203256 non-null float64
user_friends_count       203256 non-null float64
user_listed_count        203256 non-null float64
user_favourites_count    203256 non-null float64
user_statuses_count      203250 non-null float64
retweet_count            203250 non-null float64
favorite_count           203250 non-null float64
dtypes: float64(8), object(10)

Now, we need to categorize the top100. Here is some helper code. The 'top100cat' dataframe is the result.

In [5]:
# Flatten the {user_id: [username, name]} dict into rows of [int user id, username, name].
top100forCategories = [
    [int(uid), top100[uid][0], top100[uid][1]]
    for uid in list(top100.keys())
]

# Write the unlabeled table to disk so it can be annotated by hand.
unlabeled = pd.DataFrame(top100forCategories)
unlabeled.to_csv('top100categorization.csv')
# Each entry was then manually labeled with one of the five categories below and the
# labeled file was saved as 'top100categorization_complete.csv'.

# Read the labeled csv back in. Together with the 'top100users_and_timelines.csv' dataset,
# this frame lets us assign every top-100 user/tweet to its category.
# Maps the single-letter codes used in the labeled file to readable category names.
CATEGORY = {'a':'artist','b':'businessLeader','c':'company','p':'politician','t':'athlete'}
top100cat = (
    pd.read_csv('top100categorization_complete.csv')
    # Drop the saved index column and the free-text notes column.
    .drop(['Unnamed: 0','notes'],axis=1)
    # The positional column headers written by to_csv get meaningful names.
    .rename(columns={'0':'user_id','1':'screenname','2':'name'})
)
# Count users per category, showing the readable names instead of the letter codes.
category_counts = top100cat['category'].value_counts().rename(CATEGORY)
print('top100 category count:\n%s'%(category_counts))
# Uncomment to inspect the labeled frame: top100cat.info()

top100 category count:
artist            60
company           14
athlete            9
politician         7
businessLeader     2
Name: category, dtype: int64


## Example use: get ALL tweets in the top-100 dataset that were posted by users labeled as athletes


In [6]:
# Collect the user ids of every account manually labeled 't' (athlete).
athlete_ids = set(top100cat[top100cat['category']=='t']['user_id'])
# Note: top100cat may include non-English accounts, which were filtered out of top100all.
# The user ids matched below can therefore be a subset of athlete_ids: there are 9 athletes
# in the top 100, but one account was non-English, so the tweet/timeline data only holds
# timelines for 8 of them. This is not a problem, just something to be aware of.
#
# Series.isin performs the membership test in one vectorized call instead of invoking a
# Python lambda for each of the ~200k rows; it selects the same rows as the row-wise apply.
# NOTE(review): the recorded .info() output shows user_id as object dtype - any id stored
# as a string would not match these integer ids (true of the row-wise version too); confirm.
athlete_tweets = top100all[top100all['user_id'].isin(athlete_ids)]

In [7]:
# Report how many tweets, and what share of the whole dataset, come from athlete accounts.
n_athlete_tweets = athlete_tweets.shape[0]
print('%s tweets [%0.1f%%] of top100 dataset are from atheletes'%(n_athlete_tweets,(n_athlete_tweets/top100all.shape[0])*100))
# Compute the distinct athlete ids present in the tweet data once, rather than once per
# row of top100cat, then use a vectorized membership test to look up their names.
athlete_ids_in_data = athlete_tweets['user_id'].unique()
print('the athletes in the dataset are:\n%s'%(top100cat[top100cat['user_id'].isin(athlete_ids_in_data)]['name']))

17956 tweets [8.8%] of top100 dataset are from atheletes
the athletes in the dataset are:
45                     kaka
48             fc barcelona
55                      nba
56         sachin tendulkar
59              virat kohli
68                      nfl
73           andrés iniesta
80    uefa champions league
Name: name, dtype: object
