# Following
Accounts that are followed by reporters.

In [1]:
import pandas as pd
import numpy as np
import logging
#from dateutil.parser import parse as date_parse
#from utils import tweet_iter, tweet_type

# Configure the root logger so DEBUG-and-above records are not filtered out.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Load the newspaper reporters' (follower, followed) edge list.
# `names=` is supplied, so pandas treats the first line of the file as data.
# Both id columns are read as strings rather than numbers.
# `np.str` was only an alias for the builtin `str` and was removed in
# NumPy 1.24, so the builtin is used directly.
newspaper_reporters_follower_to_followed_df = pd.read_csv(
    'newspaper_reporters_follower_to_followed.csv',
    names=['follower_user_id', 'followed_user_id'],
    dtype={'follower_user_id': str, 'followed_user_id': str},
)
newspaper_reporters_follower_to_followed_df.count()

follower_user_id    938969
followed_user_id    938969
dtype: int64

In [2]:
# Preview the first five follower -> followed rows.
newspaper_reporters_follower_to_followed_df.head(n=5)

Unnamed: 0,follower_user_id,followed_user_id
0,2345626885,3350850490
1,2345626885,2396531010
2,2345626885,4832812066
3,2345626885,4226086551
4,2345626885,323934044


In [3]:
# Load the periodical reporters' (follower, followed) edge list.
# `names=` is supplied, so pandas treats the first line of the file as data.
# Both id columns are read as strings rather than numbers.
# `np.str` was only an alias for the builtin `str` and was removed in
# NumPy 1.24, so the builtin is used directly.
periodical_reporters_follower_to_followed_df = pd.read_csv(
    'periodical_reporters_follower_to_followed.csv',
    names=['follower_user_id', 'followed_user_id'],
    dtype={'follower_user_id': str, 'followed_user_id': str},
)
periodical_reporters_follower_to_followed_df.count()

follower_user_id    846009
followed_user_id    846009
dtype: int64

In [4]:
# Stack the newspaper and periodical edge lists into one frame with a fresh
# 0..n-1 index. `DataFrame.append` was removed in pandas 2.0; `pd.concat` is
# the supported equivalent and yields the same rows in the same order.
follower_to_followed_df = pd.concat(
    [
        newspaper_reporters_follower_to_followed_df,
        periodical_reporters_follower_to_followed_df,
    ],
    ignore_index=True,
)
follower_to_followed_df.count()

follower_user_id    1784978
followed_user_id    1784978
dtype: int64

### Number of follower accounts

In [5]:
# One row per follower account: how many accounts that follower follows.
# The count Series is named explicitly instead of renaming a column by its
# old label: pandas < 2.0 names the value_counts() result after the source
# column, while pandas >= 2.0 names it 'count', so a
# `rename(columns={'follower_user_id': ...})` silently does nothing there.
follower_df = (
    follower_to_followed_df['follower_user_id']
    .value_counts()
    .rename('following_count')
    .to_frame()
)
follower_df.count()

following_count    1457
dtype: int64

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of following_count.
follower_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,following_count
count,1457.0
mean,1225.10501
std,1354.160332
min,1.0
25%,411.0
50%,911.0
75%,1543.0
max,15914.0


### Load followed screen names

In [7]:
# Lookup table from user_id (index) to screen_name for followed accounts.
# `names=` is supplied, so pandas treats the first line of the file as data.
# user_id is read as a string, matching the id columns of the edge lists.
# `np.str` was only an alias for the builtin `str` and was removed in
# NumPy 1.24, so the builtin is used directly.
followed_screen_name_lookup_df = pd.read_csv(
    'followed.csv',
    names=['screen_name', 'user_id'],
    dtype={'user_id': str},
).set_index(['user_id'])
followed_screen_name_lookup_df.head()

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
404896635,tattlerbcc
82889339,mhondorp
4704704592,NazNiyam
594297187,tomsprattphx
3238639833,stopgmonow


In [8]:
# One row per followed account: how many of the reporters follow it.
# The count Series is named explicitly instead of renaming a column by its
# old label: pandas < 2.0 names the value_counts() result after the source
# column, while pandas >= 2.0 names it 'count', so a
# `rename(columns={'followed_user_id': ...})` silently does nothing there.
followed_df = (
    follower_to_followed_df['followed_user_id']
    .value_counts()
    .rename('follower_count')
    .to_frame()
)
# Name the index so it lines up with the user_id-indexed lookup tables.
followed_df.index.name = 'user_id'
followed_df.count()

follower_count    401093
dtype: int64

In [9]:
# Preview the first five rows of the per-account follower counts.
followed_df.head(n=5)

Unnamed: 0_level_0,follower_count
user_id,Unnamed: 1_level_1
813286,961
51241574,919
807095,898
1339835893,896
25073877,893


In [10]:
# Attach screen names to the follower counts by matching on the user_id index.
# A left join keeps every followed account, even those with no screen name in
# the lookup table (those rows get NaN for screen_name).
followed_merge_df = followed_df.join(
    other=followed_screen_name_lookup_df,
    how='left',
)
followed_merge_df.count()

follower_count    401093
screen_name       401075
dtype: int64

In [11]:
# Preview the first five followed accounts with their screen names.
followed_merge_df.head(n=5)

Unnamed: 0_level_0,follower_count,screen_name
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
813286,961,BarackObama
51241574,919,AP
807095,898,nytimes
1339835893,896,HillaryClinton
25073877,893,realDonaldTrump


### Load known twitter accounts

In [12]:
from utils import load_screen_name_lookup_df

# Load the table of already-categorized Twitter accounts via the project's
# `utils.load_screen_name_lookup_df` helper. It is used below as a DataFrame
# with a 'type' column; presumably it is indexed by user_id — confirm in utils.
screen_name_lookup_df = load_screen_name_lookup_df()
# Number of known accounts in each category.
screen_name_lookup_df['type'].value_counts()

media          5915
government     2959
reporters      1457
politicians     601
Name: type, dtype: int64

In [13]:
# Preview the first five known accounts.
screen_name_lookup_df.head(n=5)

Unnamed: 0_level_0,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2345626885,marcy_crane,reporters,marcy_crane
780221130,loren_duggan,reporters,loren_duggan
285772181,akesslerdc,reporters,akesslerdc
29607664,adamliptak,reporters,adamliptak
9484732,amacker,reporters,amacker


## Top followed accounts <----------
A type of `unknown` indicates that the account is not matched with a known Twitter account.

In [14]:
# Tag each followed account with its known category (left join on the
# user_id index), ordered from most-followed to least-followed.
followed_join_df = (
    followed_merge_df
    .join(screen_name_lookup_df[['type']], how='left')
    .sort_values('follower_count', ascending=False)
)
# Accounts with no match in the known-accounts table get type 'unknown'.
# The filled column is assigned back explicitly: calling
# `df['type'].fillna(..., inplace=True)` acts on an intermediate Series, which
# warns on pandas 2.x and leaves the frame unchanged under Copy-on-Write.
followed_join_df['type'] = followed_join_df['type'].fillna('unknown')
followed_join_df.head(25)

Unnamed: 0_level_0,follower_count,screen_name,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
813286,961,BarackObama,unknown
51241574,919,AP,media
807095,898,nytimes,media
1339835893,896,HillaryClinton,unknown
25073877,893,realDonaldTrump,politicians
818927131883356161,886,PressSec,politicians
822215673812119553,865,WhiteHouse,politicians
2467791,850,washingtonpost,media
822215679726100480,846,POTUS,politicians
9300262,839,politico,media


## Followed accounts by type <----------

In [15]:
# Total follow relationships per account type.
# Only follower_count is summed: on pandas >= 2.0 `groupby().sum()` no longer
# drops non-numeric columns, so it would also try to sum the string
# screen_name column (which contains NaN for accounts with no screen name).
followed_join_df.groupby('type')[['follower_count']].sum()

Unnamed: 0_level_0,follower_count
type,Unnamed: 1_level_1
government,26257
media,22038
politicians,52760
reporters,141122
unknown,1542801


## Top followed accounts that are not known. <----------
These are the accounts that we will want to categorize.

In [16]:
# Keep only the accounts that did not match a known account, ordered from
# most-followed to least-followed.
top_not_known_followed_df = (
    followed_join_df
    .loc[followed_join_df['type'] == 'unknown']
    .sort_values(by='follower_count', ascending=False)
)
# Show the 100 most-followed uncategorized accounts.
top_not_known_followed_df.loc[:, ['screen_name', 'follower_count']].head(n=100)

Unnamed: 0_level_0,screen_name,follower_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
813286,BarackObama,961
1339835893,HillaryClinton,896
14246001,mikeallen,823
30313925,ObamaWhiteHouse,817
16017475,NateSilver538,795
93069110,maggieNYT,793
18622869,ezraklein,762
1536791610,POTUS44,745
113420831,PressSec44,743
14529929,jaketapper,719
