# Following
Accounts that are followed by reporters.

In [26]:
import pandas as pd
import numpy as np
import logging
#from dateutil.parser import parse as date_parse
#from utils import tweet_iter, tweet_type

# Grab the root logger (no name given) and lower its threshold to DEBUG so
# that debug-level records from any module are not filtered out.
logger = logging.getLogger(name=None)
logger.setLevel('DEBUG')

# Load the newspaper reporters' follow edges: one (follower, followed) pair per row.
# Column names are supplied via `names=`, so the first line of the file is read as data.
# Both ID columns are read as strings (builtin `str`; the `np.str` alias was
# removed from NumPy) rather than being parsed as numbers.
newspaper_reporters_follower_to_followed_df = pd.read_csv(
    'newspaper_reporters_follower_to_followed.csv',
    names=['follower_user_id', 'followed_user_id'],
    dtype={'follower_user_id': str, 'followed_user_id': str},
)
# Non-null count per column, displayed as the cell output.
newspaper_reporters_follower_to_followed_df.count()

follower_user_id    938969
followed_user_id    938969
dtype: int64

In [27]:
# Preview the first five follow edges.
newspaper_reporters_follower_to_followed_df.head(n=5)

Unnamed: 0,follower_user_id,followed_user_id
0,2345626885,3350850490
1,2345626885,2396531010
2,2345626885,4832812066
3,2345626885,4226086551
4,2345626885,323934044


In [28]:
# Load the periodical reporters' follow edges: one (follower, followed) pair per row.
# Column names are supplied via `names=`, so the first line of the file is read as data.
# Both ID columns are read as strings (builtin `str`; the `np.str` alias was
# removed from NumPy) rather than being parsed as numbers.
periodical_reporters_follower_to_followed_df = pd.read_csv(
    'periodical_reporters_follower_to_followed.csv',
    names=['follower_user_id', 'followed_user_id'],
    dtype={'follower_user_id': str, 'followed_user_id': str},
)
# Non-null count per column, displayed as the cell output.
periodical_reporters_follower_to_followed_df.count()

follower_user_id    846009
followed_user_id    846009
dtype: int64

In [29]:
# Stack the newspaper and periodical edge lists into a single frame with a
# fresh 0..n-1 index. `pd.concat` is used because `DataFrame.append` has been
# removed from pandas; the result is the same.
follower_to_followed_df = pd.concat(
    [
        newspaper_reporters_follower_to_followed_df,
        periodical_reporters_follower_to_followed_df,
    ],
    ignore_index=True,
)
# Non-null count per column, displayed as the cell output.
follower_to_followed_df.count()

follower_user_id    1784978
followed_user_id    1784978
dtype: int64

### Number of follower accounts

In [30]:
# Count the number of edges per follower, i.e. how many accounts each follower follows.
# The counts Series is named explicitly before conversion to a frame: the name
# pandas gives the `value_counts()` result differs between versions, so renaming
# the column afterwards by its old name can silently do nothing.
follower_df = (
    follower_to_followed_df['follower_user_id']
    .value_counts()
    .rename('following_count')
    .to_frame()
)
# Number of distinct follower accounts, displayed as the cell output.
follower_df.count()

following_count    1457
dtype: int64

In [31]:
# Summary statistics (count, mean, std, quartiles, min/max) of following_count across followers.
follower_df.describe()

Unnamed: 0,following_count
count,1457.0
mean,1225.10501
std,1354.160332
min,1.0
25%,411.0
50%,911.0
75%,1543.0
max,15914.0


### Load followed screen names

In [32]:
# Load the screen_name / user_id pairs and index them by user_id so they can be
# joined onto frames keyed by user_id. Column names are supplied via `names=`, so
# the first line of the file is read as data. user_id is read as a string
# (builtin `str`; the `np.str` alias was removed from NumPy).
followed_screen_name_lookup_df = pd.read_csv(
    'followed.csv',
    names=['screen_name', 'user_id'],
    dtype={'user_id': str},
).set_index(['user_id'])
followed_screen_name_lookup_df.head()

Unnamed: 0_level_0,screen_name
user_id,Unnamed: 1_level_1
404896635,tattlerbcc
82889339,mhondorp
4704704592,NazNiyam
594297187,tomsprattphx
3238639833,stopgmonow


In [33]:
# Count the number of edges per followed account, i.e. how many of the followers
# follow each account. The counts Series is named explicitly before conversion to
# a frame: the name pandas gives the `value_counts()` result differs between
# versions, so renaming the column afterwards by its old name can silently do nothing.
followed_df = (
    follower_to_followed_df['followed_user_id']
    .value_counts()
    .rename('follower_count')
    .to_frame()
    .rename_axis('user_id')  # index holds the followed account's user id
)
# Number of distinct followed accounts, displayed as the cell output.
followed_df.count()

follower_count    401093
dtype: int64

In [34]:
# Preview the first five followed accounts and their follower counts.
followed_df.head(n=5)

Unnamed: 0_level_0,follower_count
user_id,Unnamed: 1_level_1
813286,961
51241574,919
807095,898
1339835893,896
25073877,893


In [35]:
# Attach screen names to the follower counts by matching on the user_id index.
# A left merge keeps every followed account, including those with no screen name
# in the lookup (their screen_name is NaN).
followed_merge_df = followed_df.merge(
    followed_screen_name_lookup_df,
    how='left',
    left_index=True,
    right_index=True,
)
# Non-null count per column; a lower screen_name count shows unmatched accounts.
followed_merge_df.count()

follower_count    401093
screen_name       401075
dtype: int64

In [36]:
# Preview the first five followed accounts together with their screen names.
followed_merge_df.head(n=5)

Unnamed: 0_level_0,follower_count,screen_name
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
813286,961,BarackObama
51241574,919,AP
807095,898,nytimes
1339835893,896,HillaryClinton
25073877,893,realDonaldTrump


### Load known twitter accounts

In [37]:
from utils import load_screen_name_lookup_df

# Load the lookup of already-known accounts via the project helper.
# NOTE(review): the helper's data source is not visible here; it is used below
# as a frame indexed by user_id with a 'type' column.
screen_name_lookup_df = load_screen_name_lookup_df()
# Number of known accounts per type, displayed as the cell output.
screen_name_lookup_df.loc[:, 'type'].value_counts()

media          5915
government     2959
reporters      1457
politicians     601
Name: type, dtype: int64

In [38]:
# Preview the first five known accounts.
screen_name_lookup_df.head(n=5)

Unnamed: 0_level_0,screen_name,type,screen_name_lower
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2345626885,marcy_crane,reporters,marcy_crane
780221130,loren_duggan,reporters,loren_duggan
285772181,akesslerdc,reporters,akesslerdc
29607664,adamliptak,reporters,adamliptak
9484732,amacker,reporters,amacker


## Top followed accounts <----------
A type of `unknown` indicates that the account is not matched with a known Twitter account.

In [39]:
# Label each followed account with its known type (matched on the user_id index)
# and order the accounts from most to least followed.
followed_join_df = (
    followed_merge_df
    .join(screen_name_lookup_df[['type']], how='left')
    .sort_values('follower_count', ascending=False)
)
# Accounts with no match in the known-accounts lookup get the type 'unknown'.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# a column selection is not guaranteed to update the parent frame.
followed_join_df['type'] = followed_join_df['type'].fillna('unknown')
followed_join_df.head(25)

Unnamed: 0_level_0,follower_count,screen_name,type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
813286,961,BarackObama,unknown
51241574,919,AP,media
807095,898,nytimes,media
1339835893,896,HillaryClinton,unknown
25073877,893,realDonaldTrump,politicians
818927131883356161,886,PressSec,politicians
822215673812119553,865,WhiteHouse,politicians
2467791,850,washingtonpost,media
822215679726100480,846,POTUS,politicians
9300262,839,politico,media


## Followed accounts by type <----------

In [40]:
# Total follow edges per account type. Only follower_count is summed; selecting
# it explicitly keeps the text column screen_name out of the aggregation
# regardless of how the installed pandas treats non-numeric columns in sum().
followed_join_df.groupby('type')[['follower_count']].sum()

Unnamed: 0_level_0,follower_count
type,Unnamed: 1_level_1
government,26257
media,22038
politicians,52760
reporters,141122
unknown,1542801


## Top followed accounts that are not known. <----------
These are the accounts that we will want to categorize.

In [41]:
# Keep only the accounts whose type is 'unknown', ordered from most to least followed.
is_unknown = followed_join_df['type'] == 'unknown'
top_not_known_followed_df = followed_join_df.loc[is_unknown].sort_values(
    by='follower_count',
    ascending=False,
)
# Show the 100 most-followed unknown accounts.
top_not_known_followed_df.loc[:, ['screen_name', 'follower_count']].head(100)

Unnamed: 0_level_0,screen_name,follower_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
813286,BarackObama,961
1339835893,HillaryClinton,896
14246001,mikeallen,823
30313925,ObamaWhiteHouse,817
16017475,NateSilver538,795
93069110,maggieNYT,793
18622869,ezraklein,762
1536791610,POTUS44,745
113420831,PressSec44,743
14529929,jaketapper,719


## Followed accounts by user

In [42]:
# Tag every follow edge with the type of the followed account by matching
# followed_user_id against the user_id index of the known-accounts lookup.
follower_to_followed_type_df = pd.merge(
    follower_to_followed_df,
    screen_name_lookup_df[['type']],
    how='left',
    left_on='followed_user_id',
    right_index=True,
)
# Edges pointing at accounts that are not in the lookup get the type 'unknown'.
# The filled column is assigned back explicitly: calling fillna(inplace=True) on
# a column selection is not guaranteed to update the parent frame.
follower_to_followed_type_df['type'] = follower_to_followed_type_df['type'].fillna('unknown')
follower_to_followed_type_df.head()


Unnamed: 0,follower_user_id,followed_user_id,type
0,2345626885,3350850490,unknown
1,2345626885,2396531010,unknown
2,2345626885,4832812066,government
3,2345626885,4226086551,government
4,2345626885,323934044,reporters


In [43]:
# One row per follower, one column per followed-account type, holding the number
# of accounts of that type the follower follows (0 where the follower follows none).
followed_summary_by_user_df = (
    follower_to_followed_type_df
    .groupby(['follower_user_id', 'type'])
    .size()
    .unstack()
    .fillna(0)
)
# Remember the per-type columns before any derived columns are added.
type_columns = list(followed_summary_by_user_df.columns)
# Total number of accounts followed by each follower.
followed_summary_by_user_df['total'] = followed_summary_by_user_df[type_columns].sum(axis=1)
# Share of each follower's total per type, stored as a fraction between 0 and 1.
for type_name in type_columns:
    followed_summary_by_user_df[type_name + '_percent'] = (
        followed_summary_by_user_df[type_name].div(followed_summary_by_user_df['total'])
    )
followed_summary_by_user_df.head(10)

type,government,media,politicians,reporters,unknown,total,government_percent,media_percent,politicians_percent,reporters_percent,unknown_percent
follower_user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100165378,18.0,28.0,35.0,16.0,1558.0,1655.0,0.010876,0.016918,0.021148,0.009668,0.94139
1001991865,1.0,0.0,0.0,0.0,8.0,9.0,0.111111,0.0,0.0,0.0,0.888889
1002229862,8.0,5.0,10.0,119.0,360.0,502.0,0.015936,0.00996,0.01992,0.237052,0.717131
100270054,43.0,15.0,13.0,30.0,357.0,458.0,0.093886,0.032751,0.028384,0.065502,0.779476
100802089,18.0,10.0,31.0,95.0,660.0,814.0,0.022113,0.012285,0.038084,0.116708,0.810811
100860790,30.0,7.0,30.0,108.0,1940.0,2115.0,0.014184,0.00331,0.014184,0.051064,0.917258
1009749229,5.0,0.0,0.0,34.0,428.0,467.0,0.010707,0.0,0.0,0.072805,0.916488
102171691,13.0,12.0,9.0,110.0,625.0,769.0,0.016905,0.015605,0.011704,0.143043,0.812744
102789488,8.0,10.0,20.0,95.0,949.0,1082.0,0.007394,0.009242,0.018484,0.0878,0.877079
102994740,11.0,22.0,5.0,35.0,1202.0,1275.0,0.008627,0.017255,0.003922,0.027451,0.942745


### Average of percent of following by type for each user
That is, for each user determine the percent of following by type. Then take the average of each type.

Thus, this following analysis is on a per-user basis, accounting for how prolific a follower a user is. (That is, users who follow more accounts aren't weighted more heavily.)


In [44]:
# Average each *_percent column over all followers (every follower counts equally).
followed_summary_by_user_df.filter(regex=r'_percent$', axis='columns').mean()

type
government_percent     0.017843
media_percent          0.016689
politicians_percent    0.030611
reporters_percent      0.095417
unknown_percent        0.839441
dtype: float64