In [1]:
import pandas as pd
dfs = ['biden','klobuchar','bloomberg','buttigieg','sanders','steyer','warren']
frames = [pd.read_pickle(n + "_cleaned.pkl") for n in dfs]

In [2]:
for x in frames:
    x.reset_index(drop = True, inplace = True)

In [3]:
frames[2].columns

Index(['candidate', 'entity_id', 'callsign', 'nielsen_dma_rank', 'create_ts',
       'file_url', 'file_name', 'file_status', 'file_id',
       'network_affiliation', 'active_ind'],
      dtype='object')

In [4]:
opif_df = pd.concat(frames)
len(opif_df)

13279

In [5]:
opif_df.candidate.value_counts()

BLOOMBERG    7383
STEYER       2276
SANDERS      1645
KLOBUCHAR     700
BIDEN         480
BUTTIGIEG     467
WARREN        328
Name: candidate, dtype: int64

In [6]:
import datetime 
opif_df['date'] = opif_df.create_ts.dt.date
opif_df['date']

0      2019-10-07
1      2019-10-07
2      2019-10-07
3      2019-10-07
4      2019-10-07
          ...    
323    2020-03-31
324    2020-03-31
325    2020-03-31
326    2020-03-31
327    2020-03-31
Name: date, Length: 13279, dtype: object

In [7]:
opif_df.to_pickle("OPIF_04-06.pkl")

## For export to Excel spreadsheet, 
### I will omit file_status, file_id, active_ind, create_ts, entity_id, and 
### add an indication of what kind of file it is e.g. request, contract, invoice.

In [8]:
## Resume session:
import pandas as pd
df = pd.read_pickle("OPIF_04-06.pkl")

In [9]:
## Check for one-to-one correspondence between entity ID and call sign.
def isOneToOne(df, col1, col2):
    first = df.drop_duplicates([col1, col2]).groupby(col1)[col2].count().max()
    second = df.drop_duplicates([col1, col2]).groupby(col2)[col1].count().max()
    return first + second == 2

isOneToOne(df, 'entity_id', 'callsign')

True

In [10]:
df.drop(columns = ['entity_id','create_ts','file_status','file_id','active_ind'], inplace = True)

In [11]:
## Number of political files with request, contract, or invoice in its name. 
import re
def checkNameIgnoreCase(srch):
    return df['file_name'].str.contains(srch, flags = re.IGNORECASE, regex = True)

req = checkNameIgnoreCase('request').value_counts()
c = checkNameIgnoreCase('contract').value_counts()
i = checkNameIgnoreCase('invoice').value_counts()
print(req[1],c[1],i[1])

274 1273 2159


In [12]:
## Most common terms that appear in file_name, for future reference
top50 = pd.Series(' '.join(df['file_name']).lower().split()).value_counts()[:50]

In [13]:
top50

bloomberg                5174
president                2356
mike                     1965
-                        1890
steyer                   1880
for                      1769
invoice                  1706
2020                     1585
tom                      1392
order                    1112
michael                  1103
contract                  960
rev                       883
sanders                   833
bernie                    820
est                       702
fed                       673
inv                       441
2                         403
1                         399
form                      386
revised                   366
biden                     341
d                         320
amy                       298
klobuchar                 274
feb                       271
#                         252
warren                    237
k5-mike                   234
wo                        233
rebate                    226
us                        212
to        

In [14]:
df.loc[checkNameIgnoreCase('request'), 'label'] = 'Request'
df.loc[checkNameIgnoreCase('contract'), 'label'] = 'Contract'
df.loc[checkNameIgnoreCase('invoice'), 'label'] = 'Invoice'
df[:5]

Unnamed: 0,candidate,callsign,nielsen_dma_rank,file_url,file_name,network_affiliation,date,label
0,BIDEN,WMUR-TV,BOSTON (MANCHESTER),https://publicfiles.fcc.gov/api/manager/downlo...,10.8.19 Biden 1932771 PREBOOK,ABC,2019-10-07,
1,BIDEN,WMUR-TV,BOSTON (MANCHESTER),https://publicfiles.fcc.gov/api/manager/downlo...,10.8.19 Biden 1932773 PREBOOK,ABC,2019-10-07,
2,BIDEN,WMUR-TV,BOSTON (MANCHESTER),https://publicfiles.fcc.gov/api/manager/downlo...,10.8.19 Biden 1932772 PREBOOK,ABC,2019-10-07,
3,BIDEN,WSPA-TV,GREENVLL-SPART-ASHEVLL-AND,https://publicfiles.fcc.gov/api/manager/downlo...,BIDEN 12.31.19-1.6.20 ORDER #26660834,CBS,2019-10-07,
4,BIDEN,WSPA-TV,GREENVLL-SPART-ASHEVLL-AND,https://publicfiles.fcc.gov/api/manager/downlo...,BIDEN 1.21.20-1.27.20 ORDER #26660842,CBS,2019-10-07,


In [15]:
## Export to Excel
df = df[['candidate', 'date','nielsen_dma_rank','callsign','network_affiliation','file_name','label','file_url']]
df.to_excel('OPIF_04-06.xlsx')

## Repeat for NAB files

In [16]:
dfs = ['biden','klobuchar','bloomberg','buttigieg','sanders','steyer','warren']
frames_nab = [pd.read_pickle(n + "_nab.pkl") for n in dfs]

In [17]:
for x in frames_nab:
    x.reset_index(drop = True, inplace = True)
df_nab = pd.concat(frames_nab, sort=True)

In [18]:
df_nab.candidate.value_counts()

BLOOMBERG    846
STEYER       372
SANDERS      252
KLOBUCHAR    142
BIDEN        110
WARREN        81
BUTTIGIEG     78
Name: candidate, dtype: int64

In [19]:
df_nab['date'] = pd.to_datetime(df_nab.create_ts, utc = True).dt.date

In [20]:
print(df_nab['file_status'].value_counts())
print(df_nab['active_ind'].value_counts())

com_cpy    1848
com_prc      33
Name: file_status, dtype: int64
Y    1873
N       8
Name: active_ind, dtype: int64


In [21]:
isOneToOne(df_nab, 'entity_id', 'callsign')

True

In [22]:
df_nab.drop(columns = ['entity_id','create_ts','file_status','file_id','active_ind'], inplace = True)
df_nab = df_nab[['candidate', 'date','nielsen_dma_rank','callsign','network_affiliation','file_name','file_url']]

In [23]:
top20 = pd.Series(' '.join(df_nab['file_name']).lower().split()).value_counts()[:20]
top20

nab           1309
bloomberg      552
-              409
form           341
president      334
for            287
steyer         250
2020           248
contract       198
tom            164
bernie         163
mike           153
michael        151
sanders        135
pb18           125
supplement     120
fed            110
order           96
d               95
rev             77
dtype: int64

In [24]:
df_nab.to_excel('OPIF_04-06_NAB.xlsx')