In [2]:
import requests, requests_cache, json, pandas as pd, numpy as np
# Cache HTTP responses made through requests, so that re-running a search
# does not have to download pages that were already fetched.
requests_cache.install_cache()

In [3]:
# Load the Super Tuesday DMA spreadsheet
dma_list = pd.read_excel(r'Super Tuesday DMAs.xlsx')

# Keep the DMA names of the rows flagged SuperTues == 1, with non-breaking
# spaces turned into ordinary spaces
is_super_tuesday = dma_list['SuperTues'] == 1
dma_list_1 = dma_list.loc[is_super_tuesday, 'FCC DMA'].str.replace('\xa0', ' ')

# Cut the last 7 characters off every name, then trim surrounding whitespace
super_tuesday_dma = dma_list_1.str[:-7].str.strip()
super_tuesday_dma[:5]

0                    BANGOR
1     BIRMINGHAM (ANN TUSC)
2       BOSTON (MANCHESTER)
3    BURLINGTON-PLATTSBURGH
4                 CHARLOTTE
Name: FCC DMA, dtype: object

In [3]:
from datetime import date
## Filter for political files created from date_1 (mm/dd/yy) up to today.
def opif_timestamp_to_present(date_1):
    """Build a ``create_ts`` filter running from ``date_1`` to the current date.

    ``date_1`` is the lower boundary; it is converted with ``str()`` and is
    expected to be in mm/dd/yy form. The upper boundary is today's date
    formatted as mm/dd/yy.

    Returns a one-key dict: ``{"create_ts": "<date_1> - <today>"}``.
    """
    today = date.today().strftime("%m/%d/%y")
    return dict(create_ts=' - '.join((str(date_1), today)))

## Filter for political files created between date_1 and date_2.
def opif_timestamp_interval(date_1, date_2):
    """Build a ``create_ts`` filter for the span ``date_1`` to ``date_2``.

    Both boundaries are strings and are joined as ``"<date_1> - <date_2>"``.
    Returns a one-key dict holding that string under ``"create_ts"``.
    """
    span = date_1 + ' - ' + date_2
    return dict(create_ts=span)

In [4]:
## Filters for the OPIF File Search: files created from 09/01/19 up to today,
## restricted by file type, office, campaign year and service, plus one
## DMA filter per Super Tuesday market.

filter_list = [
    opif_timestamp_to_present('09/01/19'),
    dict(political_file_type="PA"),
    dict(office_type="Presidential"),
    dict(campaign_year="2020"),
    dict(source_service_code="TV"),
]
filter_list.extend({"nielsen_dma_rank": dma} for dma in super_tuesday_dma)
filter_list[:10]

[{'create_ts': '09/01/19 - 04/06/20'},
 {'political_file_type': 'PA'},
 {'office_type': 'Presidential'},
 {'campaign_year': '2020'},
 {'source_service_code': 'TV'},
 {'nielsen_dma_rank': 'BANGOR'},
 {'nielsen_dma_rank': 'BIRMINGHAM (ANN TUSC)'},
 {'nielsen_dma_rank': 'BOSTON (MANCHESTER)'},
 {'nielsen_dma_rank': 'BURLINGTON-PLATTSBURGH'},
 {'nielsen_dma_rank': 'CHARLOTTE'}]

In [6]:
## Filters for OPIF File Search for all states in February
# filter_list = [
#             opif_timestamp_interval('02/01/20','02/29/20'),
#             {"political_file_type":"PA"}, 
#             {"office_type":"Presidential"},
#             {"campaign_year":"2020"},
#             {"source_service_code":"TV"},
#                       ] 

In [5]:
## Returns top ten results of OPIF File Search based on
## keyword, filters, order, and offset.
def opif_pub_search10(keyword, offset, order='old', filters=None):
    """Fetch a single page of OPIF File Search results.

    Parameters
    ----------
    keyword : str
        Search term, sent as the ``q`` query parameter.
    offset : int
        Index of the first result to return, sent as the ``s`` parameter.
    order : str, optional
        Sort order, sent as the ``o`` parameter ('old', 'best', or 'new').
        Defaults to 'old'.
    filters : list of dict, optional
        Search filters; JSON-encoded and sent as the ``f`` parameter.
        Defaults to the notebook-level ``filter_list``.

    Returns
    -------
    dict
        The ``'response'`` object of the JSON payload.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status, instead of failing later
        while decoding a non-JSON error body.
    """
    url = "https://www.fcc.gov/search/api?t=opif"
    if filters is None:
        filters = filter_list

    # The JSON-encoded filters go in as a plain string value
    parameters = {
        'q': keyword,
        's': str(offset),
        'o': order,
        'f': json.dumps(filters),
    }
    response = requests.get(url, params=parameters)
    response.raise_for_status()

    return response.json()['response']

In [6]:
## Sanity check that the offset works: the second call starts one result
## later, so its listing should be the first listing shifted by one entry.
r1 = opif_pub_search10('bloomberg', 1)
r2 = opif_pub_search10('bloomberg', 2)
for doc in r1['docs']:
    print(doc['file_name'])
print('\n')
for doc in r2['docs']:
    print(doc['file_name'])

Mike Bloomberg 2020 Inc- 1409313
Michael Bloomberg NAB
Mike Bloomberg NAB 2020
Mike Bloomberg 986337
Bloomberg Spot Approval 1 11-22-19
NAB Form Bloomberg US President 2020 - WWBT 11-22-19
NAB Form Bloomberg US President 2020 - WUPV 11-22-19
KGTV Bloomberg (Assembly) - Public File Disclosure
11.21 NAB Form_LOCAL.pdf Michael Bloomberg
Bloomberg 11_25 WWBT 1409383


Michael Bloomberg NAB
Mike Bloomberg NAB 2020
Mike Bloomberg 986337
Bloomberg Spot Approval 1 11-22-19
NAB Form Bloomberg US President 2020 - WWBT 11-22-19
NAB Form Bloomberg US President 2020 - WUPV 11-22-19
KGTV Bloomberg (Assembly) - Public File Disclosure
11.21 NAB Form_LOCAL.pdf Michael Bloomberg
Bloomberg 11_25 WWBT 1409383
Bloomberg 11_25 WUPV 1409539


In [7]:
## Page count implied by numFound at ten results per page
## (one more than needed when numFound is an exact multiple of 10)
r1['numFound'] // 10 + 1

821

In [8]:
import time
from IPython.display import clear_output

## Returns all results of OPIF File Search based on keyword.
def opif_pub_search(keyword):
    """Collect every page of OPIF File Search results for ``keyword``.

    The search applies the notebook-level ``filter_list``, orders results
    oldest first, and walks the result set ten items at a time.

    Parameters
    ----------
    keyword : str
        Search term, sent as the ``q`` query parameter.

    Returns
    -------
    list
        The response objects of the pages that were fetched successfully, in
        page order. The list is cut short if a page request returns a non-200
        status or reports an unexpected start index; a message is printed in
        either case.

    Raises
    ------
    requests.HTTPError
        If the initial request (used to read the total hit count) answers
        with a 4xx/5xx status.
    """
    url = "https://www.fcc.gov/search/api?t=opif"
    page_size = 10    # results per request; page = (offset / 10) + 1
    result = []

    parameters = {
        'q': keyword,
        's': '0',
        'o': 'old',   # choose from 'old', 'best', or 'new'
        'f': json.dumps(filter_list),
    }

    # Probe request: only used to learn how many results exist in total.
    probe = requests.get(url, params=parameters)
    probe.raise_for_status()
    num_found = probe.json()['response']['numFound']

    # Ceiling division. int(numFound / 10) + 1 asks for one extra, empty page
    # whenever numFound is a multiple of 10 (including zero hits).
    total_pages = (num_found + page_size - 1) // page_size

    for page in range(1, total_pages + 1):
        offset = (page - 1) * page_size
        parameters['s'] = str(offset)

        # prints status of method; the line is cleared once the next output arrives
        print("Requesting page {}/{}".format(page, total_pages))
        clear_output(wait=True)

        # API call
        response = requests.get(url, params=parameters)

        # if we get an error, print the response and halt the loop
        if response.status_code != 200:
            print(response.text)
            break

        # if item index does not align with page, notify user and halt the loop
        if response.json()['response']['start'] != offset:
            print("Offset error at s = " + str(offset))
            break

        # add to our result set
        result.append(response)

        # only throttle when the API was actually hit (not a cached result)
        if not getattr(response, 'from_cache', False):
            time.sleep(0.25)

    return result

In [9]:
## Builds one de-duplicated DataFrame from a result set and pickles it
def save_as_df(result, candidate):
    """Flatten search responses into a single DataFrame and pickle it.

    Parameters
    ----------
    result : list
        Response objects whose ``.json()['response']['docs']`` holds the
        search hits.
    candidate : str
        Name stem of the output file; the frame is written to
        ``<candidate>.pkl``.

    Rows sharing an ``id`` are collapsed to their first occurrence (the number
    dropped is printed), the index is renumbered, and a ``file_url`` column is
    inserted at position 3. Returns nothing.
    """
    pages = [pd.DataFrame(resp.json()['response']['docs']) for resp in result]
    combined = pd.concat(pages, sort=True)

    n_repeats = combined.duplicated(subset='id', keep='first').sum()
    print('Number of duplicates dropped: ', n_repeats)
    df = combined.drop_duplicates(subset='id', keep='first').reset_index(drop=True)

    ## Download link: stem + folder_id + '/' + file_manager_id + '.' + file_extension
    url_stem = 'https://publicfiles.fcc.gov/api/manager/download/'
    relative_path = df['folder_id'] + '/' + df['file_manager_id'] + '.' + df['file_extension']
    df.insert(3, 'file_url', url_stem + relative_path)

    df.to_pickle(str(candidate) + '.pkl')

### Run each search at least twice in case you run into a 500 HTTP status code.

In [13]:
## Bloomberg: pool the result pages of each keyword search, in keyword order
bloomberg_files = [
    page
    for keyword in ('mike', 'michael', 'bloomberg')
    for page in opif_pub_search(keyword)
]

Requesting page 821/821


In [14]:
## De-duplicate the Bloomberg hits and write them to bloomberg.pkl
save_as_df(result=bloomberg_files, candidate='bloomberg')

Number of duplicates dropped:  7082


In [15]:
## Sanders: pool the result pages of each keyword search, in keyword order
sanders_files = [
    page
    for keyword in ('sanders', 'bernie')
    for page in opif_pub_search(keyword)
]

Requesting page 170/170


In [16]:
## De-duplicate the Sanders hits and write them to sanders.pkl
save_as_df(result=sanders_files, candidate='sanders')

Number of duplicates dropped:  1419


In [29]:
## Biden: pool the result pages of each keyword search, in keyword order
biden_files = [
    page
    for keyword in ('biden', 'joe')
    for page in opif_pub_search(keyword)
]

Requesting page 30/30


In [18]:
## De-duplicate the Biden hits and write them to biden.pkl
save_as_df(result=biden_files, candidate='biden')

Number of duplicates dropped:  292


In [28]:
## Warren: pool the result pages of each keyword search, in keyword order
warren_files = [
    page
    for keyword in ('warren', 'elizabeth')
    for page in opif_pub_search(keyword)
]

Requesting page 24/24


In [20]:
## De-duplicate the Warren hits and write them to warren.pkl
save_as_df(result=warren_files, candidate='warren')

Number of duplicates dropped:  238


In [27]:
## Buttigieg: pool the result pages of each keyword search, in keyword order
buttigieg_files = [
    page
    for keyword in ('buttigieg', 'pete')
    for page in opif_pub_search(keyword)
]

Requesting page 48/48


In [23]:
## De-duplicate the Buttigieg hits and write them to buttigieg.pkl
save_as_df(result=buttigieg_files, candidate='buttigieg')

Number of duplicates dropped:  353


In [26]:
## Klobuchar: pool the result pages of each keyword search, in keyword order
klobuchar_files = [
    page
    for keyword in ('amy', 'klobuchar')
    for page in opif_pub_search(keyword)
]

Requesting page 69/69


In [25]:
## De-duplicate the Klobuchar hits and write them to klobuchar.pkl
save_as_df(result=klobuchar_files, candidate='klobuchar')

Number of duplicates dropped:  545


In [30]:
## Steyer: pool the result pages of each keyword search, in keyword order
steyer_files = [
    page
    for keyword in ('tom', 'steyer')
    for page in opif_pub_search(keyword)
]

Requesting page 265/265


In [31]:
## De-duplicate the Steyer hits and write them to steyer.pkl
save_as_df(result=steyer_files, candidate='steyer')

Number of duplicates dropped:  2136
