# Data cleaning routine for all candidates

In [1]:
import re, pandas as pd
## Returns NAB files, prints all WNAB files for inspection.
def get_nab(df):
    """Return the rows of ``df`` whose file name looks like a NAB/PB form.

    Rows whose ``callsign`` contains 'NAB' are never returned (presumably
    because the callsign itself appears in their file names and would trip
    the 'NAB' pattern). Their file names are printed instead so they can be
    inspected by hand.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have string columns ``callsign`` and ``file_name``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index kept) whose ``file_name`` matches
        the NAB/PB pattern case-insensitively and whose ``callsign`` does not
        contain 'NAB'.
    """
    nab_pattern = 'NAB|PB18|PB-18|PB 18|PB19|PB-19|PB 19| PB'
    # na=False: a missing callsign / file name counts as "no match" instead of
    # putting NaN into the mask, which would make `~` and boolean indexing fail.
    from_nab_callsign = df['callsign'].str.contains('NAB', na = False)
    print (df[from_nab_callsign]['file_name'])
    nab_like_name = df['file_name'].str.contains(nab_pattern, flags = re.IGNORECASE, regex = True, na = False)
    return df[nab_like_name & ~from_nab_callsign]

## Returns df - NAB files
def select_df(candidate):
    """Load one candidate's pickle, split off the NAB files and save both parts.

    Reads ``<candidate>.pkl`` from the working directory, keeps the columns in
    ``keep_cols``, prepends an upper-cased ``candidate`` column and parses
    ``create_ts`` into datetimes. The rows selected by ``get_nab`` are written
    to ``<candidate>_nab.pkl``; the remaining rows are written to
    ``<candidate>_cleaned.pkl`` and returned. The row counts before and after
    removing the NAB rows are printed.

    Parameters
    ----------
    candidate : str
        Base name of the pickle file, e.g. ``'biden'``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame without the rows selected by ``get_nab``.
    """
    name = str(candidate)
    keep_cols = ['entity_id','callsign','nielsen_dma_rank','create_ts','file_url','file_name',
                 'file_status','file_id','network_affiliation','active_ind']
    # NOTE: unpickling can execute arbitrary code -- only load pickles produced by this project.
    # .copy() makes the column subset an independent frame, so the insert/assignment
    # below cannot trigger SettingWithCopyWarning.
    temp = pd.read_pickle(name + ".pkl")[keep_cols].copy()
    # Upper-case the coerced name so a non-str candidate is handled the same way
    # as in the file names built from str(candidate).
    temp.insert(0, 'candidate', name.upper())
    temp['create_ts'] = pd.to_datetime(temp['create_ts'])
    temp_nab = get_nab(temp)
    temp_nab.to_pickle(name + "_nab" + ".pkl")
    print("Before: ", len(temp))
    # Drop by index label: temp_nab keeps the index of the rows it was taken from.
    temp = temp.drop(temp_nab.index)
    print("After: ", len(temp))
    temp.to_pickle(name + "_cleaned" + ".pkl")
    return temp

In [2]:
## Bloomberg
# Reads bloomberg.pkl, writes bloomberg_nab.pkl and bloomberg_cleaned.pkl, and keeps the cleaned frame.
bloomberg_df = select_df('bloomberg')

6741           WNAB Order # 26780698 Contact # 4197115
6742                             WNAB Order # 26780698
6743                         wnab bloomberg c# 4197115
6959                             WNAB Order # 26785003
6960          WNAB Order # 26785003 Contract # 4204769
6961          WNAB Order # 26788027 Contract # 4217032
6962                             WNAB Order # 26788027
6963                         wnab bloomberg c# 4204769
7064                         wnab bloomberg c# 4217032
7490                             WNAB Order # 26800758
7491          WNAB Order # 26800758 Contract # 4241798
7492          WNAB Order # 26798959 Contract # 4238316
7493                             WNAB Order # 26798959
7494                         wnab bloomberg c# 4238316
7495                         wnab bloomberg c# 4241798
7565          WNAB Order # 26792127 Contract # 4227386
7566                           WNAB Order # 26792127 2
7567                          wnab bloomberg c#4227386
8147      

In [3]:
## Biden
# Reads biden.pkl, writes biden_nab.pkl and biden_cleaned.pkl, and keeps the cleaned frame.
biden_df = select_df('biden')

498                       WNAB Order # 26809807
499    WNAB Order # 26809807 Contract # 4260092
500                            NAB - BIDEN 2020
501               wnab biden c# 4260092 postlog
Name: file_name, dtype: object
Before:  590
After:  480


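#### One NAB-named file (row 500, `NAB - BIDEN 2020`) remains in the cleaned Biden dataset: its callsign contains 'NAB', so `get_nab` does not remove it.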

In [4]:
## Warren
# Reads warren.pkl, writes warren_nab.pkl and warren_cleaned.pkl, and keeps the cleaned frame.
wdf = select_df('warren')

Series([], Name: file_name, dtype: object)
Before:  409
After:  328


In [5]:
## Sanders
# Reads sanders.pkl, writes sanders_nab.pkl and sanders_cleaned.pkl, and keeps the cleaned frame.
sanders_df = select_df('sanders')

Series([], Name: file_name, dtype: object)
Before:  1900
After:  1647


In [6]:
## Buttigieg
# Reads buttigieg.pkl, writes buttigieg_nab.pkl and buttigieg_cleaned.pkl, and keeps the cleaned frame.
pete_df = select_df('buttigieg')

Series([], Name: file_name, dtype: object)
Before:  545
After:  467


In [7]:
## Klobuchar
# Reads klobuchar.pkl, writes klobuchar_nab.pkl and klobuchar_cleaned.pkl, and keeps the cleaned frame.
klobuchar_df = select_df('klobuchar')

644               Amy for America March 3rd NAB
645                 wnab amy c# 4251843 postlog
823                       WNAB Order # 26805433
824    WNAB Order # 26805433 Contract # 4251843
Name: file_name, dtype: object
Before:  842
After:  700


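#### One NAB-named file (row 644, `Amy for America March 3rd NAB`) remains in the cleaned Klobuchar dataset: its callsign contains 'NAB', so `get_nab` does not remove it.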

In [8]:
## Steyer
# Reads steyer.pkl, writes steyer_nab.pkl and steyer_cleaned.pkl, and keeps the cleaned frame.
steyer_df = select_df('steyer')

2608    WNAB Order # 26804810 Contract # 4250203
2609                       WNAB Order # 26804810
2610                                      TS NAB
2611              wnab steyer c# 4250203 postlog
Name: file_name, dtype: object
Before:  2657
After:  2283


#### One NAB-named file (row 2610, `TS NAB`) remains in the cleaned Steyer dataset: its callsign contains 'NAB', so `get_nab` does not remove it.

In [29]:
## But this was the result count for February!

Before: 3399
After: 2973


In [144]:
# Preview the first five rows of the cleaned Steyer frame.
steyer_df.head(5)

Unnamed: 0,candidate,entity_id,callsign,nielsen_dma_rank,create_ts,file_url,file_name,file_status,file_id,network_affiliation,active_ind
2,STEYER,35648,KTAL-TV,SHREVEPORT,2019-11-26 22:32:51.193000+00:00,https://publicfiles.fcc.gov/api/manager/downlo...,Tom Steyer Est. #8139 11.28.19-12.04.19,com_cpy,9d3bb5bb-b1ed-2623-7ff8-48750413a2e1,NBC,Y
3,STEYER,35670,KTLA,LOS ANGELES,2019-11-27 01:13:05.639000+00:00,https://publicfiles.fcc.gov/api/manager/downlo...,New- Pol- Tom Steyer- President 2239734--1 NEW,com_cpy,6160b22a-a291-8a16-8adf-33a513ec50ee,CW,N
5,STEYER,35648,KTAL-TV,SHREVEPORT,2019-12-04 22:20:36.662000+00:00,https://publicfiles.fcc.gov/api/manager/downlo...,Tom Steyer Est. #8139 11.28.19-12.04.19 - Rev,com_cpy,3bf4c262-115a-e53b-0a21-125c6735d94c,NBC,Y
7,STEYER,35648,KTAL-TV,SHREVEPORT,2019-12-09 19:35:41.895000+00:00,https://publicfiles.fcc.gov/api/manager/downlo...,Tom Steyer Est. #8170 12.16.19-12.22.19,com_cpy,120a40b5-23e4-1aac-dae1-a7688cf9d5f3,NBC,Y
8,STEYER,35648,KTAL-TV,SHREVEPORT,2019-12-09 19:35:41.893000+00:00,https://publicfiles.fcc.gov/api/manager/downlo...,Tom Steyer Est. #8169 12.10.19-12.15.19,com_cpy,1908707a-1489-ed51-b313-262936b6fb29,NBC,Y


## Checks for data integrity

In [9]:
import re

## Checks input df for all candidates except for the given candidate key
def check_str(key):
    """Build the regex alternation of every candidate's names except ``key``'s.

    Parameters
    ----------
    key : str
        One of 'sanders', 'biden', 'warren', 'buttigieg', 'klobuchar',
        'steyer' or 'bloomberg' (lower case).

    Returns
    -------
    str
        '|'-separated first/last names of all the other candidates, in the
        fixed order listed below.

    Raises
    ------
    ValueError
        If ``key`` is not a known candidate. The previous behaviour was to
        return the string 'Error', which callers would then silently use as
        a search pattern.
    """
    # NOTE(review): the names are plain substrings, so under a case-insensitive
    # search short ones such as 'Tom' also match inside longer words ('Tommy').
    candidate_names = (
        ('sanders', 'Bernie|Sanders'),
        ('biden', 'Joe|Biden'),
        ('warren', 'Elizabeth|Warren'),
        ('buttigieg', 'Pete|Buttigieg'),
        ('klobuchar', 'Amy|Klobuchar'),
        ('steyer', 'Tom|Steyer'),
        ('bloomberg', 'bloomberg|Bloomberg'),
    )
    others = [names for candidate, names in candidate_names if candidate != key]
    if len(others) == len(candidate_names):
        # Nothing was filtered out, so key is not one of the known candidates.
        raise ValueError("Unknown candidate key: " + repr(key))
    return '|'.join(others)
    
def check(df, key):
    """Return the file names in ``df`` that mention a candidate other than ``key``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a string column ``file_name``.
    key : str
        Candidate key passed to ``check_str``; that candidate's own names are
        left out of the search.

    Returns
    -------
    pandas.Series
        The matching ``file_name`` values, keeping the index of ``df``. The
        match is a case-insensitive regex search.
    """
    other_names = check_str(key)
    # na=False: rows with a missing file name count as "no match" instead of
    # putting NaN into the mask, which boolean indexing rejects.
    mentions_other = df['file_name'].str.contains(other_names, flags = re.IGNORECASE, regex = True, na = False)
    return df.loc[mentions_other, 'file_name']

In [10]:
# Lists Bloomberg rows whose file name mentions another candidate.
# NOTE(review): 'Tom' is matched as a case-insensitive substring, so file names
# containing 'Tommy' are reported too -- review the hits before dropping anything.
check(bloomberg_df, 'bloomberg')

1317                   MIKE BLOOMBERG--TOMMY MRBPBK6028EH
1360                              Political Ad Form-Tommy
1760                   MIKE BLOOMBERG--TOMMY MRBPBK6028EH
2091                   MIKE BLOOMBERG--TOMMY MRBPBK6028EH
2367                       2.18.20 Mike Bloomberg - Tommy
2729                   MIKE BLOOMBERG--TOMMY MRBPBK6028EH
7057    Bloomberg US President WUPV review form - Tomm...
7059    Bloomberg US President WWBT review form - Tomm...
7421                                     MRBPK6028H Tommy
Name: file_name, dtype: object

In [11]:
# Lists Biden rows whose file name mentions another candidate; an empty Series means no hits.
check(biden_df, 'biden')

Series([], Name: file_name, dtype: object)

In [12]:
# Lists Warren rows whose file name mentions another candidate; an empty Series means no hits.
check(wdf, 'warren')

Series([], Name: file_name, dtype: object)

In [13]:
# Lists Sanders rows whose file name mentions another candidate; an empty Series means no hits.
check(sanders_df, 'sanders')

1184    Tom Steyer for President 2020 Order 1007906
1842                              Bloomberg AD 2.28
Name: file_name, dtype: object

In [14]:
# Drop the Sanders rows that name another candidate, then overwrite the cleaned pickle.
print("Before: " + str(sanders_df.shape[0]))
sanders_df.drop(index = check(sanders_df, 'sanders').index, inplace = True)
sanders_df.to_pickle("sanders_cleaned.pkl")
print("After: " + str(sanders_df.shape[0]))

Before: 1647
After: 1645


In [15]:
# Reload the Sanders NAB subset that select_df('sanders') wrote to disk.
sanders_nab = pd.read_pickle("sanders_nab.pkl")

In [16]:
# Drop the Sanders NAB rows that name another candidate, then overwrite the NAB pickle.
print("Before: " + str(sanders_nab.shape[0]))
sanders_nab.drop(index = check(sanders_nab, 'sanders').index, inplace = True)
sanders_nab.to_pickle("sanders_nab.pkl")
print("After: " + str(sanders_nab.shape[0]))

Before: 253
After: 252


In [17]:
# Lists Buttigieg rows whose file name mentions another candidate; an empty Series means no hits.
check(pete_df, 'buttigieg')

Series([], Name: file_name, dtype: object)

In [18]:
# Lists Klobuchar rows whose file name mentions another candidate; an empty Series means no hits.
check(klobuchar_df, 'klobuchar')

Series([], Name: file_name, dtype: object)

In [19]:
# Lists Steyer rows whose file name mentions another candidate; an empty Series means no hits.
check(steyer_df, 'steyer')

600     Bloomberg US President WUPV review form - Tomm...
601     Bloomberg US President WWBT review form - Tomm...
604                    MIKE BLOOMBERG--TOMMY MRBPBK6028EH
819                    MIKE BLOOMBERG--TOMMY MRBPBK6028EH
894                    MIKE BLOOMBERG--TOMMY MRBPBK6028EH
989                        2.18.20 Mike Bloomberg - Tommy
1174                   MIKE BLOOMBERG--TOMMY MRBPBK6028EH
Name: file_name, dtype: object

In [20]:
## Corrections for data entry errors
# Drop the Steyer rows that name another candidate, then overwrite the cleaned pickle.
steyer_df.drop(index = check(steyer_df, 'steyer').index, inplace = True)
steyer_df.to_pickle("steyer_cleaned.pkl")
print("After: " + str(steyer_df.shape[0]))

After: 2276


In [21]:
# Reload the Steyer NAB subset that select_df('steyer') wrote to disk.
steyer_nab = pd.read_pickle("steyer_nab.pkl")
# Show the index labels of the NAB rows whose file name mentions another candidate.
check(steyer_nab, 'steyer').index

Int64Index([772, 773], dtype='int64')

In [22]:
# Drop the Steyer NAB rows that name another candidate, then overwrite the NAB pickle.
print("Before: " + str(steyer_nab.shape[0]))
steyer_nab.drop(index = check(steyer_nab, 'steyer').index, inplace = True)
steyer_nab.to_pickle("steyer_nab.pkl")
print("After: " + str(steyer_nab.shape[0]))

Before: 374
After: 372
