In [1]:
# dependencies
import numpy as np
import pandas as pd

In [2]:
def labels_from_audit(manual_audit):
    """Turn manual audit verdicts into binary labels.

    Args:
        manual_audit: mapping of key -> verdict.

    Returns:
        A list of ``(key, label)`` tuples in the mapping's iteration order.
        ``label`` is 1 when the verdict compares equal to ``True`` and 0 for
        every other verdict.
    """
    labels = []
    for key, verdict in manual_audit.items():
        # Equality (not identity) on purpose: mirrors the `== True` check used
        # when the audit is summarised.
        labels.append((key, 1 if verdict == True else 0))
    return labels

In [3]:
# Manual audit of articles, keyed by article_id.
# Each article_id must appear exactly once: a repeated key in a dict literal is
# silently collapsed into a single entry, which makes the literal disagree with len().
#
# Verdicts:
# True if police shooting/misconduct/trials/terminations in content
# 'possibly False' if content describes an arrest/traffic stop but no misconduct/trials/etc
# 'likely False' if content is PSA, bureaucracy; mentions police, but not police actions/accusations
# Arrest summaries (i.e. 8306) could be parsed with regex to filter for officer arrest
manual_audit = {
    2613: True,
    5668: 'possibly False',
    5266: 'likely False',
    4151: 'possibly False',
    10632: True,
    7688: 'possibly False',   # victim died from injuries after being found by police
    1434: True,
    7330: True,
    27057: True,
    21559: True,
    9790: 'possibly False',
    4582: 'possibly False',
    4985: 'possibly False',
    5910: 'possibly False',   # community expressed fear, police got video cameras, overtime
    2327: 'likely False',
    4003: True,    # participant in Capitol riots telling his story, tear gas, police shooting
    1091: 'possibly False',   # victim died from injuries after being found by police
    9541: 'likely False',     # details a new tech/lab police gained access to
    1199: True,
    6123: 'likely False',
    9021: True,
    1710: 'possibly False',
    1935: True,
    1937: True,
    8325: 'likely False',
    1780: True,
    1463: 'possibly False',   # school hired felon when background check from police took 21 days, police victim blame
    460: 'likely False',
    7323: 'likely False',     # details of Jussie Smollett case
    1502: True,
    8627: 'possibly False',
    29874: True,
    6209: True,
    1536: 'likely False',
    2978: 'likely False',
    1679: True,
    9413: True,
    1277: True,
    9005: True,
    20131: True,
    1116: True,
    7432: True,
    12526: True,
    4841: 'possibly False',
    5669: 'possibly False',   # pseudo-unique?
    9501: 'likely False',
    405: True,
    8718: 'possibly False',
    18610: True,   # pseudo-unique?
    3821: 'likely False',     # trials details but
    2395: 'likely False',
    768: 'likely False',
    3845: True,
    8371: True,
    8306: 'possibly False',
    2902: True,
    13800: 'possibly False',
    1499: True,
    7712: 'possibly False',
    10250: 'possibly False',
    8159: 'likely False',
    1478: True,
    19097: True,
    20671: True,
    2459: True,
    6051: 'possibly False',     # describes a court order to seal a case where a man killed a cop
    10275: 'likely False',
    3267: 'likely False',
    674: 'likely False',
    1840: 'likely False',
    7663: 'possibly False',
    2369: True,
    2201: 'likely False',
    3332: 'possibly False',
    10273: 'likely False',
    1690: True,
    8045: True,
    9449: 'likely False',
    10255: 'likely False',
    4690: 'possibly False',   # community expressed fear, police got video cameras, overtime
    649: 'possibly False',   # victim died from injuries after being found by police
    10488: True,     # judge used racial slurs when her home was burglarized, was sedated atm
    9600: True,
    9423: 'likely False',
    2946: 'likely False',
    7360: 'possibly False',
    15599: True,
    4281: True,
    1096: True,
    11073: True,
    4538: 'likely False',
    1345: 'likely False',
    21632: True,
    30695: True,
    21285: True,
    5712: 'likely False',    # misconduct but on the part of athletes and coaches, RE: Title IX
    8060: True,
    10166: True,
    30616: True,
    7896: 'likely False',
    780: 'possibly False',   # victim died from injuries after being found by police
    742: True,
    9897: True,
    997: 'likely False',
    9404: 'likely False',    # police showing off a training exercise: deadly force at nighttime
    4603: True,     # 'if you have someone you really want to put away'
}

In [6]:
# args
# Parquet file read below; the relative path is resolved against the kernel's working directory.
merged_f = '../output/merged.parquet'

# load data
# Later cells read the `article_id`, `relevant` and `content` columns of this frame.
merged = pd.read_parquet(merged_f)
# Collapse the three-level manual audit into (article_id, 0/1) pairs; only a True verdict maps to 1.
labels = labels_from_audit(manual_audit)
# NOTE(review): `labels` is not consumed by any visible cell, and the DataFrame conversion
# below is disabled — confirm whether it is still needed.
#man_df = pd.DataFrame(labels, columns=['article_id', 'secondary'])

### positive inspection

In [7]:
# Article ids whose `relevant` flag equals 1.
rel_ids = set(merged.loc[merged['relevant'] == 1, 'article_id'].unique())
# Ids that already have a manual verdict.
manual_ids = set(manual_audit)
# Flagged ids with no manual verdict yet.
remaining = rel_ids - manual_ids

In [8]:
# Audit summary: total inspected, then one count per verdict (booleans sum as 0/1).
print('inspected:\t', len(manual_audit))
print('True positive:\t', sum(val == True for val in manual_audit.values()))
print('possibly False:\t', sum(val == 'possibly False' for val in manual_audit.values()))
print('likely False:\t', sum(val == 'likely False' for val in manual_audit.values()))

inspected:	 106
True positive:	 51
possibly False:	 25
likely False:	 30


In [11]:
# Show the raw text of one article (id 588) for manual inspection.
merged.loc[merged['article_id'] == 588, 'content'].values

array(['The following people were booked into East Baton Rouge Parish Prison or issued a summons by the Zachary Police Department from Sept. 17-23: Veronica Bell: 37; 900 W. Walnut St., Gloster, Mississippi; theft Gisele Brown: 47; 4158 Florida St., Zachary; theft Steven Ellis: 34; 1507 Grant St., Laplace; domestic abuse battery – child endangerment Adrieana Fields: 29; 3953 Winnebago St., Baton Rouge; theft Dmarkus Freeman: 27; 4065 S. Barrow Drive, Baton Rouge; theft Sebastian Graves: 20; 6419 Pride-Port Hudson Road, Slaughter; simple criminal damage to property Mignon Grayson: 29; 3273 Addison St., Baton Rouge; monetary instrument abuse and theft Dari Green: 30; 3930 Cypress Park, Zachary; theft Devin Mansur: 21; 3001 Shaffett Lane, Zachary; criminal mischief Brad Rodriguez: 41; 20042 Eastwood Drive, Zachary; domestic abuse battery — strangulation and domestic abuse battery — child endangerment Heath Perkins: 43; 6302 West Ave. Jackson; possession/distribution of Schedule II drugs, 

In [10]:
# Annotation worksheet: one row per article id, every verdict still the empty-string placeholder.
# NOTE(review): this literal rebinds `remaining`, which an earlier cell computes as a set
# (rel_ids minus manual_ids). The keys are presumably a pasted snapshot of that set — confirm,
# and consider a distinct name so a re-run does not swap a set for a dict.
# NOTE(review): this cell carries execution count 10 but sits after the cell numbered 11.
#
# Verdicts to fill in:
# True if police shooting/misconduct/trials/terminations in content
# 'possibly False' if content describes an arrest/traffic stop but no misconduct/trials/etc
# 'likely False' if content is PSA, bureaucracy; mentions police, but not police actions/accusations
# Arrest summaries (i.e. 8306) could be parsed with regex to filter for officer arrest
remaining = {
    588: '',
    667: '',
    701: '',
    792: '',
    887: '',
    1013: '',
    1025: '',
    1080: '',
    1138: '',
    1179: '',
    1294: '',
    1378: '',
    1420: '',
    1429: '',
    1480: '',
    1548: '',
    1612: '',
    1739: '',
    1751: '',
    1795: '',
    1842: '',
    1867: '',
    1895: '',
    1898: '',
    2041: '',
    2054: '',
    2144: '',
    2177: '',
    2178: '',
    2205: '',
    2223: '',
    2227: '',
    2229: '',
    2319: '',
    2477: '',
    2632: '',
    2663: '',
    2825: '',
    2849: '',
    2869: '',
    2934: '',
    3012: '',
    3146: '',
    3153: '',
    3358: '',
    3406: '',
    3483: '',
    3611: '',
    3735: '',
    3803: '',
    3836: '',
    3848: '',
    3934: '',
    3992: '',
    4094: '',
    4180: '',
    4181: '',
    4193: '',
    4227: '',
    4269: '',
    4375: '',
    4444: '',
    4452: '',
    4571: '',
    4695: '',
    4759: '',
    4776: '',
    4791: '',
    4837: '',
    4986: '',
    5087: '',
    5161: '',
    5174: '',
    5211: '',
    5215: '',
    5260: '',
    5296: '',
    5300: '',
    5532: '',
    5681: '',
    5730: '',
    5746: '',
    5848: '',
    5978: '',
    6135: '',
    6196: '',
    6199: '',
    6205: '',
    6210: '',
    6254: '',
    6280: '',
    6379: '',
    6419: '',
    6423: '',
    6456: '',
    6474: '',
    6565: '',
    6588: '',
    6629: '',
    6843: '',
    6876: '',
    6879: '',
    6890: '',
    6910: '',
    7109: '',
    7254: '',
    7423: '',
    7463: '',
    7469: '',
    7515: '',
    7520: '',
    7554: '',
    7637: '',
    7651: '',
    7653: '',
    7685: '',
    7710: '',
    7739: '',
    7758: '',
    7806: '',
    7844: '',
    7890: '',
    7938: '',
    7982: '',
    8017: '',
    8254: '',
    8310: '',
    8375: '',
    8391: '',
    8403: '',
    8464: '',
    8560: '',
    8565: '',
    8581: '',
    8655: '',
    8672: '',
    8719: '',
    8771: '',
    8794: '',
    8835: '',
    8853: '',
    8892: '',
    8914: '',
    9078: '',
    9098: '',
    9115: '',
    9118: '',
    9132: '',
    9142: '',
    9181: '',
    9183: '',
    9232: '',
    9282: '',
    9313: '',
    9324: '',
    9369: '',
    9374: '',
    9406: '',
    9407: '',
    9444: '',
    9454: '',
    9475: '',
    9538: '',
    9615: '',
    9647: '',
    9648: '',
    9657: '',
    9679: '',
    9747: '',
    9843: '',
    9910: '',
    9923: '',
    10019: '',
    10039: '',
    10043: '',
    10128: '',
    10140: '',
    10155: '',
    10167: '',
    10192: '',
    10213: '',
    10261: '',
    10270: '',
    10354: '',
    10397: '',
    10400: '',
    10410: '',
    10462: '',
    10504: '',
    10509: '',
    10554: '',
    11998: '',
    12113: '',
    13893: '',
    14669: '',
    18548: '',
    18757: '',
    20156: '',
    20909: '',
    21401: '',
    22073: '',
    22579: '',
    22993: '',
    23468: '',
    25908: '',
    27087: '',
    27487: '',
    29704: '',
    30133: '',
    31316: ''
}

### negative inspection