In [1]:
# dependencies
import yaml
import numpy as np
import pandas as pd

In [2]:
def read_yaml(filename):
    assert open(filename)
    with open(filename, 'r') as f:
        data = yaml.load(f, Loader=yaml.SafeLoader)
    return data


def labels_from_audit(manual_audit):
    labels = [(key, 1) if val == True else (key, 0) for key,val in manual_audit.items()]
    return labels

In [3]:
# args
merged_f = '../output/merged.parquet'
ai_labels_f = '../hand/review_testdf.csv'
bp_labels_f = '../hand/review_random.yml'

# load data
merged = pd.read_parquet(merged_f)
ai_labels_df = pd.read_csv(ai_labels_f, usecols=['article_id', 'actual_relevancy'])
bp_labels_dict = read_yaml(bp_labels_f)

In [4]:
labels = labels_from_audit(bp_labels_dict)
man_df = pd.DataFrame(labels, columns=['article_id', 'actual_relevancy'])

In [5]:
ai_labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   article_id        30 non-null     int64
 1   actual_relevancy  30 non-null     int64
dtypes: int64(2)
memory usage: 608.0 bytes


In [6]:
man_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184 entries, 0 to 183
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   article_id        184 non-null    int64
 1   actual_relevancy  184 non-null    int64
dtypes: int64(2)
memory usage: 3.0 KB


In [7]:
ai_ids = set(ai_labels_df.article_id.unique())
bp_ids = set(man_df.article_id.unique())

In [8]:
both = ai_ids.intersection(bp_ids)
both

{8719, 8771, 9282, 9449, 9541, 9600, 9747, 10250, 10255, 18610}

In [9]:
both_ids = ai_ids.intersection(bp_ids)
both_df = merged.loc[merged.article_id.isin(both_ids)]
both_df

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
29240,18610,8079.0,2,Julia Guilbeau,Wayne Griffin fired from Lafayette Police Depa...,&quot;It would be interesting to learn why he ...,"Sgt. Wayne Griffin, who was placed on administ...",,{'terminated'},1,1,1,0
29241,18610,8078.0,2,Julia Guilbeau,Wayne Griffin fired from Lafayette Police Depa...,Griffin was previously named as interim police...,"Sgt. Wayne Griffin, who was placed on administ...",72518.0,{'police'},1,1,1,0
29242,18610,8078.0,2,Julia Guilbeau,Wayne Griffin fired from Lafayette Police Depa...,Griffin was previously named as interim police...,"Sgt. Wayne Griffin, who was placed on administ...",101884.0,{'police'},1,1,1,0
29243,18610,8077.0,2,Julia Guilbeau,Wayne Griffin fired from Lafayette Police Depa...,Multiple sources told The Current Griffin was ...,"Sgt. Wayne Griffin, who was placed on administ...",,{'police'},1,1,1,0
29244,18610,8076.0,2,Julia Guilbeau,Wayne Griffin fired from Lafayette Police Depa...,"Wayne Griffin, who was placed on administrativ...","Sgt. Wayne Griffin, who was placed on administ...",73862.0,{'police'},1,1,1,0
29245,18610,8076.0,2,Julia Guilbeau,Wayne Griffin fired from Lafayette Police Depa...,"Wayne Griffin, who was placed on administrativ...","Sgt. Wayne Griffin, who was placed on administ...",98862.0,{'police'},1,1,1,0
32161,10255,3997.0,16,"TIFFANY FLOURNOY, KTBS TV Contributing Writer",Police: Minden woman fatally stabs brother of ...,Harrison was transferred to Ochsner LSU Health...,"MINDEN, La. — Police in Minden have arrested a...",96193.0,{'police'},1,0,0,0
32162,10255,3996.0,16,"TIFFANY FLOURNOY, KTBS TV Contributing Writer",Police: Minden woman fatally stabs brother of ...,"Angela Washington, 47, has been charged with s...","MINDEN, La. — Police in Minden have arrested a...",129207.0,{'police'},1,0,0,0
32163,10255,3995.0,16,"TIFFANY FLOURNOY, KTBS TV Contributing Writer",Police: Minden woman fatally stabs brother of ...,"MINDEN, La. — Police in Minden have arrested a...","MINDEN, La. — Police in Minden have arrested a...",,{'police'},1,0,0,0
32164,10250,3994.0,2,Jennifer Wadsworth,LSU professor arrested after IT finds child po...,Police say the videos depicted rape on a range...,An LSU professor was arrested at work Friday a...,,{'police'},1,0,0,0


### positive inspection

In [10]:
rel_ids = set(merged.loc[merged.relevant == 1].article_id.unique())
manual_ids = set(bp_labels_dict.keys())
remaining = rel_ids.difference(manual_ids)

In [11]:
print('inspected:\t', len(bp_labels_dict))
print('True positive:\t', len([1 for key,val in bp_labels_dict.items() if val == True]))
print('possibly False:\t', len([1 for key,val in bp_labels_dict.items() if val == 'possibly False']))
print('likely False:\t', len([1 for key,val in bp_labels_dict.items() if val == 'likely False']))

inspected:	 184
True positive:	 76
possibly False:	 45
likely False:	 63


In [12]:
for key in remaining:
    print(key)
    print(merged.loc[merged.article_id == key].title.unique(), '\n')

31316
['Did detectives hoodwink a court commissioner in a fatal poisoning case? Prosecutors say no.'] 

7254
['East Baton Rouge Parish officials join together to distribute special tool, speak on wastewater system efforts'] 

4695
['Lafayette man arrested in connection to shots fired in Abbeville'] 

8794
['Louisiana rapper JayDaYoungan indicted on federal gun charge stemming from Bogalusa traffic stop'] 

5211
['New exhibit at National WWII Museum commemorates 80th anniversary of Pearl Harbor attacks'] 

5215
['NOPD celebrates its newest round of graduates'] 

9313
['Victims named in Shreveport Monday morning double homicide'] 

5730
['The 318: Caddo Parish tackles animal issues, Amazon TIF update, Bossier City joins lawsuit'] 

4193
['Caddo Commission advances ordinance for mandatory sterilization of all dogs, cats'] 

9324
['Former Haughton star Prescott nominated for Walter Payton NFL Man Of The Year Award'] 

6254
['Did ‘The Flash’ Really Just Kill Off [Spoiler]? (RECAP)'] 

10354

In [13]:
#merged.loc[merged.article_id == 31316].content.values[0]

### negative inspection

In [14]:
merged.loc[(merged.extracted_keywords == "{'terminated', 'police'}") & (merged.relevant == 0)]

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test


In [15]:
merged.loc[(merged.extracted_keywords == "{'terminated', 'officer'}") & (merged.relevant == 0)]

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
35624,1490,315.0,1,Marta Jewson,District accuses Singleton Charter School of r...,“As a result of the issuance of the Notice o...,Months after an investigation found allegedly ...,,"{'terminated', 'officer'}",1,0,0,0


In [16]:
merged.loc[(merged.extracted_keywords == "{'terminated', 'police', 'officer'}") & (merged.relevant == 0)]

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
33467,7500,2812.0,32,"Makenzie Boucher, Shreveport Times",Power out at Huntington High School: students ...,More:Shreveport Police Department officer term...,Caddo Parish School announced Monday morning t...,,"{'terminated', 'police', 'officer'}",1,0,0,0


In [17]:
merged.loc[(merged.extracted_keywords == "{'nopd', 'police', 'officer'}") &\
           (merged.relevant == 0)]

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
25502,31073,13593.0,2,Staff Editorial,Our Views: The Times-Picayune makes these reco...,Salaries for higher ranking officers lag signi...,Voters in three area parishes are being asked ...,,"{'nopd', 'police', 'officer'}",1,0,0,0
30434,15104,6391.0,19,Kylee Bond,Man arrested after allegedly striking NOPD off...,\nNEW ORLEANS (WGNO) — A man was arrested in N...,\nNEW ORLEANS (WGNO) — A man was arrested in N...,,"{'nopd', 'police', 'officer'}",1,0,0,0
30682,14389,6036.0,19,Anna McAllister,Mothers who lost sons to gun violence plead fo...,Goyeneche says the lack of police officers on ...,\nNEW ORLEANS (WGNO) — In less than a week and...,,"{'nopd', 'police', 'officer'}",1,0,0,0
30861,13885,5777.0,19,Curt Sprang,Contractors working on one of the city's bigge...,The City’s Office of Police Secondary Employme...,\nNEW ORLEANS (WGNO) After a thief stole a gun...,,"{'nopd', 'police', 'officer'}",1,0,0,0
32962,8499,3212.0,1,Michael Isaac Stein,Expanded ‘Royal Street Patrol’ will soon launc...,Ellestad said that while he didn’t know whethe...,An off-duty police detail called the Royal Str...,,"{'nopd', 'police', 'officer'}",1,0,0,0
32972,8499,3202.0,1,Michael Isaac Stein,Expanded ‘Royal Street Patrol’ will soon launc...,The Royal Street Patrol will be staffed by non...,An off-duty police detail called the Royal Str...,,"{'nopd', 'police', 'officer'}",1,0,0,0
32973,8499,3201.0,1,Michael Isaac Stein,Expanded ‘Royal Street Patrol’ will soon launc...,The program will be overseen by the French Qua...,An off-duty police detail called the Royal Str...,,"{'nopd', 'police', 'officer'}",1,0,0,0
32974,8499,3200.0,1,Michael Isaac Stein,Expanded ‘Royal Street Patrol’ will soon launc...,An off-duty police detail called the Royal Str...,An off-duty police detail called the Royal Str...,,"{'nopd', 'police', 'officer'}",1,0,0,0
33896,6211,2370.0,19,Kenny Lopez,Carjackings in New Orleans are up 170 percent ...,"On Monday, NOPD officers caught 3 juvenile car...",\nNEW ORLEANS — According to The Metropolitan ...,,"{'nopd', 'police', 'officer'}",1,0,0,0


In [18]:
merged.loc[(merged.extracted_keywords == "{'terminated'}") & (merged.relevant == 0)]

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
25891,30022,13261.0,16,Celebretainment,Amanda Bynes reveals ambitions after having co...,Amanda Bynes plans to finish her degree after ...,Amanda Bynes plans to finish her degree after ...,,{'terminated'},1,0,0,0
26157,28987,12905.0,16,"Damian Holbrook, TV Insider",‘Superman & Lois’: Alex Garfin Breaks Down Jor...,"Thankfully, she came to that realization in ti...",[Warning: The below contains MAJOR spoilers fo...,,{'terminated'},1,0,0,0
26350,28329,12451.0,2,Wilson Alexander,Brian Kelly supports Frank Wilson after allega...,Lewis added the accusations to a lawsuit she f...,Brian Kelly said LSU supports running backs co...,,{'terminated'},1,0,0,0
26385,28124,12614.0,16,Celebretainment,Britney Spears says Euphoria made her 'anxiety...,Britney Spears says &#x27;Euphoria&#x27; made ...,Britney Spears says &#x27;Euphoria&#x27; made ...,,{'terminated'},1,0,0,0
26503,27678,12364.0,8,Russell Hedges,Men’s college basketball: LSU fires head coach...,Bill Armstrong has also been terminated as Ass...,Staff Reports LSU has fired head men’s basketb...,,{'terminated'},1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35882,1066,192.0,1,Nicholas Chrastil,Despite defending reality series set inside ja...,"Last month, Emily Washington with the Roderick...",When Netflix announced last month that a new s...,,{'terminated'},1,0,0,0
35957,751,585.0,2,James Finn,Rapper JayDaYoungan posts Instagram photo from...,This employee was terminated for violating pol...,After a prominent rapper was arrested and accu...,,{'terminated'},1,0,0,0
36028,465,106.0,1,Nicholas Chrastil,Orleans sheriff’s office criticized for allowi...,"In early 2020, after we raised concerns about ...",Civil rights attorneys representing people inc...,,{'terminated'},1,0,0,0
36029,465,105.0,1,Nicholas Chrastil,Orleans sheriff’s office criticized for allowi...,"The attorneys, who are part of a longstandin...",Civil rights attorneys representing people inc...,,{'terminated'},1,0,0,0


In [19]:
merged.extracted_keywords.unique()

array([None, "{'police'}", "{'officer'}", "{'police', 'officer'}",
       "{'nopd'}", "{'nopd', 'police', 'officer'}", "{'nopd', 'officer'}",
       "{'nopd', 'police'}", "{'terminated'}", "{'terminated', 'police'}",
       "{'terminated', 'police', 'officer'}", "{'terminated', 'officer'}"],
      dtype=object)

### Most popular words

In [20]:
relevant = merged.loc[merged.relevant == 1, 'content'].unique().tolist()
tokens = {}
for content in relevant:
    chunks = content.split(' ')
    for chunk in chunks:
        if chunk not in tokens:
            tokens[chunk] = 1
        else:
            tokens[chunk] += 1
{k: v for k, v in sorted(tokens.items(), key=lambda item: item[1], reverse=True)}

{'the': 5437,
 'to': 2918,
 'of': 2549,
 'and': 2493,
 'a': 2468,
 'in': 2043,
 'was': 1240,
 'that': 1086,
 'for': 986,
 'on': 934,
 'with': 841,
 'he': 747,
 'is': 706,
 'said': 655,
 'his': 639,
 'The': 639,
 'at': 633,
 'as': 525,
 'an': 468,
 'from': 449,
 'has': 438,
 'not': 437,
 'by': 433,
 'who': 412,
 'be': 380,
 'Police': 374,
 'have': 366,
 'are': 365,
 'said.': 331,
 '': 328,
 'police': 316,
 'will': 306,
 'were': 300,
 'after': 299,
 'her': 282,
 'been': 275,
 'or': 272,
 'had': 269,
 'it': 267,
 'she': 252,
 'they': 252,
 'this': 249,
 'their': 225,
 'but': 224,
 'I': 222,
 'when': 213,
 'He': 212,
 'which': 198,
 'about': 190,
 'into': 188,
 'two': 188,
 'State': 183,
 'Louisiana': 181,
 'him': 177,
 'Parish': 175,
 'also': 174,
 'officers': 173,
 'would': 169,
 'one': 168,
 'people': 168,
 'more': 167,
 'we': 157,
 'A': 154,
 'all': 152,
 'out': 149,
 'during': 148,
 '—': 147,
 'Lafayette': 145,
 'New': 143,
 'no': 140,
 'over': 140,
 'up': 131,
 'you': 129,
 'because'

In [21]:
targets = ['force', 'complaint', 'misconduct', 'harassment', 'allegedly', 'accused']
target_ids = {}
for tup in merged.itertuples():
    if (tup.content != None):
        if (('police' in tup.content) | ('officer' in tup.content)):
            chunks = tup.content.lower().split(' ')
            found = set()
            for target in targets:
                if target in chunks:
                    found.add(target)
            if found != set():
                if str(found) not in target_ids:
                    target_ids[str(found)] = [tup.article_id]
                else:
                    target_ids[str(found)].append(tup.article_id)

In [22]:
target_counts = {k:len(vals) for k, vals in target_ids.items()}
{k: v for k, v in sorted(target_counts.items(), key=lambda item: item[1], reverse=True)}

{"{'accused'}": 549,
 "{'force'}": 514,
 "{'allegedly'}": 473,
 "{'complaint'}": 181,
 "{'allegedly', 'accused'}": 139,
 "{'allegedly', 'force'}": 93,
 "{'harassment'}": 56,
 "{'allegedly', 'force', 'accused'}": 35,
 "{'force', 'accused'}": 32,
 "{'allegedly', 'complaint', 'accused'}": 27,
 "{'misconduct', 'harassment', 'complaint', 'accused'}": 27,
 "{'harassment', 'complaint'}": 23,
 "{'misconduct', 'allegedly', 'force', 'accused'}": 23,
 "{'misconduct', 'accused'}": 20,
 "{'misconduct', 'allegedly'}": 20,
 "{'complaint', 'accused'}": 19,
 "{'misconduct', 'force'}": 19,
 "{'misconduct', 'force', 'accused'}": 19,
 "{'misconduct', 'complaint', 'force'}": 18,
 "{'allegedly', 'harassment'}": 17,
 "{'misconduct', 'allegedly', 'accused'}": 17,
 "{'misconduct', 'harassment', 'complaint'}": 15,
 "{'harassment', 'complaint', 'accused'}": 15,
 "{'misconduct', 'complaint'}": 14,
 "{'misconduct'}": 14,
 "{'misconduct', 'allegedly', 'complaint', 'accused'}": 11,
 "{'harassment', 'complaint', 'all

In [23]:
just_ids = [article_id for key, article_id_list in target_ids.items() for article_id in article_id_list ]
neg_audit_df = merged.loc[(merged.article_id.isin(just_ids)) & (merged.relevant == 0), ['article_id', 'content', 'relevant']].drop_duplicates(subset='content')

## Combine audit data to export

In [24]:
pos_audit_df = merged.loc[merged.article_id.isin(remaining), ['article_id', 'content', 'relevant']].drop_duplicates(subset='content')
pos_audit_df

Unnamed: 0,article_id,content,relevant
25391,31316,A prosecutor is denying accusations that detec...,1
25843,30133,LAPLACE - A sheriff&#x27;s deputy shot and wou...,1
26569,27487,Iberia Crime Stoppers has awarded two officers...,1
26689,27087,Two years ago a jury found that a former Louis...,1
27073,25908,A Louisiana State Police trooper’s behavior ar...,1
...,...,...,...
34514,4375,Orleans Parish Sheriff Marlin Gusman — who was...,1
34547,4269,The chief executive officer of Mary Bird Perki...,1
34560,4227,"SHREVEPORT, La. – It’s not just pit bulls that...",1
34590,4094,"\n(lailluminator.com) – Marcus Jones, the univ...",1


In [25]:
neg_audit_df

Unnamed: 0,article_id,content,relevant
25395,31308,(The Center Square) – Members of the U.S. mili...,0
25403,31266,(The Center Square) — Motorists on the Atchafa...,0
25461,31175,"\nBATON ROUGE, La. (BRPROUD) -- A Louisiana St...",0
25463,31170,A 54-year-old New Iberia man was arrested on F...,0
25481,31138,"\nBATON ROUGE, La. (BRPROUD) -- Monday marks t...",0
...,...,...,...
36009,653,After a monthslong investigation into alleged ...,0
36020,584,"When a tropical storm enters the gulf, Volunte...",0
36022,513,A Baton Rouge man accused of stabbing his gran...,0
36040,413,The Innocence Project New Orleans has filed ba...,0


In [26]:
with pd.ExcelWriter('../output/to_label.xlsx') as writer:  
    pos_audit_df.to_excel(writer, sheet_name='positive', index=False)
    neg_audit_df.to_excel(writer, sheet_name='negative', index=False)