In [43]:
# dependencies
import argparse
import logging
import yaml
import pandas as pd

In [53]:
# support methods
def read_yaml(filename):
    """Parse a YAML file and return its contents.

    Args:
        filename: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.load`` using ``yaml.SafeLoader``.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # No separate existence check: `open` itself raises when the file is
    # missing, and a bare extra `open()` call would leave a handle unclosed.
    with open(filename, 'r') as f:
        return yaml.load(f, Loader=yaml.SafeLoader)


def labels_from_audit(manual_audit):
    """Convert an audit mapping into 0/1 labels.

    Args:
        manual_audit: Mapping of key -> audit value.

    Returns:
        A new dict with the same keys; the value is 1 when the audit value
        compares equal to ``True`` and 0 otherwise.
    """
    labels = {}
    for key, flag in manual_audit.items():
        # Equality (not truthiness) is intentional: only values equal to
        # True become 1, so e.g. a non-empty string maps to 0.
        labels[key] = 1 if flag == True else 0
    return labels


def prep_df(df):
    """Return a copy of ``df`` whose label column is named ``relevant``.

    The column ``actual_relevancy`` is renamed when present; otherwise
    ``new_label`` is renamed. The input frame is not modified.
    """
    if 'actual_relevancy' in df.columns:
        label_col = 'actual_relevancy'
    else:
        label_col = 'new_label'
    return df.rename(columns={label_col: 'relevant'})


def update_from_dict(raw, p1):
    """Overwrite ``relevant`` in a copy of ``raw`` using an id -> label dict.

    Args:
        raw: DataFrame with an ``article_id`` column.
        p1: Mapping of article id -> label value.

    Returns:
        A copy of ``raw`` in which every row whose ``article_id`` equals a
        key of ``p1`` has ``relevant`` set to that key's value. ``raw``
        itself is left untouched.
    """
    patched = raw.copy()
    for artid, label in p1.items():
        matches = patched.article_id == artid
        patched.loc[matches, 'relevant'] = label
    return patched


def update_from_df(raw, new):
    """Overwrite ``relevant`` in a copy of ``raw`` with labels from ``new``.

    Args:
        raw: DataFrame with ``article_id`` and ``relevant`` columns.
        new: DataFrame with ``article_id`` and ``relevant`` columns holding
            the replacement labels. When an ``article_id`` appears more than
            once, the first occurrence supplies the label.

    Returns:
        A copy of ``raw`` in which every row whose ``article_id`` occurs in
        ``new`` carries that article's label from ``new``. Rows for other
        articles, and ``raw`` itself, are left unchanged.
    """
    updated = raw.copy()
    # One label per article id, looked up by id rather than by row label.
    labels = new.drop_duplicates(subset='article_id').set_index('article_id')['relevant']
    mask = updated['article_id'].isin(labels.index)
    if mask.any():
        # Assign a plain array, not a Series: a Series on the right-hand side
        # of `.loc[...] = ...` is aligned on index labels, and the row labels
        # of `new` have nothing to do with those of `raw`.
        updated.loc[mask, 'relevant'] = updated.loc[mask, 'article_id'].map(labels).to_numpy()
    return updated


def update(l, r):
    """Apply the label updates in ``r`` to ``l`` and return the result.

    Dispatches on the exact type of ``r``: a plain ``dict`` goes to
    ``update_from_dict``; anything else is passed to ``update_from_df``.
    """
    handler = update_from_dict if type(r) is dict else update_from_df
    return handler(l, r)

In [57]:
# setup logging
#get_logging("output/patch_relevant.log")

# arg handling (paths are hard-coded while the argument parser is disabled)
#args = get_args()
raw_f = '../output/merged.parquet'
p1_f = '../hand/review_random.yml'
p2_f = '../hand/review_testdf.csv'
p3_f = '../hand/to_label_ai.xlsx'

# load data
#logger.info('loading data')
raw = pd.read_parquet(raw_f)
p1_dict = labels_from_audit(read_yaml(p1_f))
p2_df = prep_df(pd.read_csv(p2_f, usecols=['article_id', 'actual_relevancy']))
p3_df = prep_df(pd.read_excel(p3_f, usecols=['article_id', 'new_label']))

#logger.info('merging hand-labeled data')
# p3 rows come first, so for an article_id present in both files the p3 row
# is the one that survives de-duplication.
new = pd.concat([p3_df, p2_df]).drop_duplicates(subset='article_id')

# article ids labelled relevant (== 1) in each hand-labelled source and in the merge
p2_true, p3_true, new_true = (
    set(frame.loc[(frame.relevant == 1), 'article_id'].unique())
    for frame in (p2_df, p3_df, new)
)

#logger.info('verifying no true records lost from merge')
# every id marked relevant in either source must still be relevant after merging
assert p2_true <= new_true and p3_true <= new_true
#logger.info('all true records present after merge')

#logger.info('implementing updated records')
# apply the YAML audit labels first, then the merged spreadsheet labels
out = update(raw, p1_dict)
out = update(out, new)
#out.to_parquet(args.input)
#logger.info(f'updated records and saved to {args.input}')
out

Unnamed: 0,article_id,matchedsentence_id,source_id,author,title,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
0,31383,,4,Keisha Swafford,How to Cope with Diabetes,,Living with diabetes is a lifetime challenge f...,,,0,0.0,0,0
1,31381,,2,Charles Salzer,Live Oak softball enters playoffs 'being able ...,,As the Live Oak softball team begins postseaso...,,,0,0.0,0,0
2,31380,,2,Community News Report,North Oaks Sports Medicine selects student-ath...,,North Oaks Sports Medicine certified athletic ...,,,0,0.0,0,0
3,31378,,2,Community News Report,Love the Boot cleanup event scheduled in Hammo...,,The Great American Cleanup Love the Boot event...,,,0,0.0,0,0
4,31377,,2,Community News Report,Gordon McKernan hosts Its annual Gordon’s Grad...,,Gordon McKernan Injury Attorneys has launched ...,,,0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36105,373,9.0,1,Marta Jewson,"Masks required, weekly COVID-19 tests and vacc...",Chief Operations Officer Tiffany Delcour estim...,"At a press conference Wednesday, NOLA Public S...",,{'officer'},1,0.0,0,0
36106,366,3.0,1,Carly Berlin,"As initial FEMA deadline approaches, some appl...",At a Monday press conference with New Orlean...,"UPDATE: After this story was published, FEMA e...",,{'officer'},1,0.0,0,0
36107,365,2.0,1,Marta Jewson,Some New Orleans schools will start back virtu...,"Before students return to any campus, NOLA P...",Outside Frederick Douglass High School Thursda...,,{'officer'},1,0.0,0,0
36108,362,1.0,1,Marta Jewson,More than one-third of Louisiana students rema...,"Damage report As of Tuesday, roughly half of t...","About 250,000 Louisiana students remain out of...",,{'officer'},1,0.0,0,0


In [48]:
#p1_ids = set(p1_df.article_id.unique())
#p2_ids = set(p2_df.article_id.unique())
#
#both = p1_ids.intersection(p2_ids)
#
#for artid in list(both):
#    l = p1_df.loc[p1_df.article_id==artid, 'relevant'].values
#    r = p2_df.loc[p2_df.article_id==artid, 'relevant'].values
#    if l != r:
#        print(artid)
#        print(f'p1:\t{l},\tp2:\t{r}')
#        print(raw.loc[raw.article_id==artid, 'content'].unique().tolist())
#        print()

### Most popular words

In [20]:
# NOTE(review): `merged` is not defined in any visible cell — presumably the
# frame read from merged.parquet; confirm before relying on Run All.
# Count space-separated tokens across the distinct `content` values of rows
# labelled relevant, then display them from most to least frequent.
relevant = merged.loc[merged.relevant == 1, 'content'].unique().tolist()
tokens = {}
for content in relevant:
    chunks = content.split(' ')
    for chunk in chunks:
        tokens[chunk] = tokens.get(chunk, 0) + 1
dict(sorted(tokens.items(), key=lambda item: item[1], reverse=True))

{'the': 5437,
 'to': 2918,
 'of': 2549,
 'and': 2493,
 'a': 2468,
 'in': 2043,
 'was': 1240,
 'that': 1086,
 'for': 986,
 'on': 934,
 'with': 841,
 'he': 747,
 'is': 706,
 'said': 655,
 'his': 639,
 'The': 639,
 'at': 633,
 'as': 525,
 'an': 468,
 'from': 449,
 'has': 438,
 'not': 437,
 'by': 433,
 'who': 412,
 'be': 380,
 'Police': 374,
 'have': 366,
 'are': 365,
 'said.': 331,
 '': 328,
 'police': 316,
 'will': 306,
 'were': 300,
 'after': 299,
 'her': 282,
 'been': 275,
 'or': 272,
 'had': 269,
 'it': 267,
 'she': 252,
 'they': 252,
 'this': 249,
 'their': 225,
 'but': 224,
 'I': 222,
 'when': 213,
 'He': 212,
 'which': 198,
 'about': 190,
 'into': 188,
 'two': 188,
 'State': 183,
 'Louisiana': 181,
 'him': 177,
 'Parish': 175,
 'also': 174,
 'officers': 173,
 'would': 169,
 'one': 168,
 'people': 168,
 'more': 167,
 'we': 157,
 'A': 154,
 'all': 152,
 'out': 149,
 'during': 148,
 '—': 147,
 'Lafayette': 145,
 'New': 143,
 'no': 140,
 'over': 140,
 'up': 131,
 'you': 129,
 'because'

In [21]:
# Group article ids by the combination of target keywords found in their text.
targets = ['force', 'complaint', 'misconduct', 'harassment', 'allegedly', 'accused']
target_ids = {}
for tup in merged.itertuples():
    if tup.content is None:
        continue
    # NOTE(review): this pre-filter is a case-sensitive substring test on the
    # raw text, whereas the keyword test below runs on lower-cased,
    # space-separated tokens — confirm the difference is intended.
    if not ('police' in tup.content or 'officer' in tup.content):
        continue
    chunks = tup.content.lower().split(' ')
    found = {target for target in targets if target in chunks}
    if found:
        # keyed by the string form of the matched keyword set
        target_ids.setdefault(str(found), []).append(tup.article_id)

In [22]:
# Number of collected article ids per keyword combination, shown largest first.
target_counts = {combo: len(ids) for combo, ids in target_ids.items()}
dict(sorted(target_counts.items(), key=lambda pair: pair[1], reverse=True))

{"{'accused'}": 549,
 "{'force'}": 514,
 "{'allegedly'}": 473,
 "{'complaint'}": 181,
 "{'allegedly', 'accused'}": 139,
 "{'allegedly', 'force'}": 93,
 "{'harassment'}": 56,
 "{'allegedly', 'force', 'accused'}": 35,
 "{'force', 'accused'}": 32,
 "{'allegedly', 'complaint', 'accused'}": 27,
 "{'misconduct', 'harassment', 'complaint', 'accused'}": 27,
 "{'harassment', 'complaint'}": 23,
 "{'misconduct', 'allegedly', 'force', 'accused'}": 23,
 "{'misconduct', 'accused'}": 20,
 "{'misconduct', 'allegedly'}": 20,
 "{'complaint', 'accused'}": 19,
 "{'misconduct', 'force'}": 19,
 "{'misconduct', 'force', 'accused'}": 19,
 "{'misconduct', 'complaint', 'force'}": 18,
 "{'allegedly', 'harassment'}": 17,
 "{'misconduct', 'allegedly', 'accused'}": 17,
 "{'misconduct', 'harassment', 'complaint'}": 15,
 "{'harassment', 'complaint', 'accused'}": 15,
 "{'misconduct', 'complaint'}": 14,
 "{'misconduct'}": 14,
 "{'misconduct', 'allegedly', 'complaint', 'accused'}": 11,
 "{'harassment', 'complaint', 'all