In [None]:
# dependencies
import argparse
import hashlib
import logging
from math import ceil

import pandas as pd
from numpy import nan as nan
from sklearn.model_selection import train_test_split

### id tracking
| df | article_id | matchedsentence_id | officer_id |
|:--- | ---: | ---: | :---:|
| text_df | text_df.id|X|X|
| sen_df  | sen_df.article_id|sen_df.id|X|
| true_df | X |true_df.matchedsentence_id|true_df.id|

In [None]:
# support methods for notebook version
def get_unique_report(df):
    """Print, for every column of ``df``, how many distinct values it holds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Notes
    -----
    Missing values count as one distinct value because ``Series.unique()``
    keeps NaN. Each line is laid out by ``pretty_str`` (the formatter defined
    in this notebook); the previous call to ``pretty_print`` referred to a
    name that is not defined anywhere in the file.
    """
    print('             distinct value count by col')
    print('=======================================================')
    for col in df.columns:
        # pretty_str only builds the string, so it has to be printed here
        print(pretty_str(col, len(df[col].unique())))

In [None]:
def get_args(argv=None):
    """Parse the script's command-line options.

    Parameters
    ----------
    argv : list of str, optional
        Argument vector to parse. ``None`` (the default) keeps the original
        behaviour of reading ``sys.argv[1:]``. Passing an explicit list lets
        the parser be used from a notebook kernel, whose own ``sys.argv``
        contains arguments this parser does not know.

    Returns
    -------
    argparse.Namespace
        With attributes ``included``, ``true``, ``text`` and ``output``; each
        is ``None`` when the corresponding flag is not given.
    """
    parser = argparse.ArgumentParser()
    for flag in ("--included", "--true", "--text", "--output"):
        parser.add_argument(flag)
    return parser.parse_args(argv)


def get_logging(logname):
    """Configure root logging to write to both ``logname`` and the console.

    Parameters
    ----------
    logname : str or path-like
        File that receives the log records (opened by ``logging.FileHandler``).

    Returns
    -------
    logging.Logger
        The root logger, so callers can log through the returned object.

    Notes
    -----
    ``logging.basicConfig`` does nothing when the root logger already has
    handlers, so calling this a second time in the same process will not
    replace an existing configuration.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)s %(message)s',
        handlers=[logging.FileHandler(logname), logging.StreamHandler()],
    )
    return logging.getLogger()


def open_gz(f):
    """Read the gzip-compressed CSV at ``f`` and return it as a DataFrame."""
    frame = pd.read_csv(f, compression='gzip')
    return frame


def pretty_str(label, a, b=False, newline=False):
    """Format a report line: a 50-wide label followed by one or two values.

    Parameters
    ----------
    label : str
        Text for the first column (padded to 50 characters).
    a : object
        First value. Printed as-is in the two-column layout; padded to width
        10 in the three-column layout.
    b : object, optional
        Second value, padded to width 10. Leave at the default ``False`` (or
        pass ``None``) for the two-column layout.
    newline : bool, optional
        Append a trailing ``'\n'`` to the line.

    Returns
    -------
    str

    Notes
    -----
    Whether ``b`` was supplied is decided by identity with the default, not
    by truthiness, so a real value of ``0`` is printed instead of being
    silently dropped.
    """
    has_b = b is not False and b is not None
    if has_b:
        line = '{:50}{:10}{:10}'.format(label, a, b)
    else:
        line = '{:50}{}'.format(label, a)
    return line + '\n' if newline else line


def check_asserts(text_df, sen_df, true_df):
    """Fail fast if the three raw extracts differ from the snapshot this notebook expects.

    The expected shapes, column names and id counts are hard-coded, so this
    check is tied to one specific data export.

    Parameters
    ----------
    text_df : pandas.DataFrame
        Frame checked against the 12-column article layout.
    sen_df : pandas.DataFrame
        Frame checked against the 7-column matched-sentence layout.
    true_df : pandas.DataFrame
        Frame checked against the 3-column sentence/officer layout.

    Raises
    ------
    AssertionError
        On the first expectation that does not hold.
    """
    text_cols = ['created_at', 'link', 'guid', 'source_id',
                 'updated_at', 'content', 'published_date', 'id',
                 'title', 'is_processed', 'author', 'url']
    sen_cols = ['id', 'created_at', 'updated_at', 'article_id',
                'extracted_keywords', 'text', 'title']
    true_cols = ['id', 'matchedsentence_id', 'officer_id']

    # --- shapes and column layout ---
    assert text_df.shape == (29707, 12), 'unexpected text_df shape'
    assert sen_df.shape == (10470, 7), 'unexpected sen_df shape'
    assert true_df.shape == (735, 3), 'unexpected true_df shape'
    assert list(text_df.columns) == text_cols, 'unexpected text_df columns'
    assert list(sen_df.columns) == sen_cols, 'unexpected sen_df columns'
    assert list(true_df.columns) == true_cols, 'unexpected true_df columns'

    # --- relative sizes of the three `id` columns ---
    most = set(text_df.id.unique())
    mid = set(sen_df.id.unique())
    least = set(true_df.id.unique())
    assert len(least) < len(mid) < len(most)
    assert len(most.intersection(mid)) == 9891
    assert all(true_df.id == true_df.officer_id)   # what does it mean that this is true? will it always?

    # every (id, matchedsentence_id) pair in true_df occurs exactly once
    pairs = set(zip(true_df.id, true_df.matchedsentence_id))
    assert len(pairs) == true_df.shape[0], 'duplicate (id, matchedsentence_id) pair'

    # --- every article referenced by a matched sentence exists in text_df ---
    articles = most
    matched = set(sen_df.article_id.unique())
    assert len(matched) < len(articles)
    assert len(articles) == 29707
    assert len(matched) == 4323
    # one hashed subset test instead of scanning the article array once per id
    assert matched.issubset(articles), 'sen_df.article_id not found in text_df.id'

    # --- every sentence referenced by true_df exists in sen_df ---
    matched_sen = mid
    true_match_sen = set(true_df.matchedsentence_id.unique())
    assert len(true_match_sen) < len(matched_sen)
    assert len(matched_sen) == 10470
    assert len(true_match_sen) == 479
    assert true_match_sen.issubset(matched_sen), 'true_df.matchedsentence_id not found in sen_df.id'


def format_extracted_str(list_str):
    """Normalise a serialized keyword list into a canonical set-style string.

    The input is expected to look like ``'["Police", "officer"]'``. Brackets
    and double quotes are removed, the text is lower-cased and split on
    commas, and each keyword is stripped of surrounding whitespace.

    Parameters
    ----------
    list_str : str or None or NaN
        Raw cell value.

    Returns
    -------
    str or None
        A string shaped like ``"{'officer', 'police'}"`` with the keywords in
        sorted order, or ``None`` when the input is ``None`` or NaN.

    Notes
    -----
    * Keywords are sorted so the same keyword combination always maps to the
      same string. Converting a ``set`` straight to ``str`` depends on hash
      ordering, which can differ between runs.
    * NaN is what pandas stores for an empty CSV cell; it is a float and has
      no ``.replace`` method, so it is handled before any string work.
    """
    if list_str is None or pd.isna(list_str):
        return None
    clean = list_str.replace('[', '').replace(']', '').replace('"', '').lower()
    keywords = sorted({val.strip() for val in clean.split(',')})
    return '{' + ', '.join(repr(val) for val in keywords) + '}'


def prep_dfs(text_df, sen_df, true_df):
    """Trim the three raw frames to the columns used downstream and tag them.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(less_text, less_sen, less_true)`` where

        * ``less_text`` holds ``article_id`` (renamed from ``id``),
          ``source_id``, ``author``, ``title`` and ``content``;
        * ``less_sen`` holds ``matchedsentence_id`` (renamed from ``id``),
          ``article_id``, ``text``, the normalised ``extracted_keywords`` and
          a constant ``kw_match`` column of 1;
        * ``less_true`` holds ``officer_id``, ``matchedsentence_id`` and a
          constant ``relevant`` column of 1.
    """
    article_cols = ['id', 'source_id', 'author', 'title', 'content']
    less_text = text_df.loc[:, article_cols].rename(columns={'id': 'article_id'})

    sentence_cols = ['id', 'article_id', 'text']
    less_sen = sen_df.loc[:, sentence_cols].rename(columns={'id': 'matchedsentence_id'})
    less_sen['extracted_keywords'] = sen_df.extracted_keywords.apply(format_extracted_str)
    # every row of the sentence table is, by construction, a keyword match
    less_sen['kw_match'] = [1] * less_sen.shape[0]

    less_true = true_df.loc[:, ['officer_id', 'matchedsentence_id']]
    # every row of the officer table marks its sentence as relevant
    less_true['relevant'] = [1] * less_true.shape[0]
    return less_text, less_sen, less_true


def merge_dfs(less_text, less_sen, less_true):
    """Outer-join articles, matched sentences and officer matches into one frame.

    Parameters
    ----------
    less_text : pandas.DataFrame
        Must contain ``article_id``.
    less_sen : pandas.DataFrame
        Must contain ``article_id``, ``matchedsentence_id`` and ``kw_match``.
    less_true : pandas.DataFrame
        Must contain ``matchedsentence_id`` and ``relevant``.

    Returns
    -------
    pandas.DataFrame
        Articles joined to sentences on ``article_id``, then to officer rows
        on ``matchedsentence_id``. ``kw_match`` and ``relevant`` are integer
        columns in which rows without a sentence / officer match hold 0.
    """
    articles = less_text.set_index('article_id')
    sentences = less_sen.set_index('article_id')
    out = (articles.join(sentences, on='article_id', how='outer')
                   .reset_index()
                   .set_index('matchedsentence_id'))
    officers = less_true.set_index('matchedsentence_id')
    out = out.join(officers, on='matchedsentence_id', how='outer').reset_index()
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # `out.kw_match`: that form mutates a temporary Series and leaves the frame
    # untouched under pandas copy-on-write, after which astype(int) fails on NaN.
    for flag in ('kw_match', 'relevant'):
        out[flag] = out[flag].fillna(0).astype(int)
    return out


# this method is called when relevant_articles and irrelevant_articles are not disjoint sets
# resolves conflict by upgrading all occurrences of an article_id in relevant_articles to relevant
# ie. a sentence from an article is matched to an officer and appears in matchedsentence_officers data
#     a different sentence from same article is not matched and deemed irrelevant, appears in matchedsentence data
#     conflict occurs when datasets merged
def correct_relevant(df):
    """Return a copy of ``df`` where ``relevant`` is consistent per article.

    Any article that has at least one row with ``relevant == 1`` gets
    ``relevant = 1`` on all of its rows. The input frame is not modified.
    """
    fixed = df.copy()
    flagged_rows = fixed.relevant == 1
    relevant_ids = fixed.loc[flagged_rows, 'article_id'].unique().tolist()
    same_article = fixed.article_id.isin(relevant_ids)
    fixed.loc[same_article, 'relevant'] = 1
    return fixed


# This method builds the POSITIVE cases: keyword matched AND article relevant (per Rajiv)
def prep_pos_train_test(df, train_perc=0.80, test_perc=0.20, random_state=None):
    """Split the POSITIVE article ids into train and test lists.

    An article is positive when at least one of its rows has a non-null
    ``officer_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``officer_id`` and ``article_id``.
    train_perc, test_perc : float
        Passed to ``train_test_split`` as ``train_size`` / ``test_size``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the original unseeded shuffle; pass an int for a reproducible split.

    Returns
    -------
    (list, list)
        Disjoint ``train_list`` and ``test_list`` of article ids.
    """
    has_officer = df.officer_id.notnull()
    possible = df.loc[has_officer, 'article_id'].unique().tolist()
    train_list, test_list = train_test_split(
        possible,
        test_size=test_perc,
        train_size=train_perc,
        shuffle=True,
        random_state=random_state,
    )
    assert set(train_list).isdisjoint(test_list)
    return train_list, test_list


# This method builds the NEGATIVE cases: keyword matched but not relevant
def prep_neg_train_test(df, pos_rate, curr_train_n, curr_test_n, random_state=None):
    """Split the NEGATIVE article ids (keyword matched, no officer match).

    Enough negatives are drawn that positives make up ``pos_rate`` of each
    split, given the number of positives already chosen.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``kw_match``, ``officer_id`` and ``article_id``.
    pos_rate : float
        Target share of positives, in ``(0, 0.5]``.
    curr_train_n, curr_test_n : int
        Number of positive articles already in train / test.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the original unseeded shuffle.

    Returns
    -------
    (list, list)
        Disjoint ``train_list`` and ``test_list`` of article ids.

    Notes
    -----
    Eligibility is decided per ARTICLE, not per row. An article can have one
    sentence linked to an officer and another that is not; selecting rows with
    a null ``officer_id`` alone would also offer that article as a negative,
    so it could be drawn as both positive and negative and land in both train
    and test. Any article with at least one officer-linked row is therefore
    excluded from the negative pool.
    """
    assert 0 < pos_rate <= 0.5
    target_train = ceil(curr_train_n/pos_rate)
    target_test = ceil(curr_test_n/pos_rate)
    needed_train = target_train - curr_train_n
    needed_test = target_test - curr_test_n

    positive_ids = set(df.loc[df.officer_id.notnull(), 'article_id'].unique())
    kw_matched_ids = df.loc[df.kw_match == 1, 'article_id'].unique().tolist()
    possible = [article for article in kw_matched_ids if article not in positive_ids]

    assert needed_train + needed_test <= len(possible)
    train_list, test_list = train_test_split(
        possible,
        test_size=needed_test,
        train_size=needed_train,
        shuffle=True,
        random_state=random_state,
    )
    assert set(train_list).isdisjoint(test_list)
    return train_list, test_list


def make_train_test_cols(df, pos_rate):
    """Return a copy of ``df`` with 0/1 ``train`` and ``test`` indicator columns.

    Positive and negative article ids are drawn by ``prep_pos_train_test`` and
    ``prep_neg_train_test``; every row whose ``article_id`` was drawn for a
    split gets a 1 in the matching column.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged frame; must contain every column listed in the return below
        except ``train`` and ``test``.
    pos_rate : float
        Target share of positives, forwarded to ``prep_neg_train_test``.

    Returns
    -------
    pandas.DataFrame
        The columns, in order: article_id, matchedsentence_id, source_id,
        author, title, text, content, officer_id, extracted_keywords,
        kw_match, relevant, train, test.
    """
    copy = df.copy()
    # get pos/neg and train/test id lists
    pos_train_idx, pos_test_idx = prep_pos_train_test(copy)
    neg_train_idx, neg_test_idx = prep_neg_train_test(copy, pos_rate, len(pos_train_idx), len(pos_test_idx))
    train_idx = pos_train_idx + neg_train_idx
    test_idx = pos_test_idx + neg_test_idx
    # isin hashes the id list once, instead of scanning a Python list per row
    copy['train'] = copy.article_id.isin(train_idx).astype(int)
    copy['test'] = copy.article_id.isin(test_idx).astype(int)
    return copy[['article_id', 'matchedsentence_id', 'source_id', 'author', 'title', 'text',
                 'content', 'officer_id', 'extracted_keywords', 'kw_match', 'relevant', 'train', 'test']]


def make_train_test_df(df):
    """Return one row per article that was assigned to train or test.

    Keeps the columns ``article_id``, ``content``, ``relevant`` and ``test``,
    and the first row encountered for each ``article_id``.
    """
    in_split = (df.train == 1) | (df.test == 1)
    keep_cols = ['article_id', 'content', 'relevant', 'test']
    return df.loc[in_split, keep_cols].drop_duplicates(subset='article_id')


# Since out.kw_match = out.relevant_count + out.irrelevant_count, and relevant can't be true without kw_match,
# (out.relevant_count) / (out.kw_match) should be the proportion of relevant samples given kw_match for col value
def make_report(df, col):
    """Count keyword-matched and relevant rows per distinct value of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``kw_match``, ``relevant`` and ``col``.
    col : str
        Column to group by (e.g. ``'extracted_keywords'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``[col, 'kw_match', 'relevant_count', 'relevant_perc']`` with
        one row per value of ``col`` seen among keyword-matched or relevant
        rows. ``relevant_perc`` is ``relevant_count / kw_match`` rounded to 3
        places; a value with ``kw_match == 0`` yields inf or NaN there.

    Notes
    -----
    * Rows follow ``value_counts`` order (keyword-matched values first), so
      the result is reproducible rather than following set iteration order.
    * Rows with a missing ``col`` value are ignored, as ``value_counts``
      drops NaN.
    * An input with no matching rows gives an empty frame with the same four
      columns instead of raising.
    """
    kw_match_vc = df.loc[df.kw_match == 1, col].value_counts().to_dict()
    relevant_vc = df.loc[df.relevant == 1, col].value_counts().to_dict()
    # dict.fromkeys de-duplicates while keeping first-seen order
    keys = list(dict.fromkeys(list(kw_match_vc) + list(relevant_vc)))
    rows = [
        {col: key,
         'kw_match': kw_match_vc.get(key, 0),
         'relevant_count': relevant_vc.get(key, 0)}
        for key in keys
    ]
    out = pd.DataFrame(rows, columns=[col, 'kw_match', 'relevant_count'])
    # explicit integer dtype so the empty case can still be divided and rounded
    out = out.astype({'kw_match': int, 'relevant_count': int})
    out['relevant_perc'] = (out.relevant_count / out.kw_match).round(3)
    return out


def make_final_logs(text_df, sen_df, true_df, train_test_df, merged):
    """Print the closing id summary for the run and return 1.

    The first two lines print a literal ``True``: they restate facts that
    ``check_asserts`` verified earlier rather than re-checking them here.
    ``merged`` is accepted but not read.
    """
    def emit(label, value, **layout):
        # one summary line, laid out by pretty_str
        print(pretty_str(label, value, **layout))

    def distinct(series):
        # number of distinct values, counting NaN as a value
        return len(series.unique())

    print('I/O id summary')
    print('=======================================================================')
    emit('all kw_match articles in raw data:', True)      # asserted by check_asserts()
    emit('all matchedsentences in kw_match data:', True)
    emit('unique articles:', distinct(text_df.id))
    emit('unique articles w/ kw match:', distinct(sen_df.article_id))
    emit('unique matched sentences:', distinct(sen_df.id))
    emit('unique matched sentences relevant:', distinct(true_df.matchedsentence_id))
    emit('unique matched officers relevant:', distinct(true_df.id))
    emit('unique articles in train_test:', distinct(train_test_df.article_id), newline=True)
    return 1

In [None]:
# NEED TO OUTPUT:
# 1. article_id (given in data)
# 2. article text (given in data)
# 3. relevant (if article in true_df)
# 4. test (if article is reserved for testing model)     Per TS: 500 train, 100 test for initial train
# (may add cols like author or title)

# CONSIDERING
# correct_kw_match? does article_id conflict also occur with matchedsentence_id?

In [None]:
# Make pandas raise (instead of only warning) on chained assignment, so an
# accidental write to a temporary copy stops the run.
pd.set_option('mode.chained_assignment', 'raise')
# __main__

# newsarticle: initial dataset
#    - has all the data related to the article as it was pulled into feed
# matchedsentence: initial keyword filter
#    - has all the data related to every article with at least one sentence matching a keyword
# matchedsentence_officers: manual filter (Rajiv)
#    - has select columns linking identified officer badges and articles confirmed relevant by Rajiv
# NOTE: If an article is not in the manual filter set, it is not relevant
news_text_f = '../input/news_articles_newsarticle.csv.gz'
news_included_f = '../input/news_articles_matchedsentence.csv.gz'
news_true_f = '../input/news_articles_matchedsentence_officers.csv.gz'

# load the three gzip CSV extracts and verify them against the expected snapshot
text_df = open_gz(news_text_f)
sen_df = open_gz(news_included_f)
true_df = open_gz(news_true_f)
check_asserts(text_df, sen_df, true_df)

# trim each frame to the columns of interest, then join them into one frame
less_text, less_sen, less_true = prep_dfs(text_df, sen_df, true_df)
merged = merge_dfs(less_text, less_sen, less_true)
# report lost columns
print()
print('columns ignored by import')
print('=======================================================================')
all_cols = set(text_df.columns.tolist() + sen_df.columns.tolist() + true_df.columns.tolist())
kept = set(merged.columns)
not_kept = all_cols.difference(kept)
# 'id' never survives under that name (prep_dfs renames it for text/sentence
# frames and does not select it for the officer frame), so it is not "ignored"
not_kept.remove('id')
print(str(not_kept)+'\n')

# make sure every article_id has a corresponding 'relevant' value
all_ids = set(merged.article_id.unique())
relevant_articles = set(merged.loc[(merged.relevant == 1)].article_id.unique())
irrelevant_articles = set(merged.loc[(merged.relevant == 0)].article_id.unique())
rel_vals = relevant_articles.union(irrelevant_articles)
assert all_ids.difference(rel_vals) == set()
# an article appearing in both sets has rows with conflicting 'relevant' values
overlap = relevant_articles.intersection(irrelevant_articles)
print('relevant && irrelevant articles check')
print('=======================================================================')
print(pretty_str('unique relevant articles:', len(relevant_articles)))
print(pretty_str('unique irrelevant articles:', len(irrelevant_articles)))
# check for conflicting 'relevant' values, correct if present
print(pretty_str('relevant and irrelevant disjoint:', overlap == set()))
if overlap != set():
    print(pretty_str('size of overlap:', len(overlap)))
    # correct_relevant returns a corrected copy; rebind `merged` to it
    temp = merged
    merged = correct_relevant(temp)
    print(pretty_str('amended relevant column:', True, newline=True))

# Per TS, starting train/test size should be roughly 500/100
# ASSUMES initial balance of 50/50 positive/negative
# proceed with generating training data
# NOTE: `merged` is rebound here to the frame that also carries train/test columns
merged = make_train_test_cols(merged, pos_rate=0.5)
train_test_df = make_train_test_df(merged)
print('train, test summary')
print('=======================================================================')
# train_test_df only holds rows flagged train or test, so test == 0 selects the train rows
train = train_test_df.loc[train_test_df.test == 0, ['article_id', 'content', 'relevant']]
test = train_test_df.loc[train_test_df.test == 1, ['article_id', 'content', 'relevant']]
train_n = train.article_id.count()
test_n = test.article_id.count()
# the two subsets must account for every row of train_test_df
assert train_n + test_n == train_test_df.shape[0]
train_pos = train.loc[train.relevant == 1].article_id.count()
test_pos = test.loc[test.relevant == 1].article_id.count()
train_neg = train_n - train_pos
test_neg = test_n - test_pos
print(f'{train_n} datapoints available for training with balance: {train_pos}/{train_neg} (pos/neg)')
print(f'{test_n} datapoints available for testing with balance: {test_pos}/{test_neg} (pos/neg)\n')
print('train info')
print('=======================================================================')
train.info()
print('\ntest info')
print('=======================================================================')
test.info()
print()

# building a subset of merged to use as pre-training data: one row per article,
# shuffled (nothing is written to disk here; the to_parquet call below is commented out)
temp = merged.loc[:, ['article_id', 'title', 'content']].drop_duplicates(subset='article_id')
assert len(temp.article_id.unique()) == temp.shape[0]
# sample(n = all rows) returns every row in random order; no seed is set
news = temp.sample(temp.shape[0]).reset_index(drop=True)
#merged.to_parquet('../output/merged.parquet')

# make_final_logs prints the id summary and returns 1, so this assert always passes
assert make_final_logs(text_df, sen_df, true_df, train_test_df, merged)

# generate keyword report (source_id, author also helpful reports)
kw_report = make_report(merged, 'extracted_keywords')
print('keyword report')
print('=======================================================================')
# header row for the columns printed below
print('{:50}{:15}{:10}'.format('extracted_keywords', 'relevant_perc', 'kw_match'))
# highest share of relevant rows first
sorted_kw_report = kw_report[['extracted_keywords', 'relevant_perc', 'kw_match']].sort_values(by='relevant_perc', ascending=False)
for tup in sorted_kw_report.itertuples():
    print(pretty_str(tup.extracted_keywords+':', tup.relevant_perc, b=tup.kw_match))

In [None]:
# Estimated true pos_rate: relevant rows as a share of keyword-matched rows
# (both are row counts of `merged`, not counts of unique articles)
kw_match_n = merged.loc[merged.kw_match == 1, 'article_id'].count()
relevant_n = merged.loc[merged.relevant == 1, 'article_id'].count()
round(relevant_n / kw_match_n, 2)

In [None]:
# Are there any articles that contain a matched sentence but kw_match isn't True?
# An empty set below means every article is consistently matched or unmatched.
match = set(merged.loc[merged.kw_match == 1, 'article_id'].unique())
no_match = set(merged.loc[merged.kw_match == 0, 'article_id'].unique())
match & no_match

In [None]:
# outgoing train_test asserts:
#    1. No article_id or content is missing from either train or test sets
#    2. No article_id or content appears in both train AND test sets
#    3. Every article_id has a relevant, test value
#    - Could assert pos_rate with margin of error +- 0.1

# outgoing rule 1
# (0, 4): zero offending rows across the four columns of train_test_df
assert train_test_df.loc[train_test_df.article_id.isnull()].shape == (0,4)
assert train_test_df.loc[train_test_df.content.isnull()].shape == (0,4)

# outgoing rule 2
train_articles = set(train_test_df.loc[train_test_df.test == 0].article_id.unique())
test_articles = set(train_test_df.loc[train_test_df.test == 1].article_id.unique())
assert train_articles.isdisjoint(test_articles)
train_contents = set(train_test_df.loc[train_test_df.test == 0].content.unique())
test_contents = set(train_test_df.loc[train_test_df.test == 1].content.unique())
# content overlap is only printed, not asserted
print(train_contents.isdisjoint(test_contents))

# caveat to rule 2: duplicate content -- article_ids are asserted unique below,
# but the number of distinct contents is only printed for comparison with the row count
article_ids = set(train_test_df.article_id.unique())
contents = set(train_test_df.content.unique())
print(train_test_df.shape)
assert len(article_ids) == train_test_df.shape[0]
print(len(contents))

# outgoing rule 3
assert train_test_df.loc[train_test_df.relevant.isnull()].shape == (0,4)
assert train_test_df.loc[train_test_df.test.isnull()].shape == (0,4)

In [None]:
# Duplicate-content check on train_test_df (the caveat to outgoing rule 2):
# why is unique content < number of rows?
repeated_rows = train_test_df.duplicated(subset='content')
dup_content = train_test_df.loc[repeated_rows, 'content'].tolist()
if dup_content:
    shares_content = train_test_df.content.isin(dup_content)
    dup_content_df = train_test_df.loc[shares_content]
    print('duplicated content shape:', dup_content_df.shape)
    dup_content_ids = dup_content_df.article_id.unique().tolist()
    print('article_ids in train_test_df implicated by dup_content:', len(dup_content_ids))
    implicated = merged.article_id.isin(dup_content_ids)
    all_dup_content_ids = merged.loc[implicated, 'article_id'].unique().tolist()
    print('article_ids in merged implicated by dup_content:', len(all_dup_content_ids))

In [None]:
# Same duplicate-content check, this time over the full merged frame:
# why is unique content < number of rows?
repeated_rows = merged.duplicated(subset='content')
dup_content = merged.loc[repeated_rows, 'content'].tolist()
if dup_content:
    shares_content = merged.content.isin(dup_content)
    dup_content_df = merged.loc[shares_content]
    print('duplicated content shape:', dup_content_df.shape)
    dup_content_ids = dup_content_df.article_id.unique().tolist()
    print('article_ids in merged implicated by dup_content:', len(dup_content_ids))

In [None]:
# spot-check: show the content stored for one hard-coded article_id (20541)
news.loc[news.article_id == 20541].content.values

In [None]:
# MOVED TO OWN SCRIPT
# Rule 1 violation: Why is unique content < number of rows
#dup_title = merged.loc[merged.duplicated(subset='title')].title.values.tolist()
#if dup_title != []:
#    dup_title_df = merged.loc[merged.title.isin(dup_title)]
#    print('duplicated title shape:', dup_title_df.shape)
#    dup_title_ids = dup_title_df.article_id.unique().tolist()
#    print('article_ids in merged implicated by dup_title:', len(dup_title_ids))
#    dup_title_ids = dup_title_df.loc[dup_title_df.content].article_id.unique().tolist()

In [None]:
# Are matchedsentence_ids unique by text, or by text in unique article?
repeated_text = merged.duplicated(subset='text')
dup_text = merged.loc[repeated_text, 'text'].unique().tolist()
dup_text_df = merged.loc[merged.text.isin(dup_text)]

# text -> the distinct article ids and sentence ids it appears under
dup_text_ids = {}
for tup in dup_text_df.itertuples():
    this_art_id = tup.article_id
    this_mat_id = tup.matchedsentence_id
    if str(tup.text) == 'nan':
        # rows without sentence text are not duplicates of anything
        continue
    seen = dup_text_ids.setdefault(tup.text, {'article_id': [], 'matchedsentence_id': []})
    if this_art_id not in seen['article_id']:
        seen['article_id'].append(this_art_id)
    if this_mat_id not in seen['matchedsentence_id']:
        seen['matchedsentence_id'].append(this_mat_id)

multiple_art = 0
for text, id_dict in dup_text_ids.items():
    if len(id_dict['article_id']) <= 1:
        continue
    multiple_art += 1
    # suspect sentences found in distinct articles are processed as distinct sentences
    # matchedsentence_id only refers to uniqueness within article, not in database
    assert len(id_dict['article_id']) == len(id_dict['matchedsentence_id'])

print('duplicate text shape:\t\t\t', dup_text_df.shape)
print('unique text samples duplicated:\t\t', len(dup_text_ids))
print('articles affected by duplicate text:\t', multiple_art)

# Reviewing output data

In [None]:
# display the train/test frame (one row per article: article_id, content, relevant, test)
train_test_df

In [None]:
# display the merged frame, including the train/test indicator columns
merged

## Reports
- `extracted_keywords` in train/test/rem
- sources in train/test/rem
- authors in train/test/rem

In [None]:
# keyword report, highest share of relevant rows first
kw_report.sort_values(by='relevant_perc', ascending=False)

In [None]:
# plain-text version of the keyword report, limited to keyword and relevant share
print(kw_report[['extracted_keywords', 'relevant_perc']].sort_values(by='relevant_perc', ascending=False))

In [None]:
# Relevance report per source_id. It is built in this cell because no earlier
# cell defines `src_report`, so the display would otherwise raise NameError.
src_report = make_report(merged, 'source_id')
src_report.sort_values(by='relevant_count', ascending=False)

In [None]:
# Relevance report per author. It is built in this cell because no earlier
# cell defines `author_report`, so the display would otherwise raise NameError.
author_report = make_report(merged, 'author')
author_report.sort_values(by='relevant_count', ascending=False)

# Reviewing input data

### `text_df`

In [None]:
# `pretty_print` is not defined in this notebook; pretty_str builds the line and print emits it
print(pretty_str('full text shape:', text_df.shape))
print('full text cols:\n', list(text_df.columns))

In [None]:
# dtypes and non-null counts for the raw article frame
text_df.info()

In [None]:
# distinct-value count for each column of the raw article frame
get_unique_report(text_df)

### `sen_df`

In [None]:
# `pretty_print` is not defined in this notebook; pretty_str builds the line and print emits it
print(pretty_str('matched sen shape:', sen_df.shape))
print('matched sen cols:\n', list(sen_df.columns))

In [None]:
# dtypes and non-null counts for the matched-sentence frame
sen_df.info()

In [None]:
# distinct-value count for each column of the matched-sentence frame
get_unique_report(sen_df)

### `true_df`

In [None]:
# `pretty_print` is not defined in this notebook; pretty_str builds the line and print emits it
print(pretty_str('matched true shape:', true_df.shape))
print('matched true cols:\n', list(true_df.columns))

In [None]:
# dtypes and non-null counts for the sentence/officer match frame
true_df.info()

In [None]:
# distinct-value count for each column of the sentence/officer match frame
get_unique_report(true_df)