In [1]:
# dependencies
from numpy import nan as nan
import pandas as pd
from sklearn.model_selection import train_test_split
from math import ceil

In [48]:
merged.loc[:, ['article_id', 'title', 'content']]

KeyError: "['title'] not in index"

### id tracking
| df | article_id | matchedsentence_id | officer_id |
|:--- | ---: | ---: | :---:|
| text_df | text_df.id|X|X|
| sen_df  | sen_df.article_id|sen_df.id|X|
| true_df | X |true_df.matchedsentence_id|true_df.id|

In [2]:
# support methods for notebook version
def get_unique_report(df):
    cols = list(df.columns)
    print('             distinct value count by col')
    print('=======================================================')
    for col in cols:
        pretty_print(col, len(df[col].unique()))

In [3]:
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--included")
    parser.add_argument("--true")
    parser.add_argument("--text")
    parser.add_argument("--output")
    return parser.parse_args()


def get_logging(logname):
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            handlers=[logging.FileHandler(logname),
                            logging.StreamHandler()])


def open_gz(f):
    return pd.read_csv(f, compression='gzip')


def pretty_str(label, a, b=False, newline=False):
    if newline:
        if not b:
            return '{:50}{}{}'.format(label, a, '\n')
        else:
            return '{:50}{:10}{:10}{}'.format(label, a, b, '\n')
    if b:
        return '{:50}{:10}{:10}'.format(label, a, b)
    return '{:50}{}'.format(label, a)


def check_asserts(text_df, sen_df, true_df):
    assert text_df.shape == (29707, 12)
    assert sen_df.shape == (10470, 7)
    assert true_df.shape == (735, 3)
    assert all(text_df.columns == ['created_at', 'link', 'guid', 'source_id', \
                                   'updated_at', 'content', 'published_date', 'id', \
                                   'title', 'is_processed', 'author', 'url'])
    assert all(sen_df.columns == ['id', 'created_at', 'updated_at', 'article_id', 
                                   'extracted_keywords', 'text', 'title'])
    assert all(true_df.columns == ['id', 'matchedsentence_id', 'officer_id'])
    most = set(text_df.id.unique())
    mid = set(sen_df.id.unique())
    least = set(true_df.id.unique())
    assert len(least) < len(mid) < len(most)
    assert len(most.intersection(mid)) == 9891
    assert all(true_df.id == true_df.officer_id)   # what does it mean that this is true? will it always?
    pairs = set()
    for tup in true_df.itertuples():
        pairs.add((tup.id, tup.matchedsentence_id))
    assert len(pairs) == true_df.shape[0]
    articles = text_df.id.unique()
    matched = sen_df.article_id.unique()
    assert len(matched) < len(articles)
    assert len(articles) == 29707
    assert len(matched) == 4323
    for match in matched:
        assert match in articles
    matched_sen = sen_df.id.unique()
    true_match_sen = true_df.matchedsentence_id.unique()
    true_match_off = true_df.id.unique()
    assert len(true_match_sen) < len(matched_sen)
    assert len(matched_sen) == 10470
    assert len(true_match_sen) == 479
    for match in true_match_sen:
        assert match in matched_sen


def format_extracted_str(list_str):
    if list_str is not None:
        clean = list_str.replace('[', '').replace(']', '').replace('"', '').lower()
        if ',' in clean:
            return str({val for val in clean.split(',')})
        return str({clean})
    return None


def prep_dfs(text_df, sen_df, true_df):
    less_text = text_df.loc[:, ['id', 'source_id', 'author', 'title', 'content']]
    temp = less_text
    less_text = temp.rename(columns={'id':'article_id'})
    less_sen = sen_df.loc[:, ['id', 'article_id', 'text']]
    temp = less_sen
    less_sen = temp.rename(columns={'id':'matchedsentence_id'})
    less_sen['extracted_keywords'] = sen_df.extracted_keywords.apply(format_extracted_str)
    less_sen['kw_match'] = [1 for val in range(less_sen.shape[0])]
    less_true = true_df.loc[:, ['officer_id', 'matchedsentence_id']]
    less_true['relevant'] = [1 for val in range(less_true.shape[0])]
    return less_text, less_sen, less_true


def merge_dfs(less_text, less_sen, less_true):
    less_text = less_text.set_index('article_id')
    less_sen = less_sen.set_index('article_id')
    out = less_text.join(less_sen, on='article_id', how='outer').reset_index().set_index('matchedsentence_id')
    temp = less_true
    less_true = temp.set_index('matchedsentence_id')
    out = out.join(less_true, on='matchedsentence_id', how='outer')
    out = out.reset_index()
    out.kw_match.fillna(value=0, axis=0, inplace=True)
    out.relevant.fillna(value=0, axis=0, inplace=True)
    temp = out
    out['kw_match'] = temp.kw_match.astype(int)
    out['relevant'] = temp.relevant.astype(int)
    return out


# this method is called when relevant_articles and irrelevant_articles are not disjoint sets
# resolves conflict by upgrading all occurances of an article_id in relevant_articles to relevant
# ie. a sentence from an article is matched to an officer and appears in matchedsentence_officers data
#     a different sentence from same article is not matched and deemed irrelevant, appears in matchedsentence data
#     conflict occurs when datasets merged
def correct_relevant(df):
    copy = df.copy()
    relevant = copy.loc[copy.relevant == 1].article_id.unique().tolist()
    copy.loc[copy.article_id.isin(relevant), 'relevant'] = 1
    return copy


# This method builds the POSITIVE cases: keyword matched AND article relevant (per Rajiv)
def prep_pos_train_test(df, train_perc=0.80, test_perc=0.20):
    id_mask = (df.officer_id.notnull())
    possible = df.loc[id_mask].article_id.unique().tolist()
    train_list, test_list = train_test_split(possible, test_size=test_perc, train_size=train_perc, shuffle=True)
    assert set(train_list).isdisjoint(set(test_list))
    return train_list, test_list


# This method builds the NEGATIVE cases: keyword matched but not relevant
def prep_neg_train_test(df, pos_rate, curr_train_n, curr_test_n):
    assert 0 < pos_rate <= 0.5
    target_train = ceil(curr_train_n/pos_rate)
    target_test = ceil(curr_test_n/pos_rate)
    needed_train = target_train - curr_train_n
    needed_test = target_test - curr_test_n
    id_mask = (df.kw_match == 1) & (df.officer_id.isnull())
    possible = df.loc[id_mask].article_id.unique().tolist()
    assert needed_train + needed_test <= len(possible)
    train_list, test_list = train_test_split(possible, test_size=needed_test, train_size=needed_train, shuffle=True)
    assert set(train_list).isdisjoint(set(test_list))
    return train_list, test_list


def make_train_test_cols(df, pos_rate):
    copy = df.copy()
    # get pos/neg and train/test indice sets
    pos_train_idx, pos_test_idx = prep_pos_train_test(copy)
    neg_train_idx, neg_test_idx = prep_neg_train_test(copy, pos_rate, len(pos_train_idx), len(pos_test_idx))
    # train
    train_idx = pos_train_idx + neg_train_idx
    copy['train'] = [1 if val in train_idx else 0 for val in copy.article_id.values]
    # test
    test_idx = pos_test_idx + neg_test_idx
    copy['test'] = [1 if val in test_idx else 0 for val in copy.article_id.values]
    return copy[['article_id', 'matchedsentence_id', 'source_id', 'author', 'title', 'text', \
                'content', 'officer_id', 'extracted_keywords', 'kw_match', 'relevant', 'train', 'test']]


def make_train_test_df(df):
    full = df.loc[((df.train == 1) | (df.test == 1)), ['article_id', 'content', 'relevant', 'test']]
    full.drop_duplicates(subset='article_id', inplace=True)
    return full


# Since out.kw_match = out.relevant_count + out.irrelevant_count, and relevant can't be true without kw_match,
# (out.relevant_count) / (out.kw_match) should be the proportion of relevant samples given kw_match for col value
def make_report(df, col):
    kw_match_vc = df.loc[df.kw_match == 1][col].value_counts().to_dict()
    relevant_vc = df.loc[df.relevant == 1][col].value_counts().to_dict()
    irrelevant_match_vc = df.loc[(df.kw_match == 1) & (df.relevant != 1)][col].value_counts().to_dict()
    kws = set(list(kw_match_vc.keys()) + list(relevant_vc.keys()) + list(irrelevant_match_vc.keys()))
    out_data = {kw:{} for kw in kws}
    for kw in kws:
        if kw in kw_match_vc:
            out_data[kw]['kw_match'] = kw_match_vc[kw]
        else:
            out_data[kw]['kw_match'] = 0
        if kw in relevant_vc:
            out_data[kw]['relevant_count'] = relevant_vc[kw]
        else:
            out_data[kw]['relevant_count'] = 0
    out = pd.DataFrame.from_dict(out_data).T.reset_index().rename(columns={'index':col})
    out['relevant_perc'] = round((out.relevant_count) / (out.kw_match), 3)
    return out


def make_final_logs(text_df, sen_df, true_df, train_test_df, merged):
    logging.info('I/O id summary')
    logging.info('=======================================================================')
    logging.info(pretty_str('all kw_match articles in raw data:', True))      # asserted by check_asserts()
    logging.info(pretty_str('all matchedsentences in kw_match data:', True))
    logging.info(pretty_str('unique articles:', len(text_df.id.unique())))
    logging.info(pretty_str('unique articles w/ kw match:', len(sen_df.article_id.unique())))
    logging.info(pretty_str('unique matched sentences:', len(sen_df.id.unique())))
    logging.info(pretty_str('unique matched sentences relevant:', len(true_df.matchedsentence_id.unique())))
    logging.info(pretty_str('unique matched officers relevant:', len(true_df.id.unique())))
    logging.info(pretty_str('unique articles in train_test:', len(train_test_df.article_id.unique()), newline=True))
    return 1

In [4]:
# NEED TO OUTPUT:
# 1. article_id (given in data)
# 2. article text (given in data)
# 3. relevant (if article in true_df)
# 4. test (if article is reserved for testing model)     Per TS: 500 train, 100 test for initial train
# (may add cols like author or title)

# CONSIDERING
# correct_kw_match? does article_id conflict also occur with matchedsentence_id?

In [5]:
pd.set_option('mode.chained_assignment', 'raise')
# __main__

# newsarticle: initial dataset
#    - has all the data related to the article as it was pulled into feed
# matchedsentence: initial keyword filter
#    - has all the data related to every article with at least one sentence matching a keyword
# matchedsentence_officers: manual filter (Rajiv)
#    - has select columns linking identified officer badges and articles confirmed relevant by Rajiv
# NOTE: If an article is not in the manual filter set, it is not relevant
news_text = '../input/news_articles_newsarticle.csv.gz'
news_included = '../input/news_articles_matchedsentence.csv.gz'
news_true = '../input/news_articles_matchedsentence_officers.csv.gz'

text_df = open_gz(news_text)
sen_df = open_gz(news_included)
true_df = open_gz(news_true)
check_asserts(text_df, sen_df, true_df)

less_text, less_sen, less_true = prep_dfs(text_df, sen_df, true_df)
merged = merge_dfs(less_text, less_sen, less_true)

# make sure every article_id has a corresponding 'relevant' value
all_ids = set(merged.article_id.unique())
relevant_articles = set(merged.loc[(merged.relevant == 1)].article_id.unique())
irrelevant_articles = set(merged.loc[(merged.relevant == 0)].article_id.unique())
rel_vals = relevant_articles.union(irrelevant_articles)
assert all_ids.difference(rel_vals) == set()
overlap = relevant_articles.intersection(irrelevant_articles)
print(pretty_str('unique relevant articles:', len(relevant_articles)))
print(pretty_str('unique irrelevant articles:', len(irrelevant_articles)))
# check for conflicting 'relevant' values, correct if present
print(pretty_str('relevant and irrelevant disjoint:', overlap == set()))
if overlap != set():
    print(pretty_str('size of overlap:', len(overlap)))
    temp = merged
    merged = correct_relevant(temp)
    print(pretty_str('amended relevant column:', True))

# proceed with generating training data
merged = make_train_test_cols(merged, pos_rate=0.50)
train_test_df = make_train_test_df(merged)

# generate source_id, author, keyword, reports
src_report = make_report(merged, 'source_id')
author_report = make_report(merged, 'author')
kw_report = make_report(merged, 'extracted_keywords')

# save outputs
train_test_df.to_parquet('../output/train-test.parquet')
#news.to_parquet('../output/news.parquet')
#src_report.to_parquet('../output/source_report.parquet')
#author_report.to_parquet('../output/author_report.parquet')
#kw_report.to_parquet('../output/keyword_report.parquet')

unique relevant articles:                         316
unique irrelevant articles:                       29616
relevant and irrelevant disjoint:                 False
size of overlap:                                  225
amended relevant column:                          True


In [13]:
# Estimated true pos_rate
kw_match_n = merged.loc[merged.kw_match==1].article_id.count()
relevant_n = merged.loc[merged.relevant==1].article_id.count()
round(relevant_n/kw_match_n, 2)

0.15

In [14]:
# Are there any articles that contain a matched sentence but kw_match isn't True?
match = set(merged.loc[merged.kw_match == 1].article_id.unique())
no_match = set(merged.loc[merged.kw_match == 0].article_id.unique())
match.intersection(no_match)

set()

In [93]:
# outgoing train_test asserts:
#    1. No article_id or content is missing from either train or test sets
#    2. No article_id or content appears in both train AND test sets
#    3. Every article_id has a relevant, test value
#    - Could assert pos_rate with margin of error +- 0.1

# outgoing rule 1
assert train_test_df.loc[train_test_df.article_id.isnull()].shape == (0,4)
assert train_test_df.loc[train_test_df.content.isnull()].shape == (0,4)

# outgoing rule 2
train_articles = set(train_test_df.loc[train_test_df.test == 0].article_id.unique())
test_articles = set(train_test_df.loc[train_test_df.test == 1].article_id.unique())
assert train_articles.isdisjoint(test_articles)
train_contents = set(train_test_df.loc[train_test_df.test == 0].content.unique())
test_contents = set(train_test_df.loc[train_test_df.test == 1].content.unique())
print(train_contents.isdisjoint(test_contents))

# caveat to rule 2: duplicate 
article_ids = set(train_test_df.article_id.unique())
contents = set(train_test_df.content.unique())
print(train_test_df.shape)
assert len(article_ids) == train_test_df.shape[0]
print(len(contents))

# outgoing rule 3
assert train_test_df.loc[train_test_df.relevant.isnull()].shape == (0,4)
assert train_test_df.loc[train_test_df.test.isnull()].shape == (0,4)

True
False
(602, 4)
597


In [88]:
# Rule 1 violation: Why is unique content < number of rows
dup_content = train_test_df.loc[train_test_df.duplicated(subset='content')].content.values.tolist()
if dup_content != []:
    dup_content_df = train_test_df.loc[train_test_df.content.isin(dup_content)]
    print('duplicated content shape:', dup_content_df.shape)
    dup_content_ids = dup_content_df.article_id.unique().tolist()
    print('article_ids in train_test_df implicated by dup_content:', len(dup_content_ids))
    all_dup_content_ids = merged.loc[merged.article_id.isin(dup_content_ids)].article_id.unique().tolist()
    print('article_ids in merged implicated by dup_content:', len(all_dup_content_ids))

duplicated content shape: (10, 4)
article_ids in train_test_df implicated by dup_content: 10
article_ids in merged implicated by dup_content: 10


In [90]:
# Are matchedsentence_ids unique by text, or by text in unique article?
dup_text = merged.loc[merged.duplicated(subset='text')].text.unique().tolist()
dup_text_df = merged.loc[merged.text.isin(dup_text)]

dup_text_ids = {}
for tup in dup_text_df.itertuples():
    this_art_id = tup.article_id
    this_mat_id = tup.matchedsentence_id
    if str(tup.text) != 'nan':
        this_art_id = tup.article_id
        this_mat_id = tup.matchedsentence_id
        if tup.text not in dup_text_ids:
            dup_text_ids[tup.text] = {'article_id': [this_art_id], 'matchedsentence_id': [this_mat_id]}
        else:
            if this_art_id not in dup_text_ids[tup.text]['article_id']:
                dup_text_ids[tup.text]['article_id'].append(this_art_id)
            if this_mat_id not in dup_text_ids[tup.text]['matchedsentence_id']:
                dup_text_ids[tup.text]['matchedsentence_id'].append(this_mat_id)

multiple_art = 0
for text, id_dict in dup_text_ids.items():
    if (len(id_dict['article_id']) > 1):
        multiple_art += 1
        # suspect sentences found in distinct articles are processed as distinct sentences
        # matchedsentence_id only refers to uniqueness within article, not in database
        assert len(id_dict['article_id']) == len(id_dict['matchedsentence_id'])

print('duplicate text shape:\t\t\t', dup_text_df.shape)
print('unique text samples duplicated:\t\t', len(dup_text_ids))
print('articles affected by duplicate text:\t', text_mult_art)

duplicate text shape:			 (26765, 12)
unique text samples duplicated:		 580
articles affected by duplicate text:	 410


In [21]:
print(merged.loc[merged.extracted_keywords == "{'terminated', 'officer', 'police'}"].content.values)

['Caddo Parish School announced Monday morning that Huntington High School will have an early release after power outage. Students will be released at 11 a.m. for pick up and bus release. Caddo Parish Schools said they are currently working with SWEPCO to restore power to the campus. SWEPCO estimates power will be restored later this afternoon. School will remain closed for the remainder of the day and will be in normal operation tomorrow. More:Shreveport Police Department officer terminated for violation of rules and regulations Makenzie Boucher is a reporter with the Shreveport Times. Contact her at mboucher@gannett.com.']


# Reviewing output data

In [6]:
train_test_df

Unnamed: 0,article_id,content,relevant,test
25391,31316,A prosecutor is denying accusations that detec...,1,0
25403,31266,(The Center Square) — Motorists on the Atchafa...,0,0
25404,31264,"Shreveport, La -- A Shreveport man who police ...",0,0
25410,31255,A 25-year-old man has been identified as the p...,0,0
25473,31151,\nNEW ORLEANS (WGNO) — On Monday morning the U...,0,0
...,...,...,...,...
36030,460,"Despite a slow down because of Hurricane Ida, ...",1,0
36040,413,The Innocence Project New Orleans has filed ba...,0,0
36045,405,A Louisiana State Police trooper who initiated...,1,1
36097,391,The number of COVID-19 cases among people inca...,0,0


In [7]:
merged

Unnamed: 0,article_id,matchedsentence_id,source_id,author,text,content,officer_id,extracted_keywords,kw_match,relevant,train,test
0,31383,,4,Keisha Swafford,,Living with diabetes is a lifetime challenge f...,,,0,0,0,0
1,31381,,2,Charles Salzer,,As the Live Oak softball team begins postseaso...,,,0,0,0,0
2,31380,,2,Community News Report,,North Oaks Sports Medicine certified athletic ...,,,0,0,0,0
3,31378,,2,Community News Report,,The Great American Cleanup Love the Boot event...,,,0,0,0,0
4,31377,,2,Community News Report,,Gordon McKernan Injury Attorneys has launched ...,,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36105,373,9.0,1,Marta Jewson,Chief Operations Officer Tiffany Delcour estim...,"At a press conference Wednesday, NOLA Public S...",,{'officer'},1,0,1,0
36106,366,3.0,1,Carly Berlin,At a Monday press conference with New Orlean...,"UPDATE: After this story was published, FEMA e...",,{'officer'},1,0,0,0
36107,365,2.0,1,Marta Jewson,"Before students return to any campus, NOLA P...",Outside Frederick Douglass High School Thursda...,,{'officer'},1,0,0,0
36108,362,1.0,1,Marta Jewson,"Damage report As of Tuesday, roughly half of t...","About 250,000 Louisiana students remain out of...",,{'officer'},1,0,0,0


## Reports
- `extracted_keywords` in train/test/rem
- sources in train/test/rem
- authors in train/test/rem

In [8]:
kw_report.sort_values(by='relevant_perc', ascending=False)

Unnamed: 0,extracted_keywords,kw_match,relevant_count,relevant_perc
6,"{'police', 'terminated'}",4,4,0.5
0,"{'terminated', 'officer'}",4,3,0.429
11,"{'nopd', 'police', 'officer'}",5,2,0.286
5,"{'police', 'officer'}",889,191,0.177
8,"{'nopd', 'officer'}",70,12,0.146
9,{'terminated'},91,14,0.133
2,{'officer'},2764,422,0.132
4,{'police'},6350,938,0.129
10,"{'police', 'nopd', 'officer'}",7,1,0.125
7,"{'nopd', 'police'}",20,2,0.091


In [31]:
print(kw_report[['extracted_keywords', 'relevant_perc']].sort_values(by='relevant_perc', ascending=False))

                     extracted_keywords  relevant_perc
6              {'police', 'terminated'}          0.500
0             {'terminated', 'officer'}          0.429
11        {'nopd', 'police', 'officer'}          0.286
5                 {'police', 'officer'}          0.177
8                   {'nopd', 'officer'}          0.146
9                        {'terminated'}          0.133
2                           {'officer'}          0.132
4                            {'police'}          0.129
10        {'police', 'nopd', 'officer'}          0.125
7                    {'nopd', 'police'}          0.091
12                             {'nopd'}          0.079
1   {'police', 'terminated', 'officer'}          0.000
3                    {'police', 'nopd'}          0.000


In [9]:
src_report.sort_values(by='relevant_count', ascending=False)

Unnamed: 0,source_id,kw_match,relevant_count,relevant_perc
1,2,3345,790,0.191
18,19,1497,144,0.088
15,16,831,107,0.114
0,1,255,75,0.227
16,17,154,68,0.306
23,25,102,67,0.396
28,30,1204,53,0.042
6,7,528,43,0.075
30,32,303,42,0.122
13,14,304,32,0.095


In [10]:
author_report.sort_values(by='relevant_count', ascending=False)

Unnamed: 0,author,kw_match,relevant_count,relevant_perc
266,Katie Gagliano,226,103,0.313
4,Claire Taylor,152,100,0.397
522,James Finn,237,81,0.255
235,Staff Report,437,68,0.135
179,Joe Gyan Jr.,167,59,0.261
...,...,...,...,...
239,Ron Faucheux,2,0,0.000
238,Bess Casserleigh,2,0,0.000
237,WBRZ News,4,0,0.000
234,Special to The Town Talk,2,0,0.000


# Reviewing input data

### `text_df`

In [11]:
pretty_print('full text shape:', text_df.shape)
print('full text cols:\n', list(text_df.columns))

full text shape:                                  (29707, 12)
full text cols:
 ['created_at', 'link', 'guid', 'source_id', 'updated_at', 'content', 'published_date', 'id', 'title', 'is_processed', 'author', 'url']


In [12]:
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29707 entries, 0 to 29706
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   created_at      29707 non-null  object
 1   link            29707 non-null  object
 2   guid            29707 non-null  object
 3   source_id       29707 non-null  int64 
 4   updated_at      29707 non-null  object
 5   content         29254 non-null  object
 6   published_date  29707 non-null  object
 7   id              29707 non-null  int64 
 8   title           29707 non-null  object
 9   is_processed    29707 non-null  bool  
 10  author          27270 non-null  object
 11  url             29707 non-null  object
dtypes: bool(1), int64(2), object(9)
memory usage: 2.5+ MB


In [13]:
get_unique_report(text_df)

             distinct value count by col
created_at                                        29707
link                                              29707
guid                                              29698
source_id                                         34
updated_at                                        29707
content                                           28581
published_date                                    529
id                                                29707
title                                             28392
is_processed                                      1
author                                            2200
url                                               29707


### `sen_df`

In [14]:
pretty_print('matched sen shape:', sen_df.shape)
print('matched sen cols:\n', list(sen_df.columns))

matched sen shape:                                (10470, 7)
matched sen cols:
 ['id', 'created_at', 'updated_at', 'article_id', 'extracted_keywords', 'text', 'title']


In [15]:
sen_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10470 entries, 0 to 10469
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  10470 non-null  int64 
 1   created_at          10470 non-null  object
 2   updated_at          10470 non-null  object
 3   article_id          10470 non-null  int64 
 4   extracted_keywords  10470 non-null  object
 5   text                10470 non-null  object
 6   title               10470 non-null  object
dtypes: int64(2), object(5)
memory usage: 572.7+ KB


In [16]:
get_unique_report(sen_df)

             distinct value count by col
id                                                10470
created_at                                        10470
updated_at                                        10470
article_id                                        4323
extracted_keywords                                60
text                                              9925
title                                             4179


### `true_df`

In [17]:
pretty_print('matched true shape:', true_df.shape)
print('matched true cols:\n', list(true_df.columns))

matched true shape:                               (735, 3)
matched true cols:
 ['id', 'matchedsentence_id', 'officer_id']


In [18]:
true_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 3 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   id                  735 non-null    int64
 1   matchedsentence_id  735 non-null    int64
 2   officer_id          735 non-null    int64
dtypes: int64(3)
memory usage: 17.4 KB


In [19]:
get_unique_report(true_df)

             distinct value count by col
id                                                355
matchedsentence_id                                479
officer_id                                        355
