# Build the dataset

## Define filepaths

In [1]:
import os

In [2]:
# Project layout. These are relative paths, so they are resolved against the
# kernel's current working directory when files are opened.
root_dir = '../..'
data_dir = 'data/corpus'
src_dir = 'src'
# Input corpus file, located at <root_dir>/<data_dir>/<full_corpus_filename>.
full_corpus_filename = 'reduced_single_annotated_corpus.json'
full_corpus_filepath = os.path.join(root_dir, data_dir, full_corpus_filename)

---

## Load and parse the dataset into a DataFrame

In [3]:
import json

In [4]:
# Parse the whole annotated corpus file into memory.
# The encoding is passed explicitly so that decoding does not depend on the
# platform's default locale encoding (JSON interchange files are UTF-8).
with open(full_corpus_filepath, encoding='utf-8') as fd:
    full_corpus = json.load(fd)

In [5]:
import pandas as pd

In [6]:
# Annotations are stored as a mapping keyed by the *string* form of the doc id.
annotations_by_doc = full_corpus['annotations']

# The three parallel lists become columns; `doc_id` is only needed to look up
# each row's annotations and is dropped afterwards.
corpus_columns = {
    'doc_id': full_corpus['docs'],
    'text': full_corpus['texts'],
    'tokens': full_corpus['tokens'],
}
full_corpus_df = (
    pd.DataFrame(corpus_columns)
    .assign(annotations=lambda df: df['doc_id'].map(
        lambda doc_id: annotations_by_doc[str(doc_id)]))
    .drop(columns='doc_id')
)
full_corpus_df.head()

Unnamed: 0,text,tokens,annotations
0,vice presidency; gov. roosevelt's nomination n...,"[vice, presidency, gov, roosevelt, nomination,...","[Q1124, Q207, Q23505]"
1,"clubs; ""civic army"" formation address issued i...","[clubs, civic, army, formation, address, issue...",[Q29468]
2,corruption fund; senator mccarren and w. j. st...,"[corruption, fund, senator, mccarren, w., j., ...",[Q1124]
3,bearing of the campaign in newark on the gover...,"[bearing, campaign, newark, governorship, elec...","[Q60, Q1384, Q1408]"
4,parade.danforth for vice president.,"[parade, danforth, vice, president]","[Q1124, Q23505]"


### Keep only texts with exactly one annotation

In [7]:
# Rows whose annotation list holds exactly one label.
has_single_annotation = full_corpus_df['annotations'].map(len) == 1

# Keep those rows and replace the one-element list by its only label.
full_corpus_df = (
    full_corpus_df.loc[has_single_annotation]
    .assign(annotation=lambda df: df['annotations'].map(lambda labels: labels[0]))
    .drop(columns='annotations')
)
full_corpus_df.head()

Unnamed: 0,text,tokens,annotation
1,"clubs; ""civic army"" formation address issued i...","[clubs, civic, army, formation, address, issue...",Q29468
2,corruption fund; senator mccarren and w. j. st...,"[corruption, fund, senator, mccarren, w., j., ...",Q1124
6,summary of campaigns.eleven republican nationa...,"[summary, campaigns, eleven, republican, natio...",Q29468
7,"hendricks, ex-senator francis; gov. roosevelt ...","[hendricks, ex-senator, francis, gov, roosevel...",Q23505
8,recorder goff's charge that law forbids the re...,"[recorder, goff, charge, law, forbids, retenti...",Q11201


In [8]:
# (rows, columns) remaining after keeping single-annotation texts.
full_corpus_df.shape

(7160, 3)

### Group by annotation and sample data

In [9]:
# Upper bound on the number of documents drawn per annotation group.
sample_size = 300
# Seed passed to DataFrame.sample so the draw is repeatable.
random_state = 3

In [10]:
grouped_full_corpus_df = full_corpus_df.groupby('annotation')

# Draw at most `sample_size` rows from every annotation group; groups smaller
# than that are taken in full. The same seed is used for every group.
sampled_data_list = [
    label_df.sample(n=min(len(label_df), sample_size), random_state=random_state)
    for _, label_df in grouped_full_corpus_df
]

In [11]:
# Stack the per-annotation samples row-wise into a single frame
# (row-wise is the default axis of pd.concat).
sampled_df = pd.concat(sampled_data_list)
sampled_df.shape

(3755, 3)

### Compute the size of each group

In [12]:
grouped_sampled_df = sampled_df.groupby('annotation')

# Number of sampled documents per annotation, largest groups first.
sampled_group_sizes = grouped_sampled_df.size()
sampled_group_sizes.sort_values(ascending=False)

annotation
Q744448     300
Q66096      300
Q11211      300
Q1124       300
Q1384       300
Q148        300
Q11201      300
Q29468      300
Q330963     300
Q23505      281
Q60         200
Q3480437    163
Q11268      149
Q29552      138
Q1408        86
Q181648      28
Q207         10
dtype: int64

### Take the top N groups by size

In [13]:
# Number of annotation groups to keep.
top_n = 10
# Rank the groups by sampled size (largest first) and keep the first `top_n`
# labels. The slice uses `top_n` rather than a literal so that changing the
# constant above actually takes effect.
top_n_labels = grouped_sampled_df.size().sort_values(ascending=False).index.values[:top_n]
top_n_labels

array(['Q744448', 'Q66096', 'Q11211', 'Q1124', 'Q1384', 'Q148', 'Q11201',
       'Q29468', 'Q330963', 'Q23505'], dtype=object)

### Keep only documents whose label is among the top N

In [14]:
# Rows whose annotation is one of the retained labels.
in_top_labels = sampled_df['annotation'].isin(top_n_labels)

# Keep those rows and renumber them 0..n-1; the fresh index is later used as
# the document id.
sampled_corpus_df = sampled_df[in_top_labels].reset_index(drop=True)
sampled_corpus_df.head()

Unnamed: 0,text,tokens,annotation
0,panel of new york state judges hear arguments ...,"[panel, new, york, state, judges, hear, argume...",Q11201
1,prof stephen gillers op-ed article says florid...,"[prof, stephen, gillers, op-ed, article, says,...",Q11201
2,s seabury will start sup ct action to bring ha...,"[seabury, start, sup, ct, action, bring, hasti...",Q11201
3,3-judge fed panel rules that nys election law ...,"[3-judge, fed, panel, rules, nys, election, la...",Q11201
4,federal appeals court judges who ruled that ca...,"[federal, appeals, court, judges, ruled, calif...",Q11201


In [15]:
# (rows, columns) of the final sampled corpus.
sampled_corpus_df.shape

(2981, 3)

## Convert the data to the `TrainingCorpus` format

In [16]:
# Collects the fields that are later serialised to the JSON dataset file.
data_dict = {}

### Define the `docs` field

In [17]:
# Document ids are the frame's index values (renumbered by reset_index above).
data_dict['docs'] = sampled_corpus_df.index.tolist()

### Define the `texts` field

In [18]:
# Raw text of each document, in the same row order as `docs`.
data_dict['texts'] = sampled_corpus_df['text'].tolist()

### Define the `tokens` field

In [19]:
# Token list of each document, in the same row order as `docs`.
data_dict['tokens'] = sampled_corpus_df['tokens'].tolist()

### Define the `labels` field

In [20]:
# The retained labels as a plain list, sorted so the label order is stable.
label_list = top_n_labels.tolist()
label_list.sort()
data_dict['labels'] = label_list

### Define the `target` field

In [21]:
# Map each document id (index value) to a one-element list holding its label.
data_dict['target'] = {
    doc_id: [label]
    for doc_id, label in sampled_corpus_df['annotation'].items()
}

---

## Save to JSON file

In [22]:
# Output dataset file, written next to the input corpus under <root_dir>/<data_dir>.
dataset_filename = 'nyt_corpus.json'
dataset_filepath = os.path.join(root_dir, data_dir, dataset_filename)

In [23]:
# Serialise the dataset to disk, overwriting any existing file.
# The encoding is explicit so the result does not depend on the platform's
# default; json.dump escapes non-ASCII characters by default, so the bytes
# written are plain ASCII and readable under any ASCII-compatible encoding.
# Note: the integer keys of data_dict['target'] become strings in the JSON.
with open(dataset_filepath, 'w', encoding='utf-8') as fd:
    json.dump(data_dict, fd)

---

## Open the dataset as an instance of the `TrainingCorpus` class

In [24]:
import sys
# Make the project's source directory importable from this notebook.
# Guarded so that re-running the cell does not append duplicate entries.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [25]:
from training import TrainingCorpus

In [26]:
# Round-trip check: read the file just written through the project's loader.
# NOTE(review): the file format expected by TrainingCorpus.load is defined in
# the project's `training` module, not in this notebook.
corpus = TrainingCorpus()
corpus.load(dataset_filepath)

In [27]:
# Spot check: text of the document with id 0.
corpus.get_text(0)

"panel of new york state judges hear arguments about whether former sen guy j velella of bronx should be returned to prison, from which he was released by mayoral panel after serving three months of one-year sentence for conspiring to accept bribes; photo (m).appeals court weighs velella's return to jail"

In [28]:
# Spot check: token list of the document with id 0.
corpus.get_tokens(0)

['panel',
 'new',
 'york',
 'state',
 'judges',
 'hear',
 'arguments',
 'whether',
 'former',
 'sen',
 'guy',
 'j',
 'velella',
 'bronx',
 'returned',
 'prison',
 'released',
 'mayoral',
 'panel',
 'serving',
 'three',
 'months',
 'one-year',
 'sentence',
 'conspiring',
 'accept',
 'bribes',
 'photo',
 'appeals',
 'court',
 'weighs',
 'velella',
 'return',
 'jail']

---

## Compute noun chunks

In [29]:
# File used to persist the noun chunks, stored under <root_dir>/<data_dir>.
chunks_filename = 'nyt_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, chunks_filename)

In [30]:
# Show the resolved (relative) path for reference.
chunks_filepath

'../../data/corpus/nyt_chunks.json'

In [31]:
# Run noun-chunk detection over the loaded corpus (took about a minute for
# 2981 documents in the recorded run).
# NOTE(review): the detection method is implemented in TrainingCorpus — not visible here.
corpus.detect_chunks()

100%|██████████| 2981/2981 [01:02<00:00, 47.70it/s]


In [32]:
# Persist the detected chunks so the detection step need not be repeated.
corpus.save_chunks(chunks_filepath)

---

## Load chunks

In [33]:
# Reload chunks from the file written above.
# NOTE(review): presumably restores the state produced by detect_chunks — confirm in TrainingCorpus.
corpus.load_chunks(chunks_filepath)

In [34]:
# Peek at the first three (chunk, value) pairs.
# NOTE(review): the integer values look like occurrence counts — confirm in TrainingCorpus.
list(corpus.noun_chunks.items())[:3]

[('panel', 14), ('new_york_state_judges', 1), ('arguments', 11)]

In [35]:
# Document 0 with multi-word noun chunks joined by underscores (see output).
# NOTE(review): the meaning of `threshold` is defined in TrainingCorpus — confirm.
corpus.get_chunk_document(0, threshold=0)

['panel',
 'new_york_state_judges',
 'hear',
 'arguments',
 'whether',
 'former_sen_guy_j_velella',
 'bronx',
 'returned',
 'prison',
 'released',
 'mayoral_panel',
 'serving',
 'three_months',
 'one-year_sentence',
 'conspiring',
 'accept',
 'bribes',
 'photo',
 'appeals_court',
 'weighs',
 'velella_return',
 'jail']

In [36]:
# Original text of document 0, shown again for comparison with the chunked form.
corpus.get_text(0)

"panel of new york state judges hear arguments about whether former sen guy j velella of bronx should be returned to prison, from which he was released by mayoral panel after serving three months of one-year sentence for conspiring to accept bribes; photo (m).appeals court weighs velella's return to jail"

In [37]:
# Plain tokens of document 0, shown again for comparison with the chunked form.
corpus.get_tokens(0)

['panel',
 'new',
 'york',
 'state',
 'judges',
 'hear',
 'arguments',
 'whether',
 'former',
 'sen',
 'guy',
 'j',
 'velella',
 'bronx',
 'returned',
 'prison',
 'released',
 'mayoral',
 'panel',
 'serving',
 'three',
 'months',
 'one-year',
 'sentence',
 'conspiring',
 'accept',
 'bribes',
 'photo',
 'appeals',
 'court',
 'weighs',
 'velella',
 'return',
 'jail']

---