# Dataset conll04

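This dataset is constructed from news articles and has four entity types (Location, Organization, People, Other) and five relation types (Work_For, Kill, OrgBased_In, Live_In, Located_In).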

In [22]:
import sys
sys.path.insert(0,'..')
import json
from abc import abstractmethod, ABC
from collections import OrderedDict
from logging import Logger
from typing import List
from tqdm import tqdm
from transformers import BertTokenizer

from spert import util
from spert.entities import Dataset, EntityType, RelationType, Entity, Relation, Document
from spert.opt import spacy
import pandas as pd



Unnamed: 0,short,verbose,symmetric
Work_For,Work,Work for,False
Kill,Kill,Kill,False
OrgBased_In,OrgBI,Organization based in,False
Live_In,Live,Live in,False
Located_In,LocIn,Located in,False


## Entities and Relation Types
Both types are in file `conll04_types.json` and can be extracted as follows: 

In [24]:

types_path = "../data/datasets/conll04/conll04_types.json"

# Read the type definitions through a context manager so the file handle is
# closed as soon as parsing finishes; JSON is read explicitly as UTF-8 rather
# than with the platform default encoding.
with open(types_path, encoding="utf-8") as types_file:
    types = json.load(types_file, object_pairs_hook=OrderedDict)

# The 'entities' and 'relations' sections are keyed by type id; transposing
# gives one row per type with its attributes as columns.
entity_types = pd.DataFrame(types['entities']).transpose()
relation_types = pd.DataFrame(types['relations']).transpose()

relation_types

Unnamed: 0,short,verbose,symmetric
Work_For,Work,Work for,False
Kill,Kill,Kill,False
OrgBased_In,OrgBI,Organization based in,False
Live_In,Live,Live in,False
Located_In,LocIn,Located in,False


In [25]:
# Display the entity-type table (one row per entity type).
entity_types

Unnamed: 0,short,verbose
Loc,Loc,Location
Org,Org,Organization
Peop,Peop,People
Other,Other,Other


## Data Splits

The data are pre-split into train, train-dev, dev, and test sets.

In [31]:
# One JSON file per split; `paths` and `labels` are kept index-aligned.
paths = ["../data/datasets/conll04/conll04_train.json",
         "../data/datasets/conll04/conll04_train_dev.json",
         "../data/datasets/conll04/conll04_dev.json",
         "../data/datasets/conll04/conll04_test.json"]
labels = ["train","train_dev","dev","test"]

# Load every split into a dict keyed by its label. Each file is opened in a
# context manager so its handle is released before the next one is read, and
# is decoded explicitly as UTF-8 instead of the platform default.
data_splits = {}
for pth, label in zip(paths, labels):
    with open(pth, encoding="utf-8") as split_file:
        data_splits[label] = json.load(split_file, object_pairs_hook=OrderedDict)
data_splits.keys()


dict_keys(['train', 'train_dev', 'dev', 'test'])

In [45]:
# Inspect a single test example: its keys, tokens, entity spans, relations
# and original id.
ex1 = data_splits['test'][100]
entity_table = pd.DataFrame(ex1['entities'])

print("Each example contains:", ex1.keys())
# `end='\n\n'` emits the blank separator line after each section.
print('Tokens:', ex1['tokens'], end='\n\n')
print('Entities:', entity_table, sep='\n', end='\n\n')
print('Relations:', ex1['relations'], sep='\n', end='\n\n')
print('original id:', ex1['orig_id'])



Each example contains: odict_keys(['tokens', 'entities', 'relations', 'orig_id'])
Tokens: ['The', 'DEM', 'outlawed', 'plastic', 'ducks', 'in', 'Rhode', 'Island', 'waterways', 'after', 'a', 'race', 'in', 'Warwick', 'sent', '7', ',', '000', 'toy', 'ducks', 'floating', 'down', 'the', 'Pawtuxet', 'River.']

Entities:
  type  start  end
0  Loc      6    8
1  Loc     13   14
2  Loc     23   25

Relations:
[OrderedDict([('type', 'Located_In'), ('head', 1), ('tail', 0)]), OrderedDict([('type', 'Located_In'), ('head', 2), ('tail', 0)]), OrderedDict([('type', 'Located_In'), ('head', 2), ('tail', 1)])]

original id: 1996


The train-dev set is the union of train and dev as shown below:

In [49]:
# Summarise how many examples each split contains.
["%s has %i examples" % (split_name, len(examples))
 for split_name, examples in data_splits.items()]

['train has 922 examples',
 'train_dev has 1153 examples',
 'dev has 231 examples',
 'test has 288 examples']

In [66]:
# For every split, report how many mentions each entity type has.
for split in data_splits:
    print()
    print('Split',split)
    # Flatten the entity annotations of all examples into a list of type labels.
    mentions = [en['type'] for ex in data_splits[split] for en in ex['entities']]
    etypes = set(mentions)
    # Iterate in sorted order: the iteration order of a set of strings can
    # change between kernel sessions (hash randomization), which would
    # reshuffle the report on every re-run.
    for et in sorted(etypes):
        print('entity type %s has %d mentions'%(et, mentions.count(et)))



Split train
entity type Peop has 1087 mentions
entity type Loc has 1219 mentions
entity type Other has 455 mentions
entity type Org has 616 mentions

Split train_dev
entity type Peop has 1370 mentions
entity type Loc has 1541 mentions
entity type Other has 573 mentions
entity type Org has 786 mentions

Split dev
entity type Peop has 283 mentions
entity type Loc has 322 mentions
entity type Other has 118 mentions
entity type Org has 170 mentions

Split test
entity type Peop has 321 mentions
entity type Loc has 427 mentions
entity type Other has 133 mentions
entity type Org has 198 mentions


False