In [78]:
# Get data statistics
import os

from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.datasets import ColumnDataset
from collections import defaultdict

In [18]:
# Load data

# define columns
columns = {0: 'text', 1: 'np', 2: 'pos', 3: 'ner'}

# this is the folder in which train, test and dev files reside
data_folder = '/lfs1/shared/nlp/conll2003data/conll2003_centralized'

# init a corpus using column format, data folder and the names of the train, dev and test files
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='valid.txt')

2022-03-06 17:56:33,155 Reading data from /lfs1/shared/nlp/conll2003data/conll2003_centralized
2022-03-06 17:56:33,157 Train: /lfs1/shared/nlp/conll2003data/conll2003_centralized/train.txt
2022-03-06 17:56:33,158 Dev: /lfs1/shared/nlp/conll2003data/conll2003_centralized/valid.txt
2022-03-06 17:56:33,159 Test: /lfs1/shared/nlp/conll2003data/conll2003_centralized/test.txt


In [19]:
print('Report:')
print('Number of sentences in train:', len(corpus.train))
print('Number of sentences in dev:', len(corpus.dev))
print('Number of sentences in test:', len(corpus.test))

Report:
Number of sentences in train: 14987
Number of sentences in dev: 3466
Number of sentences in test: 3684


In [20]:
corpus.dev.sentences[1].tokens[0].annotation_layers

{'np': [NNP (1.0)], 'pos': [B-NP (1.0)], 'ner': [O (1.0)]}

In [77]:
corpus.dev.sentences[0].tokens[0].text

'-DOCSTART-'

In [80]:
corpus.dev.sentences[3].get_spans('ner')[0].annotation_layers['ner'][0].value

'West Indian'

In [89]:
counts = defaultdict(int)
entities = defaultdict(set)

for sentence in corpus.train.sentences:
    # Skip 'DOCSTART' lines
    if len(sentence.tokens)>0 and 'DOCSTART' in sentence.tokens[0].text:
        continue
    
    for span in sentence.get_spans('ner'):
        counts[span.annotation_layers['ner'][0].value] += 1
        entities[span.annotation_layers['ner'][0].value].add(span.text)

In [88]:
counts

defaultdict(int, {'ORG': 6321, 'MISC': 3438, 'PER': 6600, 'LOC': 7140})

In [86]:
entities

defaultdict(set,
            {'ORG': {'SPIF CESKY',
              'Treasury',
              'Chesapeake Police Department',
              'Shanghai Post and Telecomm',
              'Cowdenbeath',
              'WesBanco Bank',
              'Mentmore Abbey',
              'GENOA',
              'DSE',
              'Chicago Board Options Exchange',
              'Hassania Agadir',
              'Shanghai-Ek Chor',
              'First Union',
              'Gazprom',
              'PTT',
              'Bristol',
              'National Human Rights Commission',
              'Chievo',
              'EXPRESS',
              'Dinamo Bucharest',
              'Cosenza',
              'Collingwood',
              'Triple-A Columbus',
              'NZPA',
              'Frontier',
              'AD',
              'Australia Senate',
              'Orii Corp',
              'Steaua Bucharest',
              'National Abortion Rights Action League',
              'Baltika Kaliningrad',
   