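In this notebook, we report summary statistics — sentence, token, and word-type counts, sentence-length distribution, and tag frequencies — for the train, dev, and test sets of each fold.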

In [1]:
import sys
sys.path.insert(0, '..')

from collections import defaultdict
from statistics import mean, median, stdev
import os

from utils import CorpusReader

In [9]:
def report_statistics(reader):
    """Print summary statistics for a tagged corpus.

    Reports the number of sentences, word tokens, and word types (distinct
    lowercased words), the mean / median / sample standard deviation of the
    sentence length, and each tag's count and share of all tokens, most
    frequent tag first.

    Statistics that are undefined for the given corpus (mean and median of an
    empty corpus, standard deviation of fewer than two sentences) are printed
    as ``nan`` instead of raising ``statistics.StatisticsError``.

    Args:
        reader: Any object whose ``tagged_sents()`` method yields sentences as
            iterables of ``(word, tag)`` pairs.

    Returns:
        None. All results are written to standard output.
    """
    sent_lengths = []  # number of tokens in each sentence, in corpus order
    seen = set()       # lowercased word forms encountered so far
    tag_counter = defaultdict(int)

    for tagged_sent in reader.tagged_sents():
        length = 0
        for word, tag in tagged_sent:
            length += 1
            seen.add(word.lower())
            tag_counter[tag] += 1
        sent_lengths.append(length)

    # Computed once: used for the token total and for every tag percentage.
    total_tokens = sum(sent_lengths)

    # mean/median need at least one data point and stdev needs at least two;
    # fall back to NaN so tiny or empty corpora are still reported.
    nan = float('nan')
    mean_len = mean(sent_lengths) if sent_lengths else nan
    median_len = median(sent_lengths) if sent_lengths else nan
    stdev_len = stdev(sent_lengths) if len(sent_lengths) > 1 else nan

    print('Number of sentences:', len(sent_lengths))
    print('Number of word tokens:', total_tokens)
    print('Number of word types:', len(seen))
    print(f'Mean, median, and stdev of sentence length: {mean_len:.2f}, {median_len}, {stdev_len:.2f}')
    print('Tag counts:')
    # tag_counter is non-empty only if at least one token was seen, so
    # total_tokens is never zero inside this loop.
    for tag, count in sorted(tag_counter.items(), key=lambda p: p[1], reverse=True):
        print(tag, '=>', count, f'({count/total_tokens:.2%})')

In [10]:
num_folds = 5

# Directory holding the per-fold corpus files, named `<split>.<NN>.tsv`.
data_dir = os.path.join('..', 'data', 'working', '2018-04-14', 'idn-tagged')

for fold in range(num_folds):
    fold_no = fold + 1  # headings and file names use 1-based fold numbers
    print('### Fold', fold_no)
    for name in ('train', 'dev', 'test'):
        corpus_path = os.path.join(data_dir, f'{name}.{fold_no:02}.tsv')
        reader = CorpusReader(corpus_path)
        print('**', name, 'set')
        report_statistics(reader)
    print()

### Fold 1
** train set
Number of sentences: 7222
Number of word tokens: 184345
Number of word types: 13870
Mean, median, and stdev of sentence length: 25.53, 25.0, 10.27
Tag counts:
NN => 44672 (24.23%)
NNP => 24845 (13.48%)
VB => 22790 (12.36%)
Z => 18862 (10.23%)
IN => 15371 (8.34%)
CD => 12758 (6.92%)
SC => 9366 (5.08%)
JJ => 6997 (3.80%)
PRP => 5408 (2.93%)
CC => 5322 (2.89%)
PR => 3896 (2.11%)
MD => 3774 (2.05%)
RB => 3463 (1.88%)
FW => 1666 (0.90%)
SYM => 1569 (0.85%)
NEG => 1113 (0.60%)
NND => 1021 (0.55%)
OD => 541 (0.29%)
X => 294 (0.16%)
DT => 277 (0.15%)
WH => 180 (0.10%)
RP => 137 (0.07%)
UH => 23 (0.01%)
** dev set
Number of sentences: 802
Number of word tokens: 20659
Number of word types: 4408
Mean, median, and stdev of sentence length: 25.76, 25.0, 10.65
Tag counts:
NN => 4944 (23.93%)
NNP => 2841 (13.75%)
VB => 2555 (12.37%)
Z => 2159 (10.45%)
IN => 1727 (8.36%)
CD => 1421 (6.88%)
SC => 1011 (4.89%)
JJ => 761 (3.68%)
PRP => 635 (3.07%)
CC => 574 (2.78%)
PR => 411 (1.99