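In this notebook, we perform an exploratory analysis of the tagged corpus. Specifically, we compute summary statistics over the entire dataset: the number of sentences, word tokens, and word types; the mean, median, and standard deviation of sentence length; and the number of occurrences of each tag.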

In [11]:
import sys
sys.path.insert(0, '..')

from collections import defaultdict
from statistics import mean, median, stdev
import os

from utils import CorpusReader

In [6]:
# Relative path to the manually tagged corpus file (TSV); it starts at the
# parent directory ('..'), so it depends on the current working directory.
corpus_path = os.path.join(
    '..',
    'data',
    'input',
    'idn-tagged',
    'Indonesian_Manually_Tagged_Corpus.tsv',
)

In [10]:
# Make a single pass over the tagged corpus and collect the raw counts that
# the reporting cell summarises.
reader = CorpusReader(corpus_path)

n_sents = 0                     # number of sentences read
n_tokens = []                   # one entry per sentence: its token count
seen = set()                    # distinct lowercased word forms
tag_counter = defaultdict(int)  # tag -> number of tokens carrying that tag

for tagged_sent in reader.tagged_sents():
    # Each sentence is iterated as (word, tag) pairs; tally it token by token.
    sent_length = 0
    for token, label in tagged_sent:
        seen.add(token.lower())
        tag_counter[label] += 1
        sent_length += 1
    n_tokens.append(sent_length)
    n_sents += 1

In [13]:
# Corpus-level summary. `n_tokens` holds one length per sentence, so its sum
# is the total token count and its spread describes sentence length.
total_tokens = sum(n_tokens)
vocabulary_size = len(seen)

print('Number of sentences:', n_sents)
print('Number of word tokens:', total_tokens)
print('Number of word types:', vocabulary_size)
print(f'Mean, median, and stdev of sentence length: {mean(n_tokens):.2f}, {median(n_tokens)}, {stdev(n_tokens):.2f}')

# Per-tag frequencies, listed in the order each tag was first encountered.
print('Tag counts:')
for label, frequency in tag_counter.items():
    print(label, '=>', frequency)

Number of sentences: 10030
Number of word tokens: 256622
Number of word types: 16291
Mean, median, and stdev of sentence length: 25.59, 25.0, 10.29
Tag counts:
NN => 61940
SC => 13080
VB => 31733
NNP => 34649
JJ => 9724
RB => 4903
IN => 21311
Z => 26347
CD => 17819
CC => 7438
PR => 5348
PRP => 7583
MD => 5248
FW => 2366
NEG => 1520
DT => 381
NND => 1414
SYM => 2210
RP => 183
WH => 260
OD => 738
X => 397
UH => 30
