In [1]:
from postagger.utils.common import timeit, get_data_path
from postagger.utils.preprocess import load_save_preprocessed_data
from postagger.utils.decoder import CompData
import collections

In [2]:
# Feature occurrences: build the preprocessor whose `pdict` holds the
# extracted feature tuples explored in the rest of this notebook.
train_path = get_data_path('train.wtag')
train_sentences = CompData(train_path)
# NOTE(review): presumably loads 'train_preprocessed.pickle' when it already
# exists and otherwise preprocesses `train_sentences` and saves the result --
# confirm against postagger.utils.preprocess.
preprocessor = load_save_preprocessed_data('train_preprocessed.pickle', train_sentences)

In [3]:
# Show which feature families are stored in the preprocessor's feature dictionary.
preprocessor.pdict.keys()

dict_keys(['unigram-f105', 'trigram-f103', 'number_inside', 'bigram-f104', 'prefix-f102', 'suffix-f101', 'capital_inside', 'starting_capital', 'previousword-f106', 'nextword-f107', 'wordtag-f100'])

# exploration

In [4]:
def count(tuple_list, show_top=50):
    """Summarise how often each item occurs in ``tuple_list``.

    Args:
        tuple_list: Iterable of hashable items (e.g. feature tuples or tags).
        show_top: Maximum number of most frequent items to return.
            ``None`` returns every distinct item, ordered by frequency.

    Returns:
        A ``(total, total_uni, most_common)`` tuple where ``total`` is the
        number of items counted, ``total_uni`` is the number of distinct
        items and ``most_common`` is a list of ``(item, occurrences)`` pairs
        sorted from most to least frequent.
    """
    counter_obj = collections.Counter(tuple_list)
    total = sum(counter_obj.values())
    total_uni = len(counter_obj)
    # Honour the caller's limit; it used to be hard-coded to 50 regardless
    # of the value passed as `show_top`.
    most_common = counter_obj.most_common(show_top)
    return total, total_uni, most_common

In [5]:
def explore_all(pdict):
    """Print an occurrence report for every feature stored in ``pdict``.

    For each key the report contains the key itself, the total number of
    counted items, the number of distinct items and then one line per
    most-common entry as returned by ``count``. Reports are separated by
    two blank lines.

    Args:
        pdict: Mapping from a feature name to the collection of items
            observed for that feature.
    """
    for name in pdict:
        print(name)
        n_total, n_unique, top_entries = count(pdict[name])
        print("Total: " + str(n_total))
        print("Unique: " + str(n_unique))
        for entry in top_entries:
            print(entry)
        # A single newline payload plus print's own newline yields the two
        # blank separator lines.
        print("\n")

In [6]:
# Print the totals and the most common entries for every feature family.
explore_all(preprocessor.pdict)

unigram-f105
Total: 121815
Unique: 44
('NN', 16939)
('IN', 12819)
('NNP', 12417)
('DT', 10548)
('JJ', 7853)
('NNS', 7626)
(',', 6044)
('.', 4962)
('CD', 4658)
('RB', 3883)
('VBD', 3641)
('VB', 3468)
('CC', 2981)
('VBZ', 2936)
('TO', 2827)
('VBN', 2645)
('PRP', 2129)
('VBG', 2020)
('VBP', 1619)
('MD', 1311)
('POS', 1172)
('PRP$', 1026)
('``', 846)
("''", 823)
('$', 809)
('WDT', 594)
(':', 565)
('NNPS', 351)
('JJR', 333)
('RP', 332)
('WRB', 257)
('WP', 255)
('RBR', 236)
('JJS', 227)
('-LRB-', 178)
('-RRB-', 178)
('EX', 116)
('PDT', 69)
('RBS', 42)
('WP$', 27)
('FW', 25)
('UH', 15)
('#', 7)
('SYM', 6)


trigram-f103
Total: 111821
Unique: 7810
(('IN', 'DT', 'NN'), 1825)
(('DT', 'JJ', 'NN'), 1590)
(('DT', 'NN', 'IN'), 1423)
(('NN', 'IN', 'DT'), 1349)
(('NNP', 'NNP', 'NNP'), 1192)
(('IN', 'DT', 'JJ'), 1018)
(('JJ', 'NN', 'IN'), 886)
(('NNP', 'NNP', ','), 879)
(('NN', 'IN', 'NNP'), 810)
(('IN', 'NNP', 'NNP'), 806)
(('NNS', 'IN', 'DT'), 663)
(('DT', 'NN', 'NN'), 563)
(('$', 'CD', 'CD'), 563)
(

# Summary

Word / tag: unique 15415, keep top 50 (or greater than 228)

Prefix: unique 21525, the most common are overwhelmingly NN/IN/DT (pick top 20?)

Suffix: unique 12311, most common are NN/VB, (pick top 20?)

Unigram: 44 unique then keep all

Bigram: 1023 unique, keep 10% (top 100)

Trigram: 7810 unique, keep 10% (top 780)

Previous word: 32095 unique, keep 1% (top 320)

Next word: 30777 unique, keep 1% (top 307)

number_inside: 6 unique, keep top 3

capital_inside: 21 unique, keep top 3

starting_capital: 33 unique, keep top 10

## Prefix:

In [7]:
# The 500 most frequent prefix-feature entries. Despite its name, `counter_obj`
# is the list returned by most_common(): pairs of (feature tuple, occurrences),
# not a Counter.
counter_obj = collections.Counter(preprocessor.pdict['prefix-f102']).most_common(500)

In [8]:
# Hide the listed tags so the remaining prefix/tag pairs are easier to inspect.
excluded_tags = {'NN', 'NNS', 'NNPS', 'NNP', 'VBD', 'IN', 'DT', 'CD', '-LRB-', '-RRB-', 'WDT'}
# No trailing comma here: it would wrap the displayed list in a 1-tuple.
[x for x in counter_obj if x[0][1] not in excluded_tags]

([(('s', 'JJ'), 810),
  (('w', 'MD'), 755),
  (('c', 'JJ'), 625),
  (('f', 'JJ'), 616),
  (('a', 'RB'), 506),
  (('p', 'JJ'), 506),
  (('l', 'JJ'), 472),
  (('m', 'JJ'), 455),
  (('wi', 'MD'), 454),
  (('wil', 'MD'), 454),
  (('will', 'MD'), 454),
  (('s', 'VBZ'), 362),
  (('o', 'JJ'), 347),
  (('b', 'VBN'), 344),
  (('co', 'JJ'), 343),
  (('the', 'PRP'), 311),
  (('th', 'PRP'), 311),
  (('a', 'JJ'), 311),
  (('t', 'PRP'), 311),
  (('woul', 'MD'), 301),
  (('wou', 'MD'), 301),
  (('wo', 'MD'), 301),
  (('e', 'JJ'), 290),
  (('s', 'VB'), 279),
  (('r', 'JJ'), 277),
  (('say', 'VBZ'), 269),
  (('sa', 'VBZ'), 269),
  (('says', 'VBZ'), 269),
  (('i', 'JJ'), 268),
  (('h', 'VBP'), 266),
  (('al', 'RB'), 264),
  (('c', 'VB'), 261),
  (('ha', 'VBP'), 259),
  (('fi', 'JJ'), 258),
  (('hav', 'VBP'), 257),
  (('have', 'VBP'), 257),
  (('t', 'JJ'), 255),
  (('c', 'VBN'), 249),
  (('r', 'VB'), 248),
  (('n', 'JJ'), 245),
  (('be', 'VBN'), 242),
  (('s', 'VBN'), 235),
  (('they', 'PRP'), 228),
  ((

A lot of junk in this feature, even after filtering; possibly add its tuples manually instead (e.g., 'un', 'a', 'im').

## Suffix:

In [9]:
# The 100 most frequent suffix-feature entries. This rebinds `counter_obj`
# (previously the prefix list); it is the list returned by most_common(),
# not a Counter.
counter_obj = collections.Counter(preprocessor.pdict['suffix-f101']).most_common(100)

In [10]:
# Display every retained suffix entry as a plain list.
list(counter_obj)

[(('s', 'NNS'), 7250),
 (('e', 'NN'), 3208),
 (('t', 'NN'), 2652),
 (('d', 'VBD'), 2235),
 (('es', 'NNS'), 2203),
 (('g', 'VBG'), 2016),
 (('ng', 'VBG'), 2016),
 (('ing', 'VBG'), 2016),
 (('d', 'VBN'), 2004),
 (('ed', 'VBN'), 1912),
 (('n', 'NN'), 1684),
 (('y', 'NN'), 1655),
 (('r', 'NN'), 1654),
 (('ed', 'VBD'), 1404),
 (('s', 'VBZ'), 1340),
 (('rs', 'NNS'), 1336),
 (('n', 'NNP'), 1321),
 (('e', 'JJ'), 1241),
 (('on', 'NN'), 1238),
 (('l', 'JJ'), 1212),
 (('ts', 'NNS'), 1209),
 (('y', 'RB'), 1194),
 (('ion', 'NN'), 1163),
 (('e', 'VB'), 1158),
 (('e', 'NNP'), 1112),
 (('ly', 'RB'), 1091),
 (('al', 'JJ'), 1055),
 (('t', 'IN'), 1037),
 (('er', 'NN'), 1031),
 (('t', 'JJ'), 1021),
 (('nt', 'NN'), 969),
 (('tion', 'NN'), 948),
 (('r', 'NNP'), 918),
 (('g', 'NN'), 889),
 (('s', 'NNP'), 859),
 (('ers', 'NNS'), 857),
 (('ng', 'NN'), 856),
 (('.', 'NNP'), 851),
 (('ing', 'NN'), 843),
 (('n', 'CD'), 830),
 (('y', 'NNP'), 826),
 (('lion', 'CD'), 812),
 (('on', 'CD'), 812),
 (('ion', 'CD'), 812)

Top 100 is enough.

## Not Rare words

In [17]:
# Keep the first element of every 'wordtag-f100' tuple so occurrences can be
# counted per word. (A stray bare `fr` token that raised NameError on a fresh
# kernel was removed from this cell.)
all_train_words = [w[0] for w in preprocessor.pdict['wordtag-f100']]

In [19]:
# The 100 most frequent training words with their occurrence counts.
collections.Counter(all_train_words).most_common(100)

[(',', 6044),
 ('the', 5324),
 ('.', 4914),
 ('of', 2926),
 ('to', 2809),
 ('a', 2415),
 ('and', 2058),
 ('in', 1994),
 ("'s", 1250),
 ('that', 1130),
 ('for', 1022),
 ('is', 981),
 ('The', 882),
 ('``', 838),
 ("''", 813),
 ('$', 773),
 ('said', 772),
 ('on', 728),
 ('it', 644),
 ('%', 624),
 ('Mr.', 601),
 ('by', 576),
 ('with', 567),
 ('as', 566),
 ('from', 544),
 ('at', 533),
 ('million', 516),
 ('be', 491),
 ('are', 476),
 ('its', 460),
 ('will', 459),
 ('has', 459),
 ('was', 438),
 ('an', 421),
 ("n't", 412),
 ('have', 405),
 ('company', 370),
 ('year', 315),
 ('he', 307),
 ('which', 306),
 ('would', 301),
 ('about', 286),
 ('billion', 272),
 ('this', 270),
 ('says', 269),
 ('or', 265),
 ('--', 256),
 ('had', 248),
 ('market', 240),
 ('they', 228),
 ('their', 226),
 ('In', 226),
 ('up', 225),
 ('more', 223),
 ('been', 216),
 ('But', 212),
 ('but', 197),
 ('U.S.', 197),
 ('one', 196),
 ('were', 191),
 ('new', 190),
 ('than', 190),
 ('some', 177),
 ('also', 176),
 ('shares', 176),
