In [1]:
from itertools import chain
import nltk
from sklearn.metrics import accuracy_score
import pycrfsuite
import random

def load_data(files):
    data, sent = [], []
    for file in files:
        with open(file, 'r') as rf:
            for line in rf:
                if line.strip() != '':
                    # Note: the shared corpus is already tokenized
                    sent.append(line.strip().split('\t'))
                else:
                    if len(sent) > 0:
                        data.append(sent)
                        sent = []
    return data

sents = load_data(['data/FB_HI_EN_CR.txt', 'data/TWT_HI_EN_CR.txt', 'data/WA_HI_EN_CR.txt',
                   'data/FB_BN_EN_CR.txt', 'data/TWT_BN_EN_CR.txt', 'data/WA_BN_EN_CR.txt',
                   'data/FB_TE_EN_CR.txt', 'data/TWT_TE_EN_CR.txt', 'data/WA_TE_EN_CR.txt'])

random.shuffle(sents)
train_sents = sents[:int(0.8*len(sents))]
valid_sents = sents[int(0.8*len(sents)):]
print("# Train sentences: %d" % (len(train_sents)))
print("# Validation sentences: %d" % (len(valid_sents)))
# sents[78]

# Train sentences: 4186
# Validation sentences: 1047


In [2]:
def word2features(sent, k):
    word = sent[k][0]
    features = [
        'token=%s' % (word)
    ]
    # extracting n-grams, for n=1 to 5
    for i in range(1,6):
        # if the value of n is greater than the word length, we exit the loop
        if i > len(word):
            break
        character_features = [word[j:j+i] for j in range(len(word)-i+1)]
        features.extend([
            # is count of individual n-grams important? is the order important?
            "char-%d-gram=%s" % (i, ' '.join(list(set(character_features))))
        ])
    if k == 0:
        # first word in the sentence
        features.append('BOS')
    else:
        features.extend([
            "-1:word=%s" % (sent[k-1][0])
        ])
    if i == len(sent):
        # last word in the sentence         
        features.append('EOS')
 
    return features
        
def sent2features(sent):
    # generating features for all the words/tokens in a sentence `sent`    
    return [word2features(sent, i) for i in range(len(sent))]

def sent2langs(sent):
    return [language_label for token, language_label, pos_tag in sent]

def sent2pos(sent):
    return [pos_tag for token, language_label, pos_tag in sent]

def sent2tokens(sent):
    return [token for token, language_label, pos_tag in sent]



X_train = [sent2features(sent) for sent in train_sents]
y_train = [sent2pos(sent) for sent in train_sents]

X_test = [sent2features(sent) for sent in valid_sents]
y_test = [sent2pos(sent) for sent in valid_sents]

# X_train[32]


In [3]:
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
    
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [4]:
trainer.train('model.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 144706
Seconds required: 0.746

L-BFGS optimization
c1: 1.000000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 185821.093255
Feature norm: 1.000000
Error norm: 12567.871815
Active features: 46168
Line search trials: 1
Line search step: 0.000077
Seconds required for this iteration: 0.162

***** Iteration #2 *****
Loss: 160024.062507
Feature norm: 5.670922
Error norm: 35484.717075
Active features: 46107
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 0.317

***** Iteration #3 *****
Loss: 146666.166804
Feature norm: 4.879248
Error norm: 8116.782324
Active features: 46093
Line search trials: 1
Line search step: 1.000000
Seconds required f

***** Iteration #44 *****
Loss: 59615.464370
Feature norm: 132.994538
Error norm: 161.189684
Active features: 28726
Line search trials: 2
Line search step: 0.500000
Seconds required for this iteration: 0.158

***** Iteration #45 *****
Loss: 59582.772898
Feature norm: 133.565337
Error norm: 123.500848
Active features: 28605
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.082

***** Iteration #46 *****
Loss: 59560.669614
Feature norm: 134.126295
Error norm: 203.259001
Active features: 28480
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.081

***** Iteration #47 *****
Loss: 59538.290600
Feature norm: 134.529754
Error norm: 111.515859
Active features: 28334
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.082

***** Iteration #48 *****
Loss: 59517.969410
Feature norm: 134.950147
Error norm: 88.995441
Active features: 28217
Line search trials: 1
Line search step: 1.000000


In [5]:
tagger = pycrfsuite.Tagger()
tagger.open('model.crfsuite')

<contextlib.closing at 0x10aeaf9e8>

In [6]:
example_sent = valid_sents[56]
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2pos(example_sent)))

" Helo page likers :P .. its been a long time since we interacted wth yu guys !! so from the past 1 week we hve been very active in page , endhukante page dull avthundani page likers chaala mandhi msg chesthunnaru so ika dandayatra modhalu pettinam :P

Predicted: G_X G_N G_N G_N E G_X G_N G_V DT G_J G_N PSP G_PRP G_N G_N G_N G_N G_X G_R PSP DT CC $ G_N G_PRP G_V G_V G_R G_J PSP G_N G_X G_N G_N G_N G_N G_N G_N G_X G_N G_N G_N G_R G_J G_N G_N G_N E
Correct:   G_X G_PRT G_N G_N E G_X G_N G_V DT G_J G_N PSP G_PRP G_V G_PRT G_PRP G_N G_X G_R PSP DT G_J $ G_N G_PRP G_V G_V G_R G_J PSP G_N G_X CC G_N G_J G_V G_N G_N G_SYM G_N G_N G_V PSP G_X G_X G_N G_V E


In [7]:
y_true = sent2pos(example_sent)
y_pred = tagger.tag(sent2features(example_sent))
acc = accuracy_score(y_true,y_pred)
acc

0.7083333333333334