In [1]:
import pandas
import numpy
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [2]:
from itertools import chain

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV




In [33]:
import pprint
class MyPrettyPrinter(pprint.PrettyPrinter):
	def format(self, _object, context, maxlevels, level):
		if isinstance(_object, unicode):
			return "'%s'" % _object.encode('utf8'), True, False
		elif isinstance(_object, str):
			_object = unicode(_object,'utf8')
			return "'%s'" % _object.encode('utf8'), True, False
		return pprint.PrettyPrinter.format(self, _object, context, maxlevels, level)

In [24]:
def loadData(data):
    sentList = []
    with open(data) as f:
        words = []
        for line in f:
            atts = line.strip().split("\t")
            if len(atts)<=1:
                sentList.append(words)
                words=[]
                continue
            no,word,tag = atts
            words.append((word,tag))
            
    return sentList
            

In [25]:
where="/Users/a1/Source/play_data/nlp-challenge/missions/ner/data/train/train_data"
tagData = loadData(where)

In [27]:
tagData[0]

[('\xeb\xb9\x84\xed\x86\xa0\xeb\xa6\xac\xec\x98\xa4', 'PER_B'),
 ('\xec\x96\x91\xec\x9d\xbc', 'DAT_B'),
 ('\xeb\xa7\x8c\xec\x97\x90', '-'),
 ('\xec\x98\x81\xec\x82\xac\xea\xb4\x80', 'ORG_B'),
 ('\xea\xb0\x90\xed\x98\xb8', 'CVL_B'),
 ('\xec\x9a\xa9\xed\x87\xb4,', '-'),
 ('\xed\x95\xad\xeb\xa3\xa1', '-'),
 ('\xec\x95\x95\xeb\xa0\xa5\xec\x84\xa4', '-'),
 ('\xec\x9d\x98\xec\x8b\xac\xeb\xa7\x8c', '-'),
 ('\xea\xb0\x80\xec\x9c\xa8', '-')]

In [38]:
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [39]:
X_train = [sent2features(s) for s in tagData]
y_train = [sent2labels(s) for s in tagData]

In [43]:
X_test= X_train[80000:]
y_test = y_train[80000:] 

In [41]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [42]:
crf.classes_

['PER_B',
 'DAT_B',
 '-',
 'ORG_B',
 'CVL_B',
 'NUM_B',
 'LOC_B',
 'EVT_B',
 'TRM_B',
 'TRM_I',
 'EVT_I',
 'PER_I',
 'CVL_I',
 'NUM_I',
 'TIM_B',
 'TIM_I',
 'ANM_B',
 'DAT_I',
 'FLD_B',
 'ORG_I',
 'MAT_B',
 'MAT_I',
 'AFW_B',
 'LOC_I',
 'AFW_I',
 'PLT_B',
 'FLD_I',
 'ANM_I',
 'PLT_I']

In [44]:
X_test
y_pred = crf.predict(X_test)


In [46]:
labels = list(crf.classes_)
labels

['PER_B',
 'DAT_B',
 '-',
 'ORG_B',
 'CVL_B',
 'NUM_B',
 'LOC_B',
 'EVT_B',
 'TRM_B',
 'TRM_I',
 'EVT_I',
 'PER_I',
 'CVL_I',
 'NUM_I',
 'TIM_B',
 'TIM_I',
 'ANM_B',
 'DAT_I',
 'FLD_B',
 'ORG_I',
 'MAT_B',
 'MAT_I',
 'AFW_B',
 'LOC_I',
 'AFW_I',
 'PLT_B',
 'FLD_I',
 'ANM_I',
 'PLT_I']

In [47]:
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.9897225089618146

In [48]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

             precision    recall  f1-score   support

          -      0.990     0.997     0.994     81348
      DAT_B      0.989     0.979     0.984      2874
      MAT_B      1.000     0.773     0.872        22
      DAT_I      0.971     0.982     0.977       936
      MAT_I      1.000     1.000     1.000         3
      PER_B      0.994     0.983     0.988      4761
      PER_I      1.000     0.995     0.997       596
      AFW_B      0.983     0.943     0.963       491
      AFW_I      0.969     0.995     0.982       191
      TIM_B      0.979     0.974     0.976       380
      TIM_I      0.979     0.968     0.974        95
      FLD_B      0.991     0.863     0.922       255
      FLD_I      1.000     0.889     0.941         9
      PLT_B      1.000     0.864     0.927        22
      PLT_I      0.000     0.000     0.000         0
      ANM_B      0.977     0.954     0.966       720
      ANM_I      1.000     1.000     1.000         5
      LOC_B      0.995     0.971     0.983   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
