In [5]:
def balance_data():
    filtered_data = []
    for text, tags in train_data:
        article = " ".join(text)
        filtered_article = []
        filtered_tags = []
        sentences = sent_tokenize(article)
        start_index = 0
        end_index = 0
        for sentence in sentences:
            start_index = end_index
            words = sentence.split(" ")
            for word in words:
                if not tags[end_index] == 0:
                    end_index = start_index + len(words)
                    filtered_article += text[start_index: end_index]
                    filtered_tags += tags[start_index: end_index]
                    assert len(tags[start_index: end_index]) == len(text[start_index: end_index])
                    break
                end_index += 1

            assert len(filtered_tags) == len(filtered_article)
        filtered_data.append( [filtered_article, filtered_tags])
    return filtered_data

In [30]:
import train as train
import time
import scipy.sparse
import pycrfsuite as crf
import helper
from nltk.tokenize import sent_tokenize, word_tokenize

reload(helper)
training_file = "../data/tagged_data/whole_text_full_city2/train.tag"
trained_model = "trained_model_crf.p"


helper.load_constants()
#Load data and split into train/dev
all_data, all_identifier = train.load_data(training_file)
train_split = .6
split_index = int(len(all_data)*train_split)

train_data, train_identifier = all_data[:split_index], all_identifier[:split_index]
balanced_train_data  = balance_data() #Balance data to handle skew

test_data, test_identifier = all_data[split_index:], all_identifier[split_index:]


#Feature extraction
trainX, trainY = featureExtract(balanced_train_data)
testX, testY = featureExtract(test_data)


In [26]:
def trainModel():
    ## extract features
    trainer = crf.Trainer(verbose=True)
    
    for xseq, yseq in zip(trainX, trainY):
        trainer.append(xseq, yseq, group = 0)

    for xseq, yseq in zip(testX, testY):
        trainer.append(xseq, yseq, group = 1)
        
    trainer.set_params({
        'c1': 1.0,   # coefficient for L1 penalty
        'c2': 0,  # coefficient for L2 penalty

        # include transitions that are possible, but not observed
        'feature.possible_transitions': True,
        'feature.possible_states': True,
    })
    
    print trainer.params
    trainer.train(trained_model, holdout=1)
    return trainer

In [8]:
def predict():
    tagger = crf.Tagger()
    tagger.open(trained_model)
    
    predictedY  =  []
    confidences =  []
    confidences_beam = []
    
    for xseq in testX:  
        yseq = tagger.tag(xseq)
        predictedY.append([train.tags2int[y] for y in yseq])
        confidences.append([tagger.marginal(yseq[i],i) for i in range(len(yseq))])   
        confidences_beam.append([ [tagger.marginal(tag, i)  for tag in train.int2tags]   for i in range(len(yseq))])
    return predictedY, testY, confidences, confidences_beam, tagger.info()

In [9]:
def predict(article, trained_model):
    tagger = crf.Tagger()
    tagger.open(trained_model)
    
    xseq = articleFeatureExtract(article)
    yseq  =  tagger.tag(xseq)
    tags  = [train.tags2int[y] for y in yseq]
    confidences =  [tagger.marginal(yseq[i],i) for i in range(len(yseq))]
    confidences_beam = [ [tagger.marginal(tag, i)  for tag in train.int2tags]   for i in range(len(yseq))]
     
    return tags, confidences

In [10]:
## Aggregates results to pick best entity for each tag by mode
## Data is expected to be in format of train_data or test_data
## TODO Add confidences
def aggregateMode(raw_data, predictedY, confidences):
    modes = []
    entity_confidences = []
    for data, predictions in zip(raw_data, predictedY):
        mode = {}
        modeList = []
        for tag in train.int2tags:
            mode[tag] = {}
        tokens = data[0]
        for token, prediction in zip(tokens, predictions):
            if not prediction == "TAG":
                if not prediction in mode:
                    mode[prediction] = {}
                if not token in mode[prediction]:
                    mode[prediction][token] = 1
                else:
                    mode[prediction][token] += 1
        for tag in train.int2tags:
            if tag == "TAG":
                continue
            maxValue = 0
            maxKey   = ""
            for key in mode[tag].keys():
                if mode[tag][key] > maxValue :
                    maxValue = mode[tag][key] 
                    maxKey = key
            modeList.append(maxKey)
        modes.append(modeList)        
    assert len(modes) == len(predictedY)
    return modes, entity_confidences


In [12]:
## Aggregates results to pick best entity for each tag by max cumulative confidence
def aggregateMaxConf(raw_data, predictedY, confidences):
    entities = []
    entity_confidences = []
    for data, predictions, confs in zip(raw_data, predictedY, confidences):
        max_conf = {}
        max_conf_list = []
        for tag in train.int2tags:
            max_conf[tag]= (0, "")
        tokens = data[0]
        for token, prediction, confidence  in zip(tokens, predictions, confs):
            if not prediction == "TAG":
                if confidence > max_conf[prediction][0]:
                    max_conf[prediction] = (confidence, token)
        for tag in train.int2tags:
            if tag == "TAG":
                continue
            max_conf_list.append(max_conf[tag])
        entities.append(max_conf_list[1])
        entity_confidences.append(max_conf_list[0])
    assert len(entities) == len(predictedY)
    return entities, entity_confidences



In [45]:
def featureExtract(data):
    features = []
    labels   = []
    for article, article_labels in data:
        article_features = []
        if '.' in article:
            title = article[:article.index('.')]
        title_features = {}
        for t in title:
#             t = t.lower()
            title_features[t] = 1
        for token_ind in range(len(article)):
            token = article[token_ind]
            context = {}
            prev_n = 4
            next_n = 4
            for i in range(max(0, token_ind - prev_n), min(token_ind + next_n, len(article))):
                context_token = article[i]
                context[context_token] = 1
                context["other"] = helper.getOtherFeatures(context_token)
                context["token"] = context_token
#             token = token.lower()
            token_features = {}
            token_features["context"] = context
            token_features["title"] = title_features
            token_features["token"] = token
            token_features[token]   = 1
            other_features = helper.getOtherFeatures(token)
            token_features["other"] = helper.getOtherFeatures(token)
            article_features.append(token_features)
        features.append(article_features)
        labels.append([train.int2tags[tag] for tag in article_labels])

    return features, labels

trainX, trainY = featureExtract(balanced_train_data)
testX, testY = featureExtract(test_data)


In [46]:
trainModel()

<built-in method params of Trainer object at 0x107f0ba28>
Holdout group: 2

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 1
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 86955
Seconds required: 1.542

L-BFGS optimization
c1: 1.000000
c2: 0.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 11245.461902
Feature norm: 1.000000
Error norm: 8506.244195
Active features: 42401
Line search trials: 1
Line search step: 0.000018
Seconds required for this iteration: 0.136
Performance by label (#match, #model, #ref) (precision, recall, F1):
    woundedNum: (0, 0, 215) (0.0000, 0.0000, 0.0000)
    TAG: (56121, 57356, 56121) (0.9785, 1.0000, 0.9891)
    city: (0, 0, 603) (0.0000, 0.0000, 0.0000)
    shooterName: (0, 0, 291) (0.0000, 0.0000, 0.0000)
    killedNum: (0, 0, 126) (0.00

<pycrfsuite._pycrfsuite.Trainer at 0x107f0ba28>

In [2]:
def articleFeatureExtract(article):
    article_features = []
    for token in article:
        token_features = {}
        token_features["token"] = token
        token_features["other"] = helper.getOtherFeatures(token)
        article_features.append(token_features)
    return article_features
