In [2]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import spacy
from spacy.tokens import Doc
from spacy.pipeline import Tagger
import csv
import pickle

In [16]:
# Load the spaCy pipeline "en_core_web_md". The resulting `nlp` object is
# called on raw sentences further down to obtain each token's text, POS tag,
# dependency label and head for the test features.
nlp = spacy.load("en_core_web_md")

In [None]:
# #Read data:
# data =[]
# sents = []
# file1 = open("./data/file42.txt.anns","r", encoding="utf8")
# file2 = open("./data/file43.txt.anns","r", encoding="utf8")
# file3 = open("./data/file44.txt.anns","r", encoding="utf8")
# file4 = open("./data/file45.txt.anns","r", encoding="utf8")

# for line in file1.readlines():
#     data.append(line)
# for line in file2.readlines():
#     data.append(line)
# for line in file3.readlines():
#     data.append(line)
# for line in file4.readlines():
#     data.append(line)

# c_sent = []
# for line in data:
#     if line == "\n":
#         if len(c_sent) > 0:
#             sents.append(c_sent)
#             c_sent = []
#     else:
#         c_sent.append((line.split(" ")[0],line.split(" ")[1].replace("\n","")))

In [None]:
# #Get POS and DEP
# final_res = []
# for sent in sents:
#     doc = Doc(nlp.vocab, words=[word[0] for word in sent])
#     deps = nlp.parser(doc)
#     tagger = nlp.tagger(doc)
#     for i,word in enumerate(deps):
#         final_res.append([word.text, tagger[i].pos_, word.dep_, word.head, sent[i][1]])
#     final_res.append([0,0,0,0,0])

In [None]:
# #Save file
# import csv
# with open("./data/data_POS_DEP.csv","w", encoding="utf8") as outputf:
#     writer = csv.writer(outputf)
#     for line in final_res:
#         writer.writerow(line)

In [3]:
#Read file
def _read_rows(path):
    """Return every non-empty CSV row of `path` as a list of strings.

    The file is opened with newline="" as the csv module requires. Blank
    physical lines (which csv.reader yields as []) are skipped, because the
    sentinel test below indexes column 1 and would otherwise raise IndexError.
    """
    with open(path, "r", encoding="utf8", newline="") as inputf:
        return [row for row in csv.reader(inputf) if row]


def _group_sentences(rows):
    """Split a flat list of token rows into a list of sentences.

    A row whose second column is the string "0" is a sentence separator and is
    not kept. Consecutive separators do not produce empty sentences, and token
    rows after the last separator are flushed as a final sentence instead of
    being dropped.
    """
    sentences = []
    current = []
    for row in rows:
        if row[1] == "0":
            if current:
                sentences.append(current)
                current = []
        else:
            current.append(row)
    if current:
        # The input did not end with a separator row: keep the last sentence.
        sentences.append(current)
    return sentences


# Token rows of both annotated files, concatenated in this order.
raw_data = []
for path in ("./data/data_POS_DEP.csv", "./data/gen_data_POS_TAG.csv"):
    raw_data.extend(_read_rows(path))

# `data` is a list of sentences; each sentence is a list of token rows.
# Downstream cells read columns 0-3 as features and column 4 as the label.
data = _group_sentences(raw_data)

In [4]:
def word2features(sent,i):
    """Return the CRF feature dict for the token at index `i` of `sent`.

    Each element of `sent` is an indexable row of which the first four entries
    are read: [0] the token text, [1] the value stored under 'postag', [2] the
    value stored under 'dep' and [3] the value stored under 'head'.

    The result always contains the token's own features. It also contains the
    '-1:'-prefixed features of the previous token (or 'BOS' when there is no
    previous token) and the '+1:'-prefixed features of the next token (or
    'EOS' when there is no next token).
    """
    def neighbour_features(offset):
        # Features of the row `offset` positions away from `i`; the key prefix
        # is '-1:' for offset -1 and '+1:' for offset +1.
        row = sent[i + offset]
        text = row[0]
        prefix = '%+d:' % offset
        return {
            prefix + 'word.lower()': text.lower(),
            prefix + 'word.istitle()': text.istitle(),
            prefix + 'word.isupper()': text.isupper(),
            prefix + 'postag': row[1],
            prefix + 'dep': row[2],
            prefix + 'head': row[3],
        }

    current = sent[i]
    token = current[0]

    # Features describing the token itself.
    features = {'bias': 1.0}
    features['word.lower()'] = token.lower()
    features['word.isupper()'] = token.isupper()
    features['word.istitle()'] = token.istitle()
    features['word.isdigit()'] = token.isdigit()
    features['postag'] = current[1]
    features['dep'] = current[2]
    features['head'] = current[3]

    # Left context, or the beginning-of-sentence marker.
    if i > 0:
        features.update(neighbour_features(-1))
    else:
        features['BOS'] = True

    # Right context, or the end-of-sentence marker.
    if i < len(sent)-1:
        features.update(neighbour_features(+1))
    else:
        features['EOS'] = True

    return features

In [5]:
def sent2features(sent):
    """Return one feature dict per token of `sent`, in token order.

    Each dict is produced by `word2features(sent, position)`.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

In [6]:
def sent2labels(sent):
    """Return the label of every token row in `sent`, in token order.

    The label is the fifth entry (index 4) of each row.
    """
    labels = []
    for row in sent:
        labels.append(row[4])
    return labels

In [7]:
# Turn every sentence of `data` into its per-token feature dicts (X_train)
# and its per-token label sequence (y_train); both lists stay aligned with
# `data`.
X_train = list(map(sent2features, data))
y_train = list(map(sent2labels, data))

In [13]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.7467260106178021,
    c2=0.012122583759942334,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 11.3 s, sys: 0 ns, total: 11.3 s
Wall time: 11.3 s


In [11]:
%%time
labels = list(crf.classes_)
labels.remove('O')
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 10.6min finished


CPU times: user 1min 28s, sys: 5.54 s, total: 1min 33s
Wall time: 10min 47s


In [12]:
rs.best_params_

{'c1': 0.7467260106178021, 'c2': 0.012122583759942334}

In [14]:
#Save model
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Keep the original import for old installations and fall back to the
# standalone `joblib` package (a scikit-learn dependency) when it is gone.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Persist the `crf` estimator currently bound in the notebook; joblib.dump
# returns the list of file names it wrote, which is shown as the cell output.
filename = './model/crf_gen_model_10ktrain.sav'
joblib.dump(crf, filename)

['./model/crf_gen_model_10ktrain.sav']

In [20]:
test_sents = ["Mobile Applications and Device Management"]

# Run each raw sentence through the spaCy pipeline and keep, per token,
# [text, POS tag, dependency label, head text] - the four columns that
# word2features reads.
data_test = [
    [[token.text, token.pos_, token.dep_, token.head.text]
     for token in nlp(sentence)]
    for sentence in test_sents
]

X_test = list(map(sent2features, data_test))

In [21]:
y_pred = crf.predict(X_test)
# Print every token of the first test sentence next to its predicted tag.
first_tokens = data_test[0]
for position, tag in enumerate(y_pred[0]):
    print(first_tokens[position][0], tag)

Mobile B-SKILL
Applications E-SKILL
and O
Device B-SKILL
Management E-SKILL


In [None]:
# from collections import Counter

# def print_state_features(state_features):
#     for (attr, label), weight in state_features:
#         print("%0.6f %-8s %s" % (weight, label, attr))
# print("Top positive:")
# print_state_features(Counter(crf.state_features_).most_common(30))
# print("\nTop negative:")
# print_state_features(Counter(crf.state_features_).most_common()[-30:])