In [1]:
import pycrfsuite
import spacy
import re
import sys

# Shared spaCy pipeline, loaded once when this cell/module runs; get_data()
# calls it to tokenize and POS-tag every tagged segment.
# NOTE(review): "en" is a shortcut name -- newer spaCy releases (v3+) appear to
# require a full package name such as "en_core_web_sm"; confirm against the
# installed spaCy version.
nlp = spacy.load("en")



def get_data(file):
    """
    Parse a chunk-tagged corpus file into per-line token sequences.

    Every line of the file is one document made of segments shaped like
    "<tag>w w w w.</tag>". The line is split on the closing tags, the opening
    tag of each segment becomes the label, and the remaining text is
    tokenized and POS-tagged with the module-level spaCy pipeline `nlp`.

    input:
        file -> string: path of the UTF-8 encoded text file to read
    output:
        data -> list(list(tuple)): one list per input line holding
        (word, pos, label) tuples; label is the opening tag text including
        its angle brackets (e.g. "<tag>"). Segments that do not start with
        an opening tag are skipped.
    """
    closing_tag = re.compile(r'</\w+>')
    opening_tag = re.compile(r'<\w+>')

    data = []
    with open(file, "r", encoding='utf8') as f:
        for course in f:
            c_data = []
            for sentence in closing_tag.split(course):
                # Drop leading whitespace *before* matching so the match
                # offsets refer to the same string that gets sliced below;
                # otherwise part of the tag leaks into the text.
                sentence = sentence.lstrip()
                match = opening_tag.match(sentence)
                if match is None:
                    # Blank remainder or untagged text: nothing to label.
                    continue
                chunk_name = match.group()
                for token in nlp(sentence[match.end():]):
                    # Stand-alone '>' tokens are skipped so stray tag
                    # brackets never end up as training tokens.
                    if token.text != '>':
                        c_data.append((token.text, token.tag_, chunk_name))
            data.append(c_data)
    return data


def word2features(doc, i):
    """
    Build the feature strings describing the token at position i.

    input:
        doc -> list(tuple): (word, pos, label) entries of one sequence
        i -> int: index of the token to describe
    output:
        features -> list(string): features of the token itself followed by
        features of up to two neighbours on each side; 'BOS' is added when
        there is no previous token and 'EOS' when there is no next token
    """
    token, tag = doc[i][0], doc[i][1]
    last = len(doc) - 1

    # Features of the current token.
    features = [
        'word.lower=' + token.lower(),
        'word.isupper=%s' % token.isupper(),
        'word.istitle=%s' % token.istitle(),
        'word.isdigit=%s' % token.isdigit(),
        'word.isdot=%s' % isdot(token),
        'word.length=' + str(len(token)),
        'postag=%s' % tag,
        'postag[:2]=%s' % tag[:2]
    ]

    # Immediate left neighbour, or the beginning-of-sequence marker.
    if i <= 0:
        features.append('BOS')
    else:
        left, left_tag = doc[i - 1][0], doc[i - 1][1]
        features += [
            '-1:word.lower=' + left.lower(),
            '-1:word.isupper=%s' % left.isupper(),
            '-1:word.istitle=%s' % left.istitle(),
            '-1:word.isdigit=%s' % left.isdigit(),
            '-1:word.isdot=%s' % isdot(left),
            '-1:word.length=' + str(len(left)),
            '-1:postag=%s' % left_tag,
            '-1:postag[:2]=%s' % left_tag[:2],
            'last|word=%s|%s' % (left, token)
        ]

    # Token two positions to the left.
    if i > 1:
        left2, left2_tag = doc[i - 2][0], doc[i - 2][1]
        features += [
            '-2:word.lower=' + left2.lower(),
            '-2:word.length=' + str(len(left2)),
            '-2:word.isupper=%s' % left2.isupper(),
            '-2:word.istitle=%s' % left2.istitle(),
            '-2:word.isdigit=%s' % left2.isdigit(),
            '-2:word.isdot=%s' % isdot(left2),
            '-2:postag=%s' % left2_tag,
            '-2:postag[:2]=%s' % left2_tag[:2]
        ]

    # Immediate right neighbour, or the end-of-sequence marker.
    if i < last:
        right, right_tag = doc[i + 1][0], doc[i + 1][1]
        features += [
            '+1:word.lower=' + right.lower(),
            '+1:word.isupper=%s' % right.isupper(),
            '+1:word.istitle=%s' % right.istitle(),
            '+1:word.isdigit=%s' % right.isdigit(),
            '+1:word.isdot=%s' % isdot(right),
            '+1:postag=%s' % right_tag,
            '+1:word.length=' + str(len(right)),
            '+1:postag[:2]=%s' % right_tag[:2],
            'word|next=%s|%s' % (token, right)
        ]
    else:
        features.append('EOS')

    # Token two positions to the right.
    if i < last - 1:
        right2, right2_tag = doc[i + 2][0], doc[i + 2][1]
        features += [
            '+2:word.lower=' + right2.lower(),
            '+2:word.length=' + str(len(right2)),
            '+2:word.isupper=%s' % right2.isupper(),
            '+2:word.istitle=%s' % right2.istitle(),
            '+2:word.isdigit=%s' % right2.isdigit(),
            '+2:word.isdot=%s' % isdot(right2),
            '+2:postag=%s' % right2_tag,
            '+2:postag[:2]=%s' % right2_tag[:2]
        ]

    return features

def isdot(word):
    """
    input: word -> string
    output: bool, True when word contains a '.' anywhere
    """
    return '.' in word

def get_features(doc):
    """
    input:
        doc -> list(tuple): (word, pos, label) entries of one sequence
    output:
        feature list: one word2features() result per position of doc,
        in order
    """
    per_token = []
    for position, _ in enumerate(doc):
        per_token.append(word2features(doc, position))
    return per_token

def get_labels(doc):
    """
    input:
        doc -> list(tuple): (word, pos, label) entries of one sequence
    output:
        list of the label of every entry, in order
    """
    labels = []
    for _word, _pos, tag_label in doc:
        labels.append(tag_label)
    return labels


# Parse the tagged training corpus; each entry of train_data is one line of
# the file as a list of (word, pos, label) tuples.
train_data = get_data('train-calpoly_tag.txt')
# test_data = get_data('test-calpoly-tag-s.txt')

# Per-sequence feature lists and the matching label lists.
X_train = list(map(get_features, train_data))
y_train = list(map(get_labels, train_data))

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# X_test = [get_features(course_doc) for course_doc in test_data]
# Y_test = [get_labels(course_doc) for course_doc in test_data]

# Feed every (features, labels) sequence pair to the CRF trainer, configure
# it, then train and write the model to 'calpoly.model'.
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({'c1': 0.1, 'c2': 0.01, 'max_iterations': 10})
trainer.train('calpoly.model')










In [None]:
# Load a trained CRF model and tag every test sequence.
# NOTE(review): this opens 'ucla.model' while the training code in this file
# writes 'calpoly.model' -- confirm which model is meant to be evaluated.
# NOTE(review): X_test, Y_test and data_test are not assigned anywhere in the
# visible code (only in commented-out lines); they must be defined before
# this cell is run.
tagger = pycrfsuite.Tagger()
tagger.open('ucla.model')
predict = [tagger.tag(xseq) for xseq in X_test]

# Y_test = [train.extract_label(course) for course in data_test]

# Boundary-level scoring: a position i is a positive ('P') when its label
# differs from the label at i-1, and a negative ('N') otherwise.
tp = 0
tn = 0
fp = 0
fn = 0
for j in range(len(Y_test)):
    for i in range(1, len(Y_test[j])):
        if Y_test[j][i] == Y_test[j][i-1]:
            test_result = 'N'
        else:
            test_result = 'P'
        if predict[j][i] != predict[j][i-1]:  # predicted boundary
            if test_result == 'P':
                tp += 1
            else:
                fp += 1
        else:  # no boundary predicted
            if test_result == 'N':
                # An agreed non-boundary directly after a '.' token is
                # counted as a true positive rather than a true negative.
                if data_test[j][i-1][0] == '.':
                    tp += 1
                else:
                    tn += 1
            else:
                fn += 1
print(tp, fp+fn)


# Guard every ratio so an evaluation with no predicted or no actual
# positives reports 0.0 instead of raising ZeroDivisionError.
precision = float(tp)/(tp+fp) if (tp+fp) else 0.0
recall = float(tp)/(tp+fn) if (tp+fn) else 0.0
f = 2*precision*recall / (precision + recall) if (precision + recall) else 0.0
print('Precision = %f\nRecall = %f\nF1 score = %f' %(precision, recall, f))