In [2]:
features = {}
current_word = "test"
features['has_(%s)' % current_word] = 1
print(features)

{'has_(test)': 1}


In [49]:
from nltk.classify.maxent import MaxentClassifier
from sklearn.metrics import (accuracy_score, fbeta_score, precision_score,
                             recall_score)
import os
import pickle
import re
from nltk.tokenize import word_tokenize


class MEMM():
    def __init__(self):
        self.train_path = "../data/train"
        self.dev_path = "../data/dev"
        self.beta = 0
        self.max_iter = 0
        self.classifier = None

    def features(self, words, previous_label, position):
        """
        Note: The previous label of current word is the only visible label.

        :param words: a list of the words in the entire corpus
        :param previous_label: the label for position-1 (or O if it's the start
                of a new sentence)
        :param position: the word you are adding features for
        """

        features = {}
        """ Baseline Features """
        current_word = words[position]
        features['has_(%s)' % current_word] = 1
        features['prev_label'] = previous_label
        if current_word[0].isupper():
            features['Titlecase'] = 1

        #===== TODO: Add your features here =======#
        #  ALLCAP[0:-1].isupper()
        cap_flag = True
        for i in range(len(current_word)):
            if current_word[i].islower():
                cap_flag = False
                break
            else:
                continue
        if cap_flag==True:
            features['ALLCAP'] = 1
        
        #  all lowercase
        low_flag = True
        for i in range(len(current_word)):
            if current_word[i].isupper():
                low_flag = False
                break
            else:
                continue
        if low_flag==True:
            features['lowercase'] = 1
            
        #  after symbols
        symbol = ['.',',','"','\'','(',')']
        if words[position-1] in symbol:
            features['symbol'] = 1
        
        #  have number
        if re.match(r'[0-9]', current_word):
            features['number'] = 1
        
        #  prev word is mr. ms. mrs. Mister Mistress Miss President, Minister
        pretitle = ['mr.','ms.','mrs.','mister','mistress','miss','president','minister']
        if words[position-1].lower() in pretitle:
            features['pretitle'] = 1
          
        #=============== TODO: Done ================#
        return features

    def load_data(self, filename):
        words = []
        labels = []
        for line in open(filename, "r", encoding="utf-8"):
            doublet = line.strip().split("\t")
            if len(doublet) < 2:     # remove emtpy lines
                continue
            words.append(doublet[0])
            labels.append(doublet[1])
        return words, labels

    def train(self):
        print('Training classifier...')
        words, labels = self.load_data(self.train_path)
        previous_labels = ["O"] + labels
        features = [self.features(words, previous_labels[i], i)
                    for i in range(len(words))]
        train_samples = [(f, l) for (f, l) in zip(features, labels)]
        classifier = MaxentClassifier.train(
            train_samples, max_iter=self.max_iter)
        self.classifier = classifier

    def test(self):
        print('Testing classifier...')
        words, labels = self.load_data(self.dev_path)
        previous_labels = ["O"] + labels
        features = [self.features(words, previous_labels[i], i)
                    for i in range(len(words))]
        results = [self.classifier.classify(n) for n in features]

        f_score = fbeta_score(labels, results, average='macro', beta=self.beta)
        precision = precision_score(labels, results, average='macro')
        recall = recall_score(labels, results, average='macro')
        accuracy = accuracy_score(labels, results)

        print("%-15s %.4f\n%-15s %.4f\n%-15s %.4f\n%-15s %.4f\n" %
              ("f_score=", f_score, "accuracy=", accuracy, "recall=", recall,
               "precision=", precision))

        return True

    def show_samples(self, bound):
        """Show some sample probability distributions.
        """
        words, labels = self.load_data(self.train_path)
        previous_labels = ["O"] + labels
        features = [self.features(words, previous_labels[i], i)
                    for i in range(len(words))]
        (m, n) = bound
        pdists = self.classifier.prob_classify_many(features[m:n])

        print('  Words          P(PERSON)  P(O)\n' + '-' * 40)
        for (word, label, pdist) in list(zip(words, labels, pdists))[m:n]:
            if label == 'PERSON':
                fmt = '  %-15s *%6.4f   %6.4f'
            else:
                fmt = '  %-15s  %6.4f  *%6.4f'
            print(fmt % (word, pdist.prob('PERSON'), pdist.prob('O')))

    def dump_model(self):
        with open('../model.pkl', 'wb') as f:
            pickle.dump(self.classifier, f)

    def load_model(self):
        with open('../model.pkl', 'rb') as f:
            self.classifier = pickle.load(f)

    def predict_sentence(self, string):
        # split sentence
        words = ['.'] + word_tokenize(string)
        # First word of the input, it will not be predicted as person
        # predict each word in sentence
        prev_label = ['O'] 
        for i in range(len(words)):
            prev_label.append('') #(unknown labels)
        features = [self.features(words, prev_label[i], i)
                    for i in range(len(words))]
        results = [self.classifier.classify(n) for n in features]
        #print(words[1:])
        #print(results[1:])
        result_list = []
        index = 0
        for i in range(1,(len(results))):
            result_list.append(words[i])
            result_list.append(results[i])
        print(result_list)
        return result_list

In [24]:
BETA = 0.5
MAX_ITER = 5
BOUND = (0, 20)
classifier = MEMM()

def train_test():
    classifier.max_iter = MAX_ITER
    classifier.train()
    classifier.dump_model()
def dev_test():
    try:
        classifier.load_model()
        classifier.beta = BETA
        classifier.test()
    except Exception as e:
        print(e)
def show_test():
    try:
        classifier.load_model()
        classifier.show_samples(BOUND)
    except Exception as e:
        print(e)

train_test()

Training classifier...
  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.055
             2          -0.06094        0.946
             3          -0.05451        0.969
             4          -0.04894        0.970
         Final          -0.04437        0.981


In [13]:
train_test()
dev_test()
show_test()

Training classifier...
  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.055
             2          -0.09383        0.946
             3          -0.08134        0.968
             4          -0.07136        0.969
         Final          -0.06330        0.969
Testing classifier...
f_score=        0.8715
accuracy=       0.9641
recall=         0.7143
precision=      0.9642

  Words          P(PERSON)  P(O)
----------------------------------------
  EU               0.0544  *0.9456
  rejects          0.0286  *0.9714
  German           0.0544  *0.9456
  call             0.0286  *0.9714
  to               0.0284  *0.9716
  boycott          0.0286  *0.9714
  British          0.0544  *0.9456
  lamb             0.0286  *0.9714
  .                0.0281  *0.9719
  Peter           *0.4059   0.5941
  Blackburn       *0.5057   0.4943
  BRUSSELS         0.4977  *0.5023
  1996-08-22   

In [50]:
classifier = MEMM()

def predict(string):
    classifier.load_model()
    classifier.predict_sentence(string)
    
string = "Patricia isn't a president artist in, kpop team named BLACKPINK in 2020. Tom is in China."
list_ = predict(string)
print(list_)

None


In [45]:
train_test()
dev_test()
show_test()

Training classifier...
  ==> Training (5 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.055
             2          -0.06095        0.946
             3          -0.05451        0.969
             4          -0.04894        0.969
         Final          -0.04436        0.981
Testing classifier...
f_score=        0.9219
accuracy=       0.9739
recall=         0.7924
precision=      0.9780

  Words          P(PERSON)  P(O)
----------------------------------------
  EU               0.0486  *0.9514
  rejects          0.0008  *0.9992
  German           0.1529  *0.8471
  call             0.0008  *0.9992
  to               0.0008  *0.9992
  boycott          0.0008  *0.9992
  British          0.1547  *0.8453
  lamb             0.0008  *0.9992
  .                0.0002  *0.9998
  Peter           *0.4445   0.5555
  Blackburn       *0.3947   0.6053
  BRUSSELS         0.2494  *0.7506
  1996-08-22   