# Setting up the experiment

In [2]:
from NPLearner import NPLearner
from NPLearner import default_feature_func
from NPLearner import NLTK_Model

import nltk

from nltk.classify import SklearnClassifier
from nltk.classify import MaxentClassifier, \
                        DecisionTreeClassifier, \
                        NaiveBayesClassifier, \
                        WekaClassifier
import numpy as np
import pandas as pd

# Tag -> integer id under the three-tag (O / B-NP / I-NP) labeling scheme.
IOB_LABEL_MAP = {
    "O": 0,
    "B-NP": 1,
    "I-NP": 2,
}
# Tag -> integer id under the two-tag (O / I-NP) labeling scheme.
IO_LABEL_MAP = {
    "O": 0,
    "I-NP": 1,
}

# First argument handed to NPLearner -- presumably the treebank data directory (TODO confirm).
PTB = "treebank/"
# Arbitrary fixed seed so that the training and testing split is consistent across all experiments.
RANDOM_SEED = 42

In [3]:
## traditional Machine Learning approaches

def _make_default_models():
    """Build a fresh [MaxEnt, DecisionTree, NaiveBayes] list of NLTK_Model wrappers.

    Every call creates new NLTK_Model instances (and a new optional_args dict),
    so the IO and IOB experiments never share a model object.
    """
    # MaxEnt gets explicit max_iter / min_lldelta settings; the other two
    # models are built without optional arguments.
    maxent = NLTK_Model(
        MaxentClassifier,
        "MaximumEntropyClass_default",
        optional_args={"max_iter": 10000, "min_lldelta": 1e-6},
    )
    dec_tree = NLTK_Model(DecisionTreeClassifier, "DecisionTreeClassifier_default")
    naive_bayes = NLTK_Model(NaiveBayesClassifier, "NaiveBayesClassifier_default")
    return [maxent, dec_tree, naive_bayes]


## models for the IO labeling experiment
io_mods = _make_default_models()
MaxEnt1, DecTree1, NB1 = io_mods

## models for the IOB labeling experiment
iob_mods = _make_default_models()
MaxEnt2, DecTree2, NB2 = iob_mods

# Training + predicting + evaluating the models

In [4]:
# Train, predict and evaluate every model in iob_mods.
# The MaxEnt model is configured with max_iter=10000 (see the model setup cell).
# No label_map / NP_tagging_type is passed, so NPLearner's defaults apply
# (presumably the IOB scheme -- TODO confirm against NPLearner).
IOB_experiment = NPLearner(
    PTB,
    iob_mods,
    default_feature_func,
    verbose=True,
    random_state=RANDOM_SEED,
)

IOB_experiment.fit()
IOB_experiment.predict()

# Indexed later by "Model type" and "Accuracy score" for each model.
IOB_metrics_default = IOB_experiment.evaluate()

---------- TRAINING ----------
Training MaximumEntropyClass_default...
  ==> Training (10000 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.561
             2          -0.38307        0.835
             3          -0.30433        0.871
             4          -0.26866        0.890
             5          -0.24637        0.901
             6          -0.23030        0.909
             7          -0.21778        0.916
             8          -0.20756        0.920
             9          -0.19896        0.925
            10          -0.19155        0.929
            11          -0.18508        0.931
            12          -0.17935        0.934
            13          -0.17422        0.936
            14          -0.16959        0.939
            15          -0.16539        0.941
            16          -0.16155        0.942
            17          -0.15803        0.944
            18      

In [35]:
## experiment with IO labeling instead of IOB -- all accuracies are expected to be higher

# io_mods is the model list built for IO labeling; it holds its own model
# instances, separate from iob_mods, so the two experiments never train the
# same object.
IO_experiment = NPLearner(PTB, io_mods, default_feature_func, label_map=IO_LABEL_MAP,
                        NP_tagging_type="IO", verbose=True, random_state=RANDOM_SEED)

IO_experiment.fit()
IO_experiment.predict()

# Indexed later by "Model type" and "Accuracy score" for each model.
IO_metrics_default = IO_experiment.evaluate()



---------- TRAINING ----------
Training MaximumEntropyClass_default...
  ==> Training (10000 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.709
             2          -0.22021        0.894
             3          -0.17594        0.918
             4          -0.15610        0.927
             5          -0.14376        0.933
             6          -0.13493        0.938
             7          -0.12810        0.942
             8          -0.12256        0.945
             9          -0.11790        0.948
            10          -0.11390        0.950
            11          -0.11041        0.952
            12          -0.10732        0.954
            13          -0.10455        0.956
            14          -0.10204        0.957
            15          -0.09977        0.958
            16          -0.09769        0.959
            17          -0.09578        0.960
            18      

In [5]:
## Displaying tabulated metrics_default for both labeling schemes

def _print_accuracies(metrics):
    """Print one "[ModelName]:[Accuracy]" line per entry of an evaluate() result."""
    for entry in metrics:
        print("{}:{}".format(entry["Model type"], entry["Accuracy score"]))


print("For IOB labeling:")
_print_accuracies(IOB_metrics_default)

print("For IO labeling:")
_print_accuracies(IO_metrics_default)

    

For IOB labeling:
MaximumEntropyClass_default:0.8520486793856619
DecisionTreeClassifier_default:0.809746505819206
NaiveBayesClassifier_default:0.820694053249721
For IO labeling:


NameError: name 'IO_metrics_default' is not defined

# Saving the models

In [40]:
# Persist the trained models of both experiments through each model's own
# save() method. The string argument is handed to save() as-is; presumably it
# names the destination subdirectory (TODO confirm against NLTK_Model.save).
IO_models = IO_experiment.getModels()
IOB_models = IOB_experiment.getModels()

# IO models first, then IOB models -- same order as the experiments above.
for model_group, target_dir in (
    (IO_models, "IO_labeling_models/"),
    (IOB_models, "IOB_labeling_models/"),
):
    for mod in model_group:
        mod.save(target_dir)
    


NameError: name 'os' is not defined

In [7]:
# Saving the models manually -- this part should be removed if
# save function is correctly implemented.

import os
# Prefer the C-accelerated cPickle where it exists (Python 2) and fall back to
# the standard pickle module otherwise. Only ImportError is caught so that
# unrelated failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle


def save(model, subdir = ""):
    """
    Pickle `model` to models/<subdir>/nltk<model_name>.pkl.

    The destination directory is created if it does not exist yet.

    Inputs:
        model(NLTK_Model): model being saved; its `model_name` attribute is
            used to build the file name
        subdir(str, optional): subdirectory (under "models/") for model
    """

    file_name = "nltk" + model.model_name + ".pkl"
    target_dir = os.path.join("models/", subdir)
    file_path = os.path.join(target_dir, file_name)

    # On a fresh checkout the output folders do not exist; create them on
    # demand instead of failing inside open().
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # The context manager closes (and flushes) the file even if pickling fails.
    with open(file_path, "wb") as out_file:
        pickle.dump(model, out_file)
    
    
# Only the IOB models are written out here. Saving the IO models is left
# disabled; the disabled code is kept for reference:
#   IO_models = IO_experiment.getModels()
#   for mod in IO_models:
#       save(mod, subdir="IO_labeling_models/")

# Pickle every model of the IOB experiment under models/IOB_labeling_models/.
IOB_models = IOB_experiment.getModels()
for mod in IOB_models:
    save(model=mod, subdir="IOB_labeling_models/")
