In [1]:
import pickle
from typing import Tuple, List
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import SGDClassifier
from scribe_classifier.data.canada.NOCdb.models.simple_model import SimpleModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from scribe_classifier.data.canada.NOCdb.readers import TitleSet, TitleRecord
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from scribe_classifier.data.canada.NOCdb.readers import AllCodes
from scribe_classifier.data.canada.NOCdb.models.neural_networks.artificial_neural_net import ANNclassifier
# from keras.layers import Dense, Dropout
# from keras.models import Sequential, load_model
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer
from scribe_classifier.data.canada.NOCdb.models.neural_networks.combined_models import CombinedModels

Using TensorFlow backend.


In [2]:
# Build the lookup {level -> {model name -> pickle path}} for levels 1, 2 and 3.
# Levels 1 and 2 also get an 'ann' entry; level 3 only has 'sgd' and 'bayes'.
mdl_strs = {}
for target_level in (1, 2, 3):
    level_mdl_strs = {
        'sgd': 'source_data/pickles/canada/trained_models/simple.lvl%d.sgdsv.P' % target_level,
        'bayes': 'source_data/pickles/canada/trained_models/simple.lvl%d.bayes.P' % target_level,
    }
    if target_level < 3:
        level_mdl_strs['ann'] = 'nnmodels/ANN/neural_net_level%d.P' % target_level
    mdl_strs[target_level] = level_mdl_strs

In [3]:
# Load every trained model listed in `mdl_strs` into {level -> {model name -> model}}.
# The paths are read from `mdl_strs` (the same mapping handed to CombinedModels)
# rather than being spelled out a second time, so the two cannot drift apart.
# NOTE(review): these loaders presumably unpickle the files; only point them at
# pickles from a trusted source.
mdls = dict()
for target_level in range(1, 4):
    level_paths = mdl_strs[target_level]
    level_mdls = dict()
    level_mdls['sgd'] = SimpleModel.load_from_pickle(level_paths['sgd'], is_path=True)  # type: SimpleModel
    level_mdls['bayes'] = SimpleModel.load_from_pickle(level_paths['bayes'], is_path=True)  # type: SimpleModel
    # An ANN is loaded only for levels that list an 'ann' path (level 3 does not).
    if 'ann' in level_paths:
        # The ANN loader is given the path alone (no is_path flag).
        level_mdls['ann'] = ANNclassifier.load_from_pickle(level_paths['ann'])  # type: ANNclassifier
    mdls[target_level] = level_mdls

Loading data...
Loading data...


In [4]:
# Load the pickled code table and call add_emptyset() on it before it is used
# to enumerate the codes of a level.
ac = AllCodes.load_from_pickle(
    'source_data/pickles/canada/tidy_sets/all_codes.P',
    is_path=True,
)
ac.add_emptyset()

# Unfitted binarizer; it is fitted on one level's codes in the next cell.
lbl_bin = LabelBinarizer()

In [5]:
# Level used by the evaluation cells below.
target_level = 3

# Fit the binarizer on the codes of that level; the fitted estimator is the
# cell's displayed value.
codes_at_level = ac.get_codes_for_level(target_level)
lbl_bin.fit(codes_at_level)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [6]:
# Build the combined model from the code-table pickle and the per-level
# {model name -> path} dicts for levels 1, 2 and 3 (passed positionally, in order).
# NOTE(review): target_level=2 here differs from the notebook-level
# target_level (3) — confirm this is intended.
cmb_mdls = CombinedModels(
    'source_data/pickles/canada/tidy_sets/all_codes.P',
    *(mdl_strs[level] for level in (1, 2, 3)),
    target_level=2
)

Loading data...


In [7]:
# Load the train / test / validation title sets for the current level. Each name
# is bound to the copy returned by copy_and_append_empty_string_class().
train = TitleSet.load_from_pickle(
    'source_data/pickles/canada/test_sets/train.set.lvl%d.P' % target_level,
    is_path=True,
).copy_and_append_empty_string_class()
test = TitleSet.load_from_pickle(
    'source_data/pickles/canada/test_sets/test.set.lvl%d.P' % target_level,
    is_path=True,
).copy_and_append_empty_string_class()
valid = TitleSet.load_from_pickle(
    'source_data/pickles/canada/test_sets/valid.set.lvl%d.P' % target_level,
    is_path=True,
).copy_and_append_empty_string_class()

# Pick this level's trained models; a model that was not loaded for the level
# is represented by None.
level_mdls = mdls[target_level]
trained_simple_sgd = level_mdls.get('sgd')
trained_simple_bayes = level_mdls.get('bayes')
trained_nn_clf = level_mdls.get('ann')

In [8]:
# Title vectors for the train (T_), test (t_) and validation (v_) sets ...
T_titles, t_titles, v_titles = (
    title_set.get_title_vec() for title_set in (train, test, valid)
)
# ... and the matching code vectors at the current level.
T_labels, t_labels, v_labels = (
    title_set.get_code_vec(target_level=target_level)
    for title_set in (train, test, valid)
)


In [9]:
def _test_proba_or_zeros(model):
    """Return `model.predict_proba(t_titles)`, or an all-zero matrix if `model` is None.

    The zero matrix has one row per test title and one column per code at
    `target_level`, so a missing model adds nothing when the matrices are summed.
    """
    if model is None:
        return np.zeros((len(t_titles), len(ac.get_codes_for_level(target_level))))
    return model.predict_proba(t_titles)


t_sgd_proba = _test_proba_or_zeros(trained_simple_sgd)
t_bayes_proba = _test_proba_or_zeros(trained_simple_bayes)
t_nn_proba = _test_proba_or_zeros(trained_nn_clf)

# Number of models that actually produced predictions (used as the divisor
# when averaging).
num_models = sum(
    model is not None
    for model in (trained_simple_sgd, trained_simple_bayes, trained_nn_clf)
)


In [10]:
# Average the per-model probabilities over the models that actually predicted.
# Missing models contributed all-zero matrices, so dividing by `num_models`
# (rather than 3) gives the mean of the available models only.
if num_models == 0:
    # With no models every matrix is zeros, and 0 / 0 would only emit a warning
    # and fill the result with NaNs that later get decoded into labels.
    raise ValueError('no trained models available for level %d' % target_level)
avg_proba = (t_sgd_proba + t_bayes_proba + t_nn_proba) / num_models

In [11]:
# Decode the averaged probabilities back into code labels using the binarizer
# that was fitted on this level's codes.
# NOTE(review): this assumes the probability columns are in the same order as
# lbl_bin.classes_ — confirm against the class ordering used by the models.
avg_preds = lbl_bin.inverse_transform(avg_proba)

In [12]:
# Text report of precision / recall / F1 / support per code, comparing the
# test labels with the averaged-model predictions.
report = metrics.classification_report(t_labels, avg_preds)
print(report)

             precision    recall  f1-score   support

        001       0.94      0.91      0.93        82
        011       0.98      0.96      0.97        57
        012       0.93      0.93      0.93        59
        013       1.00      0.87      0.93        15
        021       0.93      0.97      0.95        40
        031       0.92      0.92      0.92        26
        041       0.98      0.98      0.98        82
        042       0.94      0.89      0.91        54
        043       0.93      0.72      0.81        18
        051       0.95      0.93      0.94        58
        060       1.00      0.88      0.93         8
        062       0.96      0.86      0.91        28
        063       0.71      1.00      0.83        20
        065       0.95      0.95      0.95        19
        071       0.86      0.91      0.88        33
        073       0.93      1.00      0.96        27
        081       0.83      1.00      0.91        10
        082       0.91      0.91      0.91   

In [13]:
# Pass the averaged probabilities to the combined model; the returned array is
# shown as the cell output.
# NOTE(review): the literal 3 equals the notebook's target_level, while cmb_mdls
# was constructed with target_level=2 — confirm what the second argument means.
cmb_mdls.sum_relevant_probas(avg_proba, 3)

array([[  5.69757132e-04,   4.03430575e-04,   1.28636151e-04, ...,
          1.97105072e-02,   1.16477776e-02,   6.44573616e-03],
       [  6.41162100e-04,   8.56447034e-04,   2.35132829e-04, ...,
          1.81672093e-03,   2.97235674e-03,   2.83007324e-02],
       [  2.28395584e-04,   2.39655245e-04,   7.47933227e-05, ...,
          1.00664787e-04,   2.07467958e-01,   3.10722244e-04],
       ..., 
       [  5.24177449e-03,   7.83312786e-03,   2.39668367e-03, ...,
          3.53964604e-02,   1.83501113e-02,   5.04097521e-01],
       [  5.24177449e-03,   7.83312786e-03,   2.39668367e-03, ...,
          3.53964604e-02,   1.83501113e-02,   5.04097521e-01],
       [  5.24177449e-03,   7.83312786e-03,   2.39668367e-03, ...,
          3.53964604e-02,   1.83501113e-02,   5.04097521e-01]], dtype=float32)

In [18]:
# Small 2x4 integer matrix used to try out np.array_split below.
arr = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
])

In [31]:
# Split `arr` along its columns at indices 0, 1 and 2, giving four pieces:
# an empty block, column 0, column 1, and columns 2-3.
np.array_split(arr, [0, 1, 2], axis=1)

[array([], shape=(2, 0), dtype=int64), array([[1],
        [5]]), array([[2],
        [6]]), array([[3, 4],
        [7, 8]])]