In [1]:
train_ds_path = 'output/dataset-200k-noisy.train.csv'
test_ds_path = 'output/dataset-200k-noisy.test.csv'
abbreviation_mapping_path = 'dicts/mle_abbreviation_mapping_from_ani_20190925.csv'

In [2]:
import sys
sys.path.append('.')

In [3]:
import os
if os.getcwd().endswith('/notebooks'):
    os.chdir(os.path.join(os.getcwd(), '..'))

In [4]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from collections import Counter, defaultdict, OrderedDict
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.datasets import dump_svmlight_file
from classifyGenericModified import ExperimentalClassifier, normalizeName
from tqdm import tqdm
import numpy as np
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.kernel_approximation import Nystroem
import itertools
import re
import joblib

In [5]:
train_ds = pd.read_csv(train_ds_path)
test_ds = pd.read_csv(test_ds_path)

In [6]:
train_ds.sample(3)

Unnamed: 0,input,id,label,numdocs
16864,Nailsea,,SPE,51.0
71106,New Zealand Institute For Plant And Food Research,,SPE,50.0
47901,Akaki Tsereteli State University,,SPE,100.0


In [35]:
train_ds.label.value_counts()

SPE    75000
GEN    23400
Name: label, dtype: int64

In [36]:
class_weight = {'SPE': 1, 'GEN': 15}

# Word frequency

In [37]:
tokens = Counter(t.lower() for s in train_ds.input for t in s.split())

In [38]:
tokens_ordered = [t for t, _ in tokens.most_common()]

In [39]:
tokens_group1 = set(tokens_ordered[:1000])
tokens_group2 = set(tokens_ordered[1000:10000])
tokens_group3 = set(tokens_ordered[10000:])

In [40]:
tokens_ordered[1000:1010] # more frequent tokens are more likely to be common words

['usda-ars',
 'toronto',
 'biochemical',
 'neurosurgery',
 'rochester',
 'metall.',
 'bordeaux',
 'complejo',
 'wissenschaften',
 'w']

In [41]:
tokens_ordered[10000:10010] # less frequent tokens are more likely to be names

['icon',
 'ripault',
 'rijeci',
 'lessi',
 'biol./environmental',
 'cmw.',
 'sentara',
 'bamboo',
 'rattan',
 'neurocirugía']

# Feature extraction

In [44]:
rule_based_classifier = ExperimentalClassifier('rule', use_multilingual_dicts=True, use_zipcode_us_rule=True)

Loading dictionary...


generating approximate dict: 100%|██████████| 50434/50434 [00:28<00:00, 1777.06it/s]


Loading dictionary done in 29.72 sec.


In [45]:
top_level_types = '''Academy
Branch
Center
Centers
Central Institute
Centre
Centre National
Centro
Centro Nacional
Centro Ricerche
Children's Hospital
Clinic
Clinica
Clinical Division
Clininque
Clinique
Clinique chirurgicale
College
Escola
Escuel
Escuela
Estación
Graduate Institute
Graduate School
Group
Groupe
Grupo
Hôpital
Hospital
Institut
Institute
Institutes
Institution
Institut Municipal
Institut National
Instituto
Instituto Nacional
Istituto
Klinik
Klinik und Poliklinik
Lab
Lab.
Laboratoire
Laboratories
Laboratorio
Laboratório
Laboratório Especial
Laboratory
Ministry
Municipal Institute
National Center
National Centre
National Institute
National Institutes
National Laboratory
Program
Programa
Research Center
Research Centre
Research Institute
Research Unit
Sch.
School
Scuola
Sección
Section
Service
Servicio
Serviço
Servizio
Sezione
Specialist Clinic
University Hospital
Zentrum'''.lower().split()
# top_level_types

In [46]:
top_level_words = itertools.chain(rule_based_classifier.cn, rule_based_classifier.univ,
                                  rule_based_classifier.companyTypes, top_level_types)
top_level_word_map = {
    w: '^top' for w in top_level_words
}

In [47]:
word_lists = [ # later lists have precedence
    ('^L', rule_based_classifier.allLoc),
    ('^sbE', rule_based_classifier.expandedSubjectDict),
    ('^S', rule_based_classifier.subjectDict),
    ('^cS', rule_based_classifier.commonSubjectsDict),
    ('^sb', rule_based_classifier.subjModDict),
    ('^mO', rule_based_classifier.orgModDict),
    ('^cn', rule_based_classifier.cn),
    ('^univ', rule_based_classifier.univ),
    ('^cnS', rule_based_classifier.companySuffixes),
    ('^cnT', rule_based_classifier.companyTypes),
    ('^T', rule_based_classifier.typeDict),
    ('^e', rule_based_classifier.wordEndingsDict),
    ('^sw', rule_based_classifier.sw),
    ('^lf', tokens_group3),
    ('^and', 'and|&|y|und|e|og|i|και|ja|et|és|en|ból|ve'.split('|')),
    ('^of', 'of|de|des|der|di|fur|fr|for|für|voor|in|zu|és'.split('|')),
    ('^in', 'in|a|op|zu'.split('|')),
    ('^lat', 'I|II|III|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII|XIV|XV|XVI|XVII|XVIII|XIX|XX'.split('|')),
]

In [48]:
word2tag = {}
for name, word_list in word_lists:
    # word lists from ExperimentalClassifier are preprocessed already, 
    # strings were lower-cased and there are no duplicates
    for w in tqdm(word_list, position=0, leave=True):
        if w:
            # ideally, each word occurs in only one list
            # in reality, it's commonly that they occur in many lists
            word2tag[w] = name

100%|██████████| 72276/72276 [00:00<00:00, 1316761.72it/s]
100%|██████████| 15072/15072 [00:00<00:00, 1624310.75it/s]
100%|██████████| 50434/50434 [00:00<00:00, 734855.58it/s]
100%|██████████| 236/236 [00:00<00:00, 312553.12it/s]
100%|██████████| 3077/3077 [00:00<00:00, 451506.91it/s]
100%|██████████| 112/112 [00:00<00:00, 183357.55it/s]
100%|██████████| 542/542 [00:00<00:00, 419507.80it/s]
100%|██████████| 36/36 [00:00<00:00, 142448.06it/s]
100%|██████████| 99/99 [00:00<00:00, 172799.04it/s]
100%|██████████| 95/95 [00:00<00:00, 280013.27it/s]
100%|██████████| 569/569 [00:00<00:00, 299630.76it/s]
100%|██████████| 154/154 [00:00<00:00, 156663.31it/s]
100%|██████████| 1911/1911 [00:00<00:00, 758738.64it/s]
100%|██████████| 27550/27550 [00:00<00:00, 1095082.21it/s]
100%|██████████| 14/14 [00:00<00:00, 8534.92it/s]
100%|██████████| 13/13 [00:00<00:00, 26585.06it/s]
100%|██████████| 4/4 [00:00<00:00, 2631.72it/s]
100%|██████████| 20/20 [00:00<00:00, 42733.61it/s]


In [49]:
[(name, len(wl)) for name, wl in word_lists]

[('^L', 72276),
 ('^sbE', 15072),
 ('^S', 50434),
 ('^cS', 236),
 ('^sb', 3077),
 ('^mO', 112),
 ('^cn', 542),
 ('^univ', 36),
 ('^cnS', 99),
 ('^cnT', 95),
 ('^T', 569),
 ('^e', 154),
 ('^sw', 1911),
 ('^lf', 27550),
 ('^and', 14),
 ('^of', 13),
 ('^in', 4),
 ('^lat', 20)]

In [50]:
len(word2tag)

150122

In [51]:
def identity_analyzer(x): 
    ''' This is verbose but a lambda can't be serialized '''
    return x

In [52]:
rule_encoder = OneHotEncoder(handle_unknown='ignore')
freq_number_vectorizer_func = lambda: CountVectorizer(analyzer=identity_analyzer, max_features=20, binary=True)
ngram_pattern_vectorizer = CountVectorizer(analyzer=identity_analyzer, ngram_range=(2,5), min_df=10, binary=True)
joined_pattern_vectorizer = CountVectorizer(analyzer=identity_analyzer, min_df=20, binary=True)
ngram_pattern_vectorizer2 = CountVectorizer(analyzer=identity_analyzer, ngram_range=(2,5), min_df=10, binary=True)
joined_pattern_vectorizer2 = CountVectorizer(analyzer=identity_analyzer, min_df=20, binary=True)
discretizer = KBinsDiscretizer(n_bins=(5, 5, 5, 5, 5), encode='onehot', strategy='quantile')
feature_extractor = make_pipeline(
    TokenTranslator(abbreviation_mapping_path),
    FeatureUnion([
        ('freq-char-len', make_pipeline(CharLenFeature(), freq_number_vectorizer_func())),
        ('freq-word-len', make_pipeline(WordLenFeature(), freq_number_vectorizer_func())),
        ('freq-min-word-freq', make_pipeline(WordFrequencyFeatures(tokens, stats=['min']), freq_number_vectorizer_func())),
        ('freq-max-word-freq', make_pipeline(WordFrequencyFeatures(tokens, stats=['max']), freq_number_vectorizer_func())),
        ('numerical-discretized', make_pipeline(
            FeatureUnion([
                ('char_len', CharLenFeature()),
                ('word_len', WordLenFeature()),
                ('word_freq', WordFrequencyFeatures(tokens)),
            ]),
            discretizer,
        )),
        ('word_list', make_pipeline( # todo: slow to serialize
            # didn't use num_top_freq_words because I don't want to override "^top" tags
            WordListTagger(word2tag, 0, default_tag='^nf'),
            FeatureUnion([
                ('joined', make_pipeline(StringConcat(), joined_pattern_vectorizer)),
                ('n-grams', ngram_pattern_vectorizer)
            ])
        )),
        ('word_list_top_freq', make_pipeline( # todo: slow to serialize
            WordListTagger(word2tag, 0, default_tag='^nf'),
            FeatureUnion([
                ('joined', make_pipeline(StringConcat(), joined_pattern_vectorizer2)),
                ('n-grams', ngram_pattern_vectorizer2)
            ])
        )),
        ('top_level_word', make_pipeline( # check for words such as "university", "institution", "ltd."
            WordListTagger(top_level_word_map, 1000, default_tag='other'),
            CountVectorizer(analyzer=identity_analyzer, ngram_range=(1,3), binary=True)
        )),
        ('rule', make_pipeline( # todo: slow to serialize
            # ignore the last one (default rule) because it just emit 'GEN' for everything
            RuleFeatures(rule_based_classifier.classification_functions[:-1],
                         rule_based_classifier.preprocess), 
            rule_encoder
        ))
    ]),
    TfidfTransformer(norm=None), # so that feature vectors are more meaningful (for debugging)
)
pipeline = make_pipeline(
    feature_extractor,
    Nystroem(kernel='poly', degree=2),
    LinearSVC(class_weight=class_weight)
)

In [53]:
%%time
# _, train_sample = train_test_split(train_ds, test_size=0.01, stratify=train_ds.label)
# train_sample = train_sample.copy()
train_sample = train_ds
pipeline.fit(train_sample.input, y=train_sample.label)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


CPU times: user 2min 41s, sys: 1.29 s, total: 2min 42s
Wall time: 2min 38s


Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('tokentranslator',
                                  <__main__.TokenTranslator object at 0x2fae6a0b8>),
                                 ('featureunion',
                                  FeatureUnion(n_jobs=None,
                                               transformer_list=[('freq-char-len',
                                                                  Pipeline(memory=None,
                                                                           steps=[('charlenfeature',
                                                                                   <__main__.CharLenFeature object at 0x2fae6a0f0>),
                                                                                  ('countvectorizer',
                                                                                   CountVectorizer(analyzer=<fun...
                ('nystroem',
          

In [54]:
# _, test_sample = train_test_split(test_ds, test_size=0.1, stratify=test_ds.label)
# test_sample = test_sample.copy()
test_sample = test_ds

In [55]:
%%time
classifyOrg(rule_based_classifier, pipeline, test_sample)

CPU times: user 2min 27s, sys: 1.45 s, total: 2min 28s
Wall time: 2min 24s


In [56]:
test_sample

Unnamed: 0,input,id,label,numdocs,prediction
0,Royal Signals & Radar Establishment,,SPE,79.0,SPE
1,Hôpital Neuchâtelois,,SPE,87.0,SPE
2,Facultad De Medicina,,SPE,442.0,GEN
3,Fraunhofer-institute For Applied Polymer Research,,SPE,44.0,GEN
4,Inst. Fiziol.,,SPE,101.0,SPE
...,...,...,...,...,...
98396,College of Law,60120362.0,GEN,1.0,GEN
98397,Departments of Human Development and Family St...,60120172.0,GEN,1.0,GEN
98398,National University Of Defense And Technology,,SPE,376.0,SPE
98399,Us Doe Joint Genome Institute,,SPE,36.0,SPE


In [57]:
(test_sample['label'] == test_sample['prediction']).mean()

0.8007337323807685

In [58]:
(test_sample['prediction'] == 'GEN').sum()

41215

In [59]:
len(test_sample)

98401

In [60]:
precision_recall_fscore_support(test_sample.label, test_sample.prediction, labels=['GEN'])

(array([0.5460148]), array([0.9616683]), array([0.69654575]), array([23401]))

## Examining transformations

### Abbreviation resolution

In [167]:
precision_recall_fscore_support(test_sample.label, test_sample.prediction, labels=['GEN'])

(array([0.61084165]), array([0.94718174]), array([0.74270779]), array([23401]))

In [61]:
abbreviation_solver = make_pipeline(
    WordSplitter(),
    TokenTranslator(abbreviation_mapping_path)
)

In [62]:
train_sample['input_abbr_solved'] = abbreviation_solver.transform(train_sample.input)

In [63]:
train_sample[train_sample.input.str.match(r'\b(\w{2,}\.){2,}')][['input', 'input_abbr_solved']].sample(5)

Unnamed: 0,input,input_abbr_solved
69799,Inst.Zool.,institute zoology
42600,Lab.Molec. Genet.,laboratory molecular genetics
25779,Lab.Mikrobiol.,laboratory mikrobiologie
89410,Dept.Pathol.,department pathology
52223,Dept.Neonatol. Thorac. Cardiovasc. Surg.,department neonatology thoracic cardiovascular...


In [64]:
train_sample[train_sample.input.str.match(r'\b\w{3,}\.?(/\w{3,}\.?)+')][['input', 'input_abbr_solved']].sample(5)

Unnamed: 0,input,input_abbr_solved
8097,Cnrs/ups,Cnrs ups
21093,Biophysics/Biochemistry Dept.,Biophysics Biochemistry department
12895,ESAT/SCD,ESAT SCD
70368,Ecology/Evolution Department,Ecology Evolution Department
2576,Niaaa/nih,Niaaa nih


In [65]:
train_sample[train_sample.input.str.match(r'\b\w+\.\s')][['input', 'input_abbr_solved']].sample(50)

Unnamed: 0,input,input_abbr_solved
68650,Depts. Biochem. Molec. Biol.,department biochemistry molecular biology
88356,Dept. Pharmacol.Therapeut.,department pharmacology therapeutics
81976,Lab. Of Cad And Cg,laboratory Of Cad And Cg
44032,Inst. of Soil Sci. and Plant Nutr.,institute of Soil science and Plant nutrition
71319,St. Petersburg State Technical Univ,St. Petersburg State Technical Univ
22168,Depto. de Quim. Organica,department de quimica Organica
48813,Inst. Of Gen. And Inorg. Chemistry,institute Of general And inorganic Chemistry
62658,Lab. Phys. Theor. L'ecl. Normale S.,laboratory physics theoretical L'ecl. Normale S.
45874,Clin. Obstet.,clinical obstetrics
71915,Ctr. For Math. And Computer Science,ctra For mathematics And Computer Science


### Pattern tagging

In [66]:
pattern_tagger = make_pipeline(
    WordSplitter(),
    TokenTranslator(abbreviation_mapping_path),
    WordListTagger(word2tag, 0, default_tag='^nf'),
)
pattern_tagger2 = make_pipeline(
    WordSplitter(),
    TokenTranslator('dicts/mle_abbreviation_mapping_from_ani_20190925.csv'),
    WordListTagger(word2tag, 100, default_tag='^nf'),
)

In [67]:
train_sample['input_tagged'] = pattern_tagger.fit_transform(train_sample.input)
train_sample['input_tagged2'] = pattern_tagger2.fit_transform(train_sample.input)

In [68]:
(train_sample['input_tagged'] != train_sample['input_tagged2']).mean()

0.8429776422764228

In [69]:
train_sample[['input', 'input_tagged', 'input_tagged2']].sample(50)

Unnamed: 0,input,input_tagged,input_tagged2
17079,Wellcome Trust Genome Camp.,"[^nf, ^cnT, ^sb, ^nf]","[^nf, ^cnT, ^sb, ^nf]"
95932,Japanese Foundation For Cancer Research,"[^nf, ^T, ^of, ^sb, ^cnS]","[^nf, foundation, for, cancer, research]"
56454,Mt Sinai Hosp.,"[^S, ^L, ^T]","[^S, ^L, hospital]"
16542,Fakultät Für Architektur,"[^nf, ^of, ^lf]","[^nf, für, ^lf]"
20522,Université De Paris-sud,"[^nf, ^of, ^nf]","[université, de, ^nf]"
59718,Economics Department,"[^S, ^T]","[^S, department]"
35120,Hubei Univ. Of Technology,"[^L, ^univ, ^of, ^cS]","[^L, university, of, technology]"
51746,Department of Social Sciences,"[^T, ^of, ^sb, ^S]","[department, of, ^sb, sciences]"
83995,Amazon.com,[^lf],[^lf]
66234,Tokyo Electron Ltd,"[^L, ^S, ^cnT]","[^L, ^S, ^cnT]"


In [70]:
pattern_tagger2.steps[2][1].top_freq_words

OrderedDict([('of', 31354),
             ('university', 16260),
             ('department', 13923),
             ('and', 11552),
             ('institute', 10333),
             ('hospital', 8545),
             ('research', 6994),
             ('de', 6764),
             ('for', 4481),
             ('center', 4471),
             ('college', 3887),
             ('engineering', 3867),
             ('medical', 3864),
             ('sciences', 3787),
             ('science', 3725),
             ('medicine', 3721),
             ('school', 3568),
             ('technology', 3105),
             ('national', 2703),
             ('health', 2321),
             ('the', 2195),
             ('centre', 1963),
             ('academy', 1880),
             ('state', 1745),
             ('laboratory', 1528),
             ('chemistry', 1159),
             ('inc.', 1146),
             ('biology', 1133),
             ('institut', 1045),
             ('physics', 967),
             ('faculty', 930),
          

In [71]:
top_level_tagger = WordListTagger(top_level_word_map, 0, default_tag='other')

In [72]:
train_sample['input_top_lvl_tagged'] = top_level_tagger.fit_transform(train_sample.input)

In [73]:
train_sample[['input', 'input_top_lvl_tagged']].sample(10)

Unnamed: 0,input,input_top_lvl_tagged
26264,Max-planck-institut Für Tierzucht Und Tierernä...,"[other, ^top, other]"
13609,Institut fur Physikalische Chemie,"[^top, other]"
11839,Council Of Labor Affairs,[other]
66393,Hospital General La Mancha-centro,"[^top, other]"
41908,Departments of Neurological Sciences,[other]
58711,Inst. of Biosciences and Technology,[other]
63551,Depts. Pathol.,[other]
88680,Hospital De Mataro,"[^top, other]"
23173,Pennsylvania Coll. Of Podiatric Med.,[other]
70069,Biology Centre Ascr,"[other, ^top, other]"


## Examining feaures

In [74]:
discretizer.bin_edges_

array([array([  2.,  20.,  26.,  33.,  41., 140.]),
       array([ 1.,  3.,  4.,  5., 19.]), array([0., 1.]),
       array([1.0000e+00, 6.5600e+02, 6.7640e+03, 1.1552e+04, 3.1354e+04]),
       array([3.33333333e-01, 2.16500000e+02, 1.84800000e+03, 3.53600000e+03,
       7.28085714e+03, 1.51640000e+04])], dtype=object)

In [75]:
ngram_pattern_vectorizer.vocabulary_

{'^nf': 12,
 '^mO': 11,
 '^T': 2,
 '^of': 13,
 '^L': 0,
 '^sb': 14,
 '^S': 1,
 '^lf': 10,
 '^and': 3,
 '^cnT': 7,
 '^univ': 17,
 '^cnS': 6,
 '^sw': 16,
 '^cS': 4,
 '^sbE': 15,
 '^in': 9,
 '^e': 8,
 '^cn': 5}

In [76]:
joined_pattern_vectorizer.vocabulary_

{'^T_^of_^sb_^S_^lf': 215,
 '^nf_^lf': 387,
 '^T_^nf': 150,
 '^T_^of_^sb_^and_^sb': 223,
 '^T_^S': 121,
 '^lf': 261,
 '^lf_^cnT': 290,
 '^nf_^S': 340,
 '^nf': 329,
 '^univ_^of_^L_^sb_^T': 526,
 '^univ_^of_^L_^nf': 525,
 '^nf_^S_^lf': 344,
 '^sb_^T_^S': 458,
 '^nf_^univ_^nf': 439,
 '^sb_^and_^sb_^T': 466,
 '^T_^of_^S': 162,
 '^L_^T': 10,
 '^univ_^nf_^sw_^nf_^lf': 517,
 '^S_^nf_^T': 103,
 '^nf_^T_^of_^sb_^S': 372,
 '^L_^T_^of_^cS': 15,
 '^univ_^of_^L': 519,
 '^nf_^lf_^T_^of_^S': 390,
 '^lf_^mO_^T': 291,
 '^S_^sb_^T': 112,
 '^lf_^univ_^of_^L': 317,
 '^L_^nf': 32,
 '^nf_^univ': 434,
 '^lf_^T': 272,
 '^T_^sb': 243,
 '^sb_^S_^T': 452,
 '^nf_^T_^of_^cS': 366,
 '^L_^univ': 57,
 '^L_^nf_^lf_^T': 41,
 '^nf_^sb_^S': 414,
 '^sb_^univ': 483,
 '^S_^T': 76,
 '^S_^T_^lf': 79,
 '^T_^of_^sb_^S': 212,
 '^S_^cS_^T': 93,
 '^nf_^T_^of_^S': 362,
 '^T_^of_^S_^nf': 176,
 '^sb_^nf_^T': 480,
 '^T_^of_^sb': 211,
 '^univ_^T_^of_^S': 507,
 '^L_^nf_^cnT': 40,
 '^sb_^S_^sb_^T': 455,
 '^T_^nf_^of_^nf': 158,
 '^T_^of_^

In [77]:
len(joined_pattern_vectorizer.vocabulary_)

545

# Debugging

In [78]:
gen_orgs = train_ds[train_ds.label == 'GEN'].iloc[:10000].copy()
spe_orgs = train_ds[train_ds.label == 'SPE'].iloc[:10000]

In [79]:
classifyOrg(rule_based_classifier, pipeline, gen_orgs)

In [80]:
gen_orgs_misclassified = gen_orgs[gen_orgs['prediction'] != 'GEN']

In [81]:
len(gen_orgs_misclassified)

376

In [82]:
_ = gen_orgs_misclassified.sample(50).input.apply(print)

Zhejiang Children’s Hospital
UK Cambridge Research Institute (CRI)
Kansas City School of Dentistry Kansas City
Centre for Excellence in Teaching in Higher Education
VIRUTUS
THIS Institute
KievUniversity
Department of Oncology and Leuven Cancer Institute (LKI)
Germany ‡Institute for Medical Statistics and Epidemiology
Depts. Pharmacol. Toxicol. B.
Institutionen för Datavetenskap
School of Civil and Environmental Engineering University of Technology Sydney
Trinity Collage
Russian State University Justice
Aberdeen Business School
Sterrenkunde Leuven
Education Seoul National University Seoul
Physik-Department E17
Australian Institute for Health Innovation
School of Social Welfare, University of California
Kiev Univ.
Tip Fakultesi (Dekanlik Binasi)
Center for State Health Policy
Nautral Products Research Institute
Nephrol.Abt.
Second Hospital Affiliated to Medical College
Economics Department University of Melbourne
Division of Cardiovascular Disease Birmingham
ACCMS
Hals-Nasen-Ohren-Klini

In [86]:
gen_vecs = feature_extractor.transform(gen_orgs_misclassified.input)
spe_vecs = feature_extractor.transform(spe_orgs.input)

In [87]:
muls = gen_vecs.dot(spe_vecs.transpose())

In [88]:
gen_vecs.shape, spe_vecs.shape, muls.shape

((376, 2256), (10000, 2256), (376, 10000))

In [89]:
# sorted_indices = muls.argsort(axis=None) # only work for dense matrix
sorted_items = sorted(muls[:100].todok().items(), key=lambda x: -x[1])

In [90]:
def print_similar_orgs(i):
#     idx = sorted_indices[-i-1] # only work for dense matrix
#     gen_idx, spe_idx = idx // muls.shape[0], idx % muls.shape[0] # only work for dense matrix
    gen_idx, spe_idx = sorted_items[i][0]
    print('Generic:', gen_orgs['input'].iloc[gen_idx])
    print('Specific:', spe_orgs['input'].iloc[spe_idx])
    # print vector elements (not very informative without feature names)
#     for v in zip(gen_vecs[gen_idx].todok().items(), spe_vecs[spe_idx].todok().items()):
#         print(v)

In [91]:
print_similar_orgs(1)

Generic: School of Electrical and Electronics
Specific: National Children's Research Centre


In [92]:
print_similar_orgs(2)

Generic: Faculty for Mechanical Engineering
Specific: National Research Council


In [93]:
print_similar_orgs(3)

Generic: Faculty for Mechanical Engineering
Specific: National Research Council


In [94]:
print_similar_orgs(5)

Generic: School of Electrical and Electronics
Specific: United Aircraft Research Laboratories


In [95]:
print_similar_orgs(6)

Generic: School of Electrical and Electronics
Specific: Electricity Council Research Cent


In [96]:
print_similar_orgs(10)

Generic: Faculty for Mechanical Engineering
Specific: Northwest Research Associates


In [97]:
print_similar_orgs(13)

Generic: Departamento de Inmunología y Reumatología
Specific: Aberdeen Business School


In [98]:
print_similar_orgs(15)

Generic: Abt. Pathol.
Specific: Otago Business School


In [99]:
print_similar_orgs(25)

Generic: Abt. Pathol.
Specific: Trinity Business School


In [100]:
print_similar_orgs(35)

Generic: 3rd Department of Medicine
Specific: The First Affiliated Hospital Of Wenzhou Medical University


In [101]:
print_similar_orgs(100)

Generic: Departments of Biochemistry and Microbiology
Specific: St Barnabas Hospital


# Storing to file in LibSVM-friendly format

In [102]:
%%time
pipeline.fit(train_ds.input, y=train_ds.label)

  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)


Pipeline(memory=None,
         steps=[('pipeline',
                 Pipeline(memory=None,
                          steps=[('tokentranslator',
                                  <__main__.TokenTranslator object at 0x2fae6a0b8>),
                                 ('featureunion',
                                  FeatureUnion(n_jobs=None,
                                               transformer_list=[('freq-char-len',
                                                                  Pipeline(memory=None,
                                                                           steps=[('charlenfeature',
                                                                                   <__main__.CharLenFeature object at 0x2fae6a0f0>),
                                                                                  ('countvectorizer',
                                                                                   CountVectorizer(analyzer=<fun...
                ('nystroem',
          

In [103]:
%%time
label_encoder = LabelEncoder()
features = feature_extractor.transform(train_ds.input)
labels = label_encoder.fit_transform(train_ds.label)

CPU times: user 1min 32s, sys: 853 ms, total: 1min 33s
Wall time: 1min 31s


In [104]:
dump_svmlight_file(features, labels, 'output/dataset-200k-noisy.train.svm')

In [105]:
%%time
test_features = feature_extractor.transform(test_ds.input)
test_labels = label_encoder.transform(test_ds.label)

CPU times: user 1min 34s, sys: 685 ms, total: 1min 35s
Wall time: 1min 32s


In [106]:
dump_svmlight_file(test_features, test_labels, 'output/dataset-200k-noisy.test.svm')

In [107]:
joblib.dump(label_encoder, 'output/label_encoder.pkl')

['output/label_encoder.pkl']

In [108]:
joblib.dump(feature_extractor, 'output/feature_extractor.pkl')

['output/feature_extractor.pkl']

In [None]:
joblib.dump(hybrid_classifier, 'output/hybrid_classifier.pkl')