In [10]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import issparse
import pandas as pd
import os
from datetime import datetime

import matplotlib.pyplot as plt
# Notebook-wide matplotlib defaults; they apply to every figure created after this cell runs.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (25, 13)  # (width, height) of new figures
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.size'] = 20  # base font size for text in figures
%matplotlib inline

In [11]:
class Featurizer:
    """Build character n-gram feature matrices from a tagged word list.

    Each input line is tab-separated: the first field is the word and the
    tag is whatever follows the last ``/`` of the last field.  Samples are
    collected by :meth:`featurize`, vectorized with ``DictVectorizer`` and
    optionally split into train/test parts by :meth:`create_train_test`.
    """

    class InvalidTag(Exception):
        """Raised when a tag does not start with an upper-case ASCII letter."""
        pass

    # Coarse tag = leading run of upper-case ASCII letters of the full tag.
    tag_re = re.compile(r'^[A-Z]+')

    def __init__(self, last_char, N, data_path, full_tag=False,
                 use_padding=True,
                 use_word_as_feature=False, tag_filter=None,
                 include_smaller_ngrams=False,
                 encoding='utf-8',
                 sample_per_class=100,
                 max_lines=2000000,
                 **kwargs):
        """Store the featurization options.

        Args:
            last_char: keep only this many trailing characters of each word
                (values <= 0 keep the whole word).
            N: n-gram length.
            data_path: path of the tab-separated input file.
            full_tag: if False, tags are truncated to their leading
                upper-case prefix (see ``tag_re``).
            use_padding: pad the word with N-1 spaces on both sides before
                extracting n-grams.
            use_word_as_feature: use the whole word as a single feature
                instead of n-grams.
            tag_filter: iterable of tags to keep; None keeps every tag.
            include_smaller_ngrams: also extract all k-grams for 1 <= k < N.
            encoding: text encoding of the input file.
            sample_per_class: maximum number of samples stored per tag
                (values <= 0 disable the cap).
            max_lines: maximum number of input lines read
                (values <= 0 disable the limit).
            **kwargs: accepted and ignored, so a wider config dict can be
                splatted into the constructor.
        """
        self.last_char = last_char
        self.N = N
        self.full_tag = full_tag
        self.use_padding = use_padding
        self.use_word_as_feature = use_word_as_feature
        self.include_smaller_ngrams = include_smaller_ngrams
        self.data_path = data_path
        self.sample_per_class = sample_per_class
        self.max_lines = max_lines
        self.encoding = encoding
        if tag_filter is not None:
            self.tag_filter = set(tag_filter)
        else:
            self.tag_filter = None
        self.raw_input = []

    def featurize(self):
        """Read the data file, collect samples and build the feature matrices.

        Returns:
            The ``(X_mtx, y_vec)`` pair produced by
            :meth:`create_feature_matrix`.
        """
        X = []
        y = []
        # Start from a clean slate so repeated calls do not duplicate the
        # (word, tag) pairs while X and y are rebuilt from scratch.
        self.raw_input = []
        sample_cnt = defaultdict(int)
        limit = self.sample_per_class
        with open(self.data_path, encoding=self.encoding) as f:
            for line_cnt, line in enumerate(f, start=1):
                if self.max_lines > 0 and line_cnt > self.max_lines:
                    break
                try:
                    word, tag = self.extract_word_and_tag(line)
                except Featurizer.InvalidTag:
                    continue
                if not word.strip() or not tag.strip():
                    continue
                if self.tag_filter is not None and tag not in self.tag_filter:
                    continue
                # Stop reading early once at least two classes have been
                # seen and every seen class has reached the cap.
                if limit > 0 and len(sample_cnt) > 1 and \
                        all(v >= limit for v in sample_cnt.values()):
                    break
                sample_cnt[tag] += 1
                # The cap only applies when it is positive; a non-positive
                # value means "no limit" (same convention as the check above).
                if limit > 0 and sample_cnt[tag] > limit:
                    continue
                self.__featurize_and_store_sample(word, tag, X, y)
                self.raw_input.append((word, tag))
        return self.create_feature_matrix(X, y)

    def create_feature_matrix(self, X, y):
        """Vectorize the feature dicts ``X`` and label dicts ``y``.

        Stores the raw lists as ``self.X`` / ``self.y`` and the vectorized
        forms as ``self.X_mtx`` / ``self.y_vec``, and returns the latter two.
        """
        self.X = X
        self.y = y
        self.X_mtx = self.__get_or_create_vectorizer('X_vectorizer', X)
        self.y_vec = self.__get_or_create_vectorizer('y_vectorizer', y)
        return self.X_mtx, self.y_vec

    def create_train_test(self, ratio, random_state=None):
        """Split the vectorized data into train and test parts.

        Args:
            ratio: passed to ``train_test_split`` as ``train_size``.
            random_state: optional seed forwarded to ``train_test_split``
                to make the split reproducible.
        """
        Xtr, Xte, ytr, yte = train_test_split(
            self.X_mtx, self.y_vec, train_size=ratio,
            random_state=random_state)
        self.X_train = Xtr
        self.X_test = Xte
        self.y_train = ytr
        self.y_test = yte

    def __get_or_create_vectorizer(self, name, data):
        """Vectorize ``data`` with the vectorizer stored under ``name``.

        On first use a new ``DictVectorizer`` is fitted on ``data`` and
        stored as attribute ``name``; later calls reuse that fitted
        vectorizer so the feature space stays the same.
        """
        dv = getattr(self, name, None)
        if dv is None:
            dv = DictVectorizer()
            v = dv.fit_transform(data)
            setattr(self, name, dv)
            return v
        return dv.transform(data)

    def extract_word_and_tag(self, line):
        """Split an input line into ``(word, tag)``.

        Raises:
            Featurizer.InvalidTag: if ``full_tag`` is False and the tag does
                not start with an upper-case ASCII letter.
        """
        fd = line.strip().split('\t')
        word = fd[0]
        tag = fd[-1].split('/')[-1]
        if self.full_tag is False:
            m = Featurizer.tag_re.match(tag)
            if m is None:
                raise Featurizer.InvalidTag()
            tag = m.group(0)
        return word, tag

    def __featurize_and_store_sample(self, word, tag, X, y):
        """Append the feature dict of ``word`` to ``X`` and its label to ``y``."""
        if self.use_word_as_feature:
            f = {'word': word}
        else:
            f = self.__featurize_ngram(word)
        X.append(f)
        y.append({'class': tag})

    def __featurize_ngram(self, word):
        """Return the n-gram feature dict of the (possibly truncated) word."""
        feats = {}
        if self.last_char > 0:
            word = word[-self.last_char:]
        if self.include_smaller_ngrams:
            for n in range(1, self.N+1):
                feats.update(Featurizer.extract_ngrams(
                    word, n, self.use_padding))
        else:
            feats.update(Featurizer.extract_ngrams(
                word, self.N, self.use_padding))
        return feats

    @staticmethod
    def extract_ngrams(text, N, padding=False):
        """Return positional character N-grams of ``text``.

        Keys have the form ``'<N>_<start index>'`` and values are the
        N-character substrings.  With ``padding`` the text is first wrapped
        in N-1 spaces on each side.  Texts shorter than N yield no features.
        """
        if padding:
            text = '{0}{1}{0}'.format(" " * (N-1), text)
        return {'{0}_{1}'.format(N, i): text[i:i+N]
                for i in range(len(text)-N+1)}

    def get_theoretical_max(self):
        """Return the best accuracy achievable on the collected samples.

        Samples with identical feature dicts are indistinguishable, so at
        best the majority class of each such group can be predicted
        correctly.  Returns 0.0 when no samples have been collected.
        """
        if not self.X:
            return 0.0
        samples = defaultdict(lambda: defaultdict(int))
        for xi, yi in zip(self.X, self.y):
            f_str = ','.join('{}:{}'.format(feat, val)
                             for feat, val in sorted(xi.items()))
            samples[f_str][yi['class']] += 1
        return sum(max(v.values()) for v in samples.values()) / len(self.X)

In [12]:
class FFNN:
    """Thin wrapper around a Keras ``Sequential`` feed-forward network."""

    @staticmethod
    def create_list_if_str(param, length):
        """Return ``[param] * length`` for a string, otherwise ``param`` unchanged."""
        if isinstance(param, str):
            return [param] * length
        return param

    def __init__(self, layers, activations, input_dim, output_dim, *,
                 loss='binary_crossentropy', optimizer='rmsprop', metrics=None,
                 nb_epoch=50, batch_size=50, **kwargs):
        """Build and compile the network.

        Args:
            layers: sizes of the hidden layers (may be empty).
            activations: either a single activation name used for every
                layer, or a sequence with one entry per hidden layer plus a
                final entry for the output layer.
            input_dim: number of input features.
            output_dim: number of output units.
            loss, optimizer, metrics: forwarded to ``model.compile``;
                ``metrics`` defaults to ``['accuracy']``.
            nb_epoch, batch_size: forwarded to ``model.fit``.
                NOTE(review): ``nb_epoch`` is the Keras 1 spelling; newer
                Keras releases call it ``epochs`` - confirm against the
                installed version.
            **kwargs: accepted and ignored, so a wider config dict can be
                splatted into the constructor.
        """
        self.layers = layers
        self.activations = FFNN.create_list_if_str(activations, len(self.layers) + 1)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model_fit_args = {
            'nb_epoch': nb_epoch,
            'batch_size': batch_size,
        }
        self.model_compile_args = {
            'optimizer': optimizer,
            # Avoid a shared mutable default; copy whatever the caller passed.
            'metrics': ['accuracy'] if metrics is None else list(metrics),
            'loss': loss,
        }
        self.create_network()

    def create_network(self):
        """Create ``self.model``: hidden layers followed by the output layer.

        The first layer added receives ``input_dim``; the output layer always
        uses the last entry of ``self.activations``.  With no hidden layers
        the output layer is connected directly to the input.
        """
        self.model = Sequential()
        sizes = list(self.layers) + [self.output_dim]
        last = len(sizes) - 1
        for i, size in enumerate(sizes):
            activation = self.activations[-1] if i == last else self.activations[i]
            if i == 0:
                self.model.add(Dense(size, input_dim=self.input_dim,
                                     activation=activation))
            else:
                self.model.add(Dense(size, activation=activation))
        self.model.compile(**self.model_compile_args)

    def fit(self, X, y):
        """Train the model on ``X``/``y`` and return the elapsed wall time."""
        X = FFNN.densify(X)
        y = FFNN.densify(y)
        start = datetime.now()
        self.model.fit(X, y, verbose=0, **self.model_fit_args)
        return datetime.now() - start

    def evaluate(self, X, y, **kwargs):
        """Return ``model.evaluate`` on ``X``/``y``; kwargs are forwarded."""
        X = FFNN.densify(X)
        y = FFNN.densify(y)
        return self.model.evaluate(X, y, **kwargs)

    @staticmethod
    def densify(mtx):
        """Convert a scipy sparse matrix to a plain ``ndarray``.

        Non-sparse inputs are returned unchanged.  ``toarray`` is used rather
        than ``todense`` so the result is an ``ndarray`` and not the legacy
        ``numpy.matrix`` type.
        """
        if issparse(mtx):
            return mtx.toarray()
        return mtx

In [13]:
class Config:
    """Experiment configuration: class-level defaults overlaid with user values.

    The configuration has three sections, exposed as ``glob_conf``,
    ``feat_conf`` and ``model_conf``.
    """

    def __init__(self, conf_d):
        """Merge ``conf_d`` over the defaults.

        Args:
            conf_d: dict with optional ``'global'``, ``'featurizer'`` and
                ``'model'`` sub-dicts; ``None`` is treated as empty.
        """
        conf_d = conf_d or {}
        # Copy each default section before updating it.  Updating the
        # class-level dicts in place would leak one experiment's settings
        # into Config.defaults and therefore into every later Config.
        # The copy is shallow: nested values such as the 'metrics' list are
        # still shared and must not be mutated in place.
        self.glob_conf = dict(Config.defaults.get('global', {}))
        self.feat_conf = dict(Config.defaults.get('featurizer', {}))
        self.model_conf = dict(Config.defaults.get('model', {}))

        self.glob_conf.update(conf_d.get('global', {}))
        self.feat_conf.update(conf_d.get('featurizer', {}))
        self.model_conf.update(conf_d.get('model', {}))

    def to_dict(self):
        """Return a flat dict with keys prefixed ``global.``, ``feat.`` and ``model.``."""
        d = {}
        d.update(self.serialize_config(self.glob_conf, 'global'))
        d.update(self.serialize_config(self.feat_conf, 'feat'))
        d.update(self.serialize_config(self.model_conf, 'model'))
        return d

    def serialize_config(self, section, pre):
        """Return a copy of ``section`` whose keys are prefixed with ``pre.``."""
        return {'{0}.{1}'.format(pre, k): v for k, v in section.items()}

    # Default values for every section; never modified by instances.
    defaults = {
        'global': {
            'train_test_split': .9,
            'nolog': False,
        },
        'model': {
            'loss': 'binary_crossentropy',
            'optimizer': 'rmsprop',
            'metrics': ['accuracy'],
            'nb_epoch': 50,
            'batch_size': 64,
        },
        'featurizer': {
            # NOTE(review): absolute, machine-specific path; override via
            # conf_d['featurizer']['data_path'] on other machines.
            'data_path': '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged',
            'encoding': 'latin2',
            'include_smaller_ngrams': False,
            'use_padding': True,
            'tag_filter': ("NOUN", "VERB"),
        },
    }

In [14]:
class Result:
    """Container for the outcome of one experiment run.

    Fields are declared through ``__slots__`` and are unset until assigned;
    unset fields are reported as ``None``.
    """
    __slots__ = ('train_sample_count', 'test_sample_count', 'feature_count',
                 'success', 'exception',
                 'running_time', 'timestamp',
                 'train_acc', 'test_acc', 'train_loss', 'test_loss')

    def to_dict(self):
        """Return all fields as a dict keyed ``result.<field>`` (None if unset)."""
        return {'result.{}'.format(k): getattr(self, k, None)
                for k in Result.__slots__}

    def __str__(self):
        # Use getattr with a default so printing a result whose accuracies
        # were never set (e.g. after a failed run) does not raise.
        return "Train accuracy: {}\nTest accuracy: {}".format(
            getattr(self, 'train_acc', None), getattr(self, 'test_acc', None))

In [15]:
class Experiment:
    """Run one featurize -> train -> evaluate cycle and log the outcome.

    Results are appended as one row per run to the TSV file ``df_path``.
    """

    df_path = 'results.tsv'

    def __init__(self, conf_d):
        """Featurize the data and build the model described by ``conf_d``.

        The input/output dimensions of the model are taken from the shape of
        the featurized data and written into the model configuration.
        """
        self.config = Config(conf_d)
        self.featurizer = Featurizer(**self.config.feat_conf)
        self.featurizer.featurize()
        input_dim = self.featurizer.X_mtx.shape[1]
        output_dim = self.featurizer.y_vec.shape[1]
        self.config.model_conf['input_dim'] = input_dim
        self.config.model_conf['output_dim'] = output_dim
        self.initialize_model()
        self.result = Result()
        self.result.feature_count = input_dim

    def initialize_model(self):
        """Instantiate ``self.model`` from the model configuration.

        Raises:
            ValueError: if the ``'architecture'`` entry is missing or names
                an unsupported architecture (only ``'ffnn'`` is supported,
                case-insensitively).
        """
        architecture = self.config.model_conf.get('architecture')
        if isinstance(architecture, str) and architecture.lower() == 'ffnn':
            self.model = FFNN(**self.config.model_conf)
        else:
            raise ValueError("Model architecture [{}] not supported".format(
                    architecture))

    def fit_train(self):
        """Split the data, train the model and record the running time."""
        self.result.timestamp = datetime.now()
        self.featurizer.create_train_test(self.config.glob_conf['train_test_split'])
        # The model densifies sparse input itself, as in evaluate_*.
        rt = self.model.fit(self.featurizer.X_train, self.featurizer.y_train)
        self.result.running_time = rt

    def evaluate_train(self):
        """Evaluate on the training part and store loss, accuracy and size."""
        l = self.model.evaluate(self.featurizer.X_train, self.featurizer.y_train, batch_size=16)
        self.result.train_loss = l[0]
        self.result.train_acc = l[1]
        self.result.train_sample_count = self.featurizer.X_train.shape[0]

    def evaluate_test(self):
        """Evaluate on the test part and store loss, accuracy and size."""
        l = self.model.evaluate(self.featurizer.X_test, self.featurizer.y_test, batch_size=16)
        self.result.test_loss = l[0]
        self.result.test_acc = l[1]
        self.result.test_sample_count = self.featurizer.X_test.shape[0]

    def run_and_save(self):
        """Train and evaluate, then log the result unless ``nolog`` is set.

        Any exception during training/evaluation is deliberately caught so a
        failed run is still logged: ``success`` is set to False and the
        exception's type name is stored in ``exception``.
        """
        try:
            self.fit_train()
            self.evaluate_train()
            self.evaluate_test()
        except Exception as e:
            self.result.success = False
            self.result.exception = type(e).__name__
        else:
            self.result.success = True
        if self.config.glob_conf['nolog'] is False:
            self.save_results()

    def save_results(self):
        """Append the flattened config and result to ``Experiment.df_path``."""
        d = {}
        d.update(self.config.to_dict())
        d.update(self.result.to_dict())
        Experiment.save_to_dataframe(d, Experiment.df_path)

    @staticmethod
    def save_to_dataframe(data, fn):
        """Append ``data`` as one row to the TSV file ``fn``.

        The file is created if it does not exist.  Keys of ``data`` that are
        not yet columns are added; earlier rows get missing values there.
        """
        row = pd.DataFrame([data])
        if os.path.exists(fn):
            # pd.concat aligns on the union of the columns, so new keys
            # become new columns without any manual bookkeeping.
            df = pd.concat([pd.read_table(fn), row],
                           ignore_index=True, sort=False)
        else:
            df = row
        df.to_csv(fn, sep='\t', index=False)

# Actual experiments

In [None]:
%%time

# Base configuration, one dict per section; the sweep below overwrites the
# optimizer, `last_char` and `N` entries in place before each run.
global_conf = {
    'nolog': False,
    'comment': "other optimizers",
}
featurizer_conf = {
    'last_char': 6,
    'N': 2,
    'use_padding': True,
    'sample_per_class': 30000,
    'include_smaller_ngrams': True,
    'tag_filter': ("NOUN", "VERB"),
}
model_conf = {
    'architecture': 'FFNN',
    'layers': (40, 40),
    'activations': ('sigmoid', 'sigmoid', 'sigmoid'),
    'optimizer': 'rmsprop',
    'loss': 'binary_crossentropy',
    'metrics': ['accuracy'],
    'nb_epoch': 300,
    'batch_size': 500,
}
cfg = {
    'global': global_conf,
    'featurizer': featurizer_conf,
    'model': model_conf,
}

optimizers = ['sgd', 'adagrad', 'adam', 'nadam', 'adamax', 'rmsprop']

# Grid: every optimizer x last_char in 4..8 x N in 1..2, one Experiment each.
for opt in optimizers:
    print(opt)
    model_conf['optimizer'] = opt
    for last_char in range(4, 9):
        featurizer_conf['last_char'] = last_char
        for N in range(1, 3):
            featurizer_conf['N'] = N
            e = Experiment(cfg)
            e.run_and_save()


sgd


In [21]:
# NOTE(review): looks like a leftover scratch cell unrelated to the experiments - consider removing.
print(12)

12
