# ULM baseline experiments

My first attempts at using TensorFlow.

In [1]:
import tensorflow as tf
from sys import stdin
from collections import defaultdict
from scipy.io import mmread
from sklearn.model_selection import train_test_split
import numpy as np
import re
import datetime as dt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from os import path

In [2]:
# Tagged corpus file that the experiment cells below pass to the featurizer as 'data_path'.
# Featurizer.extract_word_and_tag() splits each line on tabs, takes the first field as the
# word and the part after the last '/' of the last field as the tag.
data_path = '/mnt/store/hlt/Language/Hungarian/Crawl/Web2/ana/xaa.tagged'

In [3]:
class ResearchDiary:
    """Tab-separated log of experiment results, held in a pandas DataFrame.

    The diary is loaded from ``diary_file`` when an instance is created (an
    empty frame with the default columns is used when the file does not
    exist). Used as a context manager, it writes the frame back to
    ``diary_file`` when the ``with`` block exits.
    """

    diary_file = "baseline_experiment_results.tsv"

    # Columns of a freshly created diary; add_experiment() may add more.
    default_columns = [
        'train_accuracy',
        'test_accuracy',
        'timestamp',
        'data_path',
        'theoretical_max',
        'architecture',
        'min_sample_per_class',
        'max_sample_per_class',
        'sample_count',
        'feature_count',
        'max_lines',
        'N',
        'last_char',
        'full_tag',
        'tag_filter',
        'include_smaller_ngrams',
        'use_padding',
        'epochs',
        'layers',
        'activation',
        'batch_size',
        'optimizer',
        'optimizer_kwargs',
        'gpu_memory_fracion',
        'running_time',
    ]

    @staticmethod
    def load_or_create_dataframe():
        """Return the stored diary, or an empty one when no file exists yet."""
        if path.exists(ResearchDiary.diary_file):
            return pd.read_table(ResearchDiary.diary_file)
        return pd.DataFrame(columns=list(ResearchDiary.default_columns))

    def __init__(self):
        self.df = ResearchDiary.load_or_create_dataframe()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # The diary is written even when the block raised; returning None
        # lets that exception propagate afterwards.
        self.df.to_csv(ResearchDiary.diary_file, sep='\t', index=False)

    def add_experiment(self, data, create_cols_if_new=True):
        """Append one experiment record as a new row of the diary.

        Args:
            data: mapping from column name to value; it is not modified.
            create_cols_if_new: when True, keys that are not yet columns are
                added as new columns; when False such keys are left out of
                the stored row.
        """
        # Work on a copy so that dropping unknown keys never alters the
        # caller's dict.
        record = dict(data)
        unknown_keys = [key for key in record if key not in self.df.columns]
        if create_cols_if_new:
            for key in unknown_keys:
                self.df[key] = None
        else:
            for key in unknown_keys:
                del record[key]
        # DataFrame.append() was removed in pandas 2.0; concat is the
        # supported way to add a row.
        new_row = pd.DataFrame([record])
        self.df = pd.concat([self.df, new_row], ignore_index=True)

In [4]:
import re
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer


class Featurizer:
    """Turn the (word, tag) lines of a tagged corpus into feature matrices.

    Each word is described either by the word itself or by positional
    character n-grams of its last ``last_char`` characters; the tag is the
    class label. Both are vectorized with ``DictVectorizer``.
    """

    class InvalidTag(Exception):
        """Raised when a tag does not start with an upper-case ASCII letter."""

    tag_re = re.compile(r'^[A-Z]+')

    def __init__(self, last_char, N, full_tag=False, use_padding=True, use_word_as_feature=False, tag_filter=None,
                 include_smaller_ngrams=False, **kwargs):
        """Store the feature extraction settings.

        Args:
            last_char: keep only this many trailing characters of a word;
                values <= 0 keep the whole word.
            N: n-gram length.
            full_tag: when False the tag is cut down to its leading run of
                upper-case letters.
            use_padding: pad the text with N-1 spaces on both sides before
                extracting n-grams.
            use_word_as_feature: use the whole word as the only feature
                instead of n-grams.
            tag_filter: iterable of tags to keep; None keeps every tag.
            include_smaller_ngrams: also extract 1..N-1 grams.
            **kwargs: ignored, so that one config dict can be passed to both
                the constructor and featurize().
        """
        self.last_char = last_char
        self.N = N
        self.full_tag = full_tag
        self.use_padding = use_padding
        self.use_word_as_feature = use_word_as_feature
        self.include_smaller_ngrams = include_smaller_ngrams
        if tag_filter is not None:
            self.tag_filter = set(tag_filter)
        else:
            self.tag_filter = None
        self.raw_input = []

    def featurize(self, data_path, encoding='utf-8', min_sample_per_class=0, max_sample_per_class=0, max_lines=0,
                 **kwargs):
        """Read ``data_path`` and return the vectorized samples and labels.

        Args:
            data_path: path of the tagged corpus file.
            encoding: text encoding of the file.
            min_sample_per_class: stored on the instance only.
                NOTE(review): nothing in this class enforces a minimum -
                confirm whether filtering of rare classes was intended.
            max_sample_per_class: cap on the samples kept per tag; values
                <= 0 mean no cap.
            max_lines: stop after this many lines; values <= 0 read the
                whole file.
            **kwargs: ignored (see __init__).

        Returns:
            The pair returned by create_feature_matrix().
        """
        X = []
        y = []
        self.min_sample_per_class = min_sample_per_class
        self.max_sample_per_class = max_sample_per_class
        self.max_lines = max_lines
        sample_cnt = defaultdict(int)
        capped = max_sample_per_class > 0
        with open(data_path, encoding=encoding) as f:
            for line_cnt, line in enumerate(f, start=1):
                if max_lines > 0 and line_cnt > max_lines:
                    break
                try:
                    word, tag = self.extract_word_and_tag(line)
                except Featurizer.InvalidTag:
                    continue
                if not word.strip() or not tag.strip():
                    continue
                if self.tag_filter is not None and tag not in self.tag_filter:
                    continue
                if capped and sample_cnt[tag] >= max_sample_per_class:
                    continue
                self.__featurize_and_store_sample(word, tag, X, y)
                self.raw_input.append((word, tag))
                sample_cnt[tag] += 1
                # Early exit is only sound when the set of classes is known
                # in advance (a tag filter) and every one of them is full;
                # every later line would be skipped anyway. Without a cap
                # there is never a reason to stop before the end of input.
                if (capped and self.tag_filter is not None
                        and sample_cnt[tag] >= max_sample_per_class
                        and all(sample_cnt.get(t, 0) >= max_sample_per_class for t in self.tag_filter)):
                    break
        return self.create_feature_matrix(X, y)

    def create_feature_matrix(self, X, y):
        """Vectorize the feature dicts ``X`` and label dicts ``y``.

        The raw dicts and the resulting matrices are kept on the instance
        (``X``, ``y``, ``X_mtx``, ``y_vec``).
        """
        self.X = X
        self.y = y
        self.X_mtx = self.__get_or_create_vectorizer('X_vectorizer', X)
        self.y_vec = self.__get_or_create_vectorizer('y_vectorizer', y)
        return self.X_mtx, self.y_vec

    def __get_or_create_vectorizer(self, name, data):
        """Vectorize ``data`` with the vectorizer stored under ``name``.

        The first call fits a new DictVectorizer and stores it on the
        instance; later calls reuse the fitted one.
        """
        vectorizer = getattr(self, name, None)
        if vectorizer is None:
            vectorizer = DictVectorizer()
            vectorized = vectorizer.fit_transform(data)
            setattr(self, name, vectorizer)
        else:
            vectorized = vectorizer.transform(data)
        return vectorized

    def extract_word_and_tag(self, line):
        """Split a corpus line into (word, tag).

        The word is the first tab-separated field; the tag is whatever
        follows the last '/' of the last field.

        Raises:
            Featurizer.InvalidTag: if ``full_tag`` is False and the tag does
                not start with an upper-case ASCII letter.
        """
        fd = line.strip().split('\t')
        word = fd[0]
        tag = fd[-1].split('/')[-1]
        if self.full_tag is False:
            match = Featurizer.tag_re.match(tag)
            if match is None:
                raise Featurizer.InvalidTag()
            tag = match.group(0)
        return word, tag

    def __featurize_and_store_sample(self, word, tag, X, y):
        """Append the feature dict of ``word`` to X and its label dict to y."""
        if self.use_word_as_feature:
            f = {'word': word}
        else:
            f = self.__featurize_ngram(word)
        X.append(f)
        y.append({'class': tag})

    def __featurize_ngram(self, word):
        """Return the positional n-gram features of the end of ``word``."""
        feats = {}
        if self.last_char > 0:
            word = word[-self.last_char:]
        if self.include_smaller_ngrams:
            for n in range(1, self.N+1):
                feats.update(Featurizer.extract_ngrams(word, n, self.use_padding))
        else:
            feats.update(Featurizer.extract_ngrams(word, self.N, self.use_padding))
        return feats

    @staticmethod
    def extract_ngrams(text, N, padding=False):
        """Map '<N>_<position>' to the N-gram of ``text`` starting there.

        With ``padding`` set to True the text is first surrounded by N-1
        spaces on each side.
        """
        if padding is True:
            text = '{0}{1}{0}'.format(" " * (N-1), text)
        return {'{0}_{1}'.format(N, i): text[i:i+N] for i in range(len(text)-N+1)}

    def get_theoretical_max(self):
        """Return the best accuracy any classifier could reach on the stored data.

        Samples with identical features can only all be assigned one class,
        so for each distinct feature dict only its most frequent class
        counts as correct. Returns 0.0 when no samples are stored.
        """
        if not self.X:
            return 0.0
        samples = defaultdict(lambda: defaultdict(int))
        for xi, yi in zip(self.X, self.y):
            # A sorted tuple of items groups identical feature dicts and,
            # unlike a joined string, cannot collide on ',' or ':'.
            samples[tuple(sorted(xi.items()))][yi['class']] += 1
        return sum(max(v.values()) for v in samples.values()) / len(self.X)

In [5]:
import logging
from scipy.sparse import issparse


class FFNN:
    """Fully connected feed-forward classifier built with the TensorFlow 1.x graph API.

    Every layer, including the output layer, applies ``activation``. The
    cost is the mean squared difference between the network output and the
    target matrix fed through ``n_output``.
    """

    def __init__(self, n_feature, n_class, layers, batch_size=0, epochs=5000, verbose=False,
                 activation=tf.sigmoid, gpu_memory_fraction=0.5,
                 optimizer="GradientDescentOptimizer", optimizer_kwargs=None):
        """Build the graph, create the optimizer and start a session.

        Args:
            n_feature: width of the input matrix.
            n_class: width of the target matrix.
            layers: sequence with the size of each hidden layer; at least
                one entry is required.
            batch_size: rows drawn per training step; values <= 0 feed the
                whole training set on every step.
            epochs: number of training steps run by dotrain().
            verbose: print the cost about ten times during training.
            activation: callable applied to each layer's pre-activation.
            gpu_memory_fraction: passed to
                tf.GPUOptions(per_process_gpu_memory_fraction=...).
            optimizer: name of an optimizer class looked up on tf.train.
            optimizer_kwargs: keyword arguments for that optimizer class;
                None means no arguments.
        """
        self.n_feature = n_feature
        self.n_class = n_class
        self.shape = layers
        self.batch_size = batch_size
        self.n_input = tf.placeholder(tf.float32, shape=[None, n_feature],
                         name="n_input")
        self.n_output = tf.placeholder(tf.float32, shape=[None, n_class],
                          name="n_output")
        self.bias = []
        self.W = []
        self.hidden = []
        self.activation = activation
        self.create_input_layer()
        self.create_hidden_layers()
        self.create_output_layer()
        self.cost = tf.reduce_mean(tf.square(self.n_output - self.output))
        # Evaluation nodes are created once here so that dotest() works
        # before dotrain() and repeated calls do not keep growing the graph.
        self.correct_prediction = tf.equal(tf.argmax(self.n_output, 1), tf.argmax(self.output, 1))
        self.accuracy = tf.reduce_mean(tf.cast(self.correct_prediction, tf.float32))
        # A None default avoids sharing one dict between all instances.
        self.optimizer_kwargs = dict(optimizer_kwargs) if optimizer_kwargs else {}
        self.optimizer = getattr(tf.train, optimizer)(**self.optimizer_kwargs)
        self.train = self.optimizer.minimize(self.cost)
        # Prefer the newer initializer name where it exists and fall back
        # to the older one otherwise.
        make_init = getattr(tf, 'global_variables_initializer', None) or tf.initialize_all_variables
        self.init = make_init()
        self.gpu_memory_fraction = gpu_memory_fraction
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_memory_fraction)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.sess.run(self.init)
        self.verbose = verbose
        self.epochs = epochs

    def create_input_layer(self):
        """Create the first hidden layer, connected to the input placeholder."""
        self.bias.append(tf.Variable(tf.random_normal([self.shape[0]]), name="bias0"))
        self.W.append(tf.Variable(tf.random_normal([self.n_feature, self.shape[0]]), name="weights0"))
        self.hidden.append(self.activation(tf.matmul(self.n_input, self.W[0]) + self.bias[0]))

    def create_hidden_layers(self):
        """Create the remaining hidden layers, each fed by the previous one."""
        for i in range(1, len(self.shape)):
            self.bias.append(tf.Variable(tf.random_normal([self.shape[i]]), name="bias{}".format(i)))
            self.W.append(tf.Variable(tf.random_normal([self.shape[i-1], self.shape[i]]),
                                      name="weights{}".format(i)))
            self.hidden.append(self.activation(tf.matmul(self.hidden[-1], self.W[-1]) + self.bias[-1]))

    def create_output_layer(self):
        """Create the output layer on top of the last hidden layer."""
        # The output layer gets the index after the last hidden layer so
        # that its variable names do not repeat that layer's names.
        layer_index = len(self.shape)
        self.bias.append(tf.Variable(tf.random_normal([self.n_class]), name="bias{}".format(layer_index)))
        self.W.append(tf.Variable(tf.random_normal([self.shape[-1], self.n_class]),
                                  name="weights{}".format(layer_index)))
        self.output = self.activation(tf.matmul(self.hidden[-1], self.W[-1]) + self.bias[-1])

    def dotrain(self, X_train, y_train):
        """Run ``epochs`` training steps and return the fetched values of each step.

        Each element of the returned list is the result of one session run:
        the train op result, the cost, then the value of every weight matrix
        and bias vector.
        NOTE(review): keeping all weights for every step can take a lot of
        memory on long runs; the return value is left unchanged because
        callers may rely on it.
        """
        X_train = FFNN.convert_sparse_if_needed(X_train)
        y_train = FFNN.convert_sparse_if_needed(y_train)
        cvalues = []
        step = max(self.epochs // 10, 1)
        fetches = [self.train, self.cost] + self.W + self.bias
        for epoch in range(self.epochs):
            if self.batch_size > 0:
                X_batch, y_batch = self.get_minibatch(X_train, y_train)
            else:
                X_batch = X_train
                y_batch = y_train
            cvalues.append(self.sess.run(fetches,
                                         feed_dict={self.n_input: X_batch, self.n_output: y_batch}))
            if (epoch + 1) % step == 0 and self.verbose is True:
                print('{0} epochs, cvalue: {1}'.format(epoch+1, cvalues[-1][1]))
        return cvalues

    def get_minibatch(self, X, y):
        """Return ``batch_size`` rows of X and y drawn by np.random.choice."""
        # np.random.choice samples with replacement by default, so a batch
        # may contain the same row more than once.
        batch_index = np.random.choice(np.arange(0, X.shape[0]), self.batch_size)
        return X[batch_index], y[batch_index]

    def dotest(self, X_test, y_test):
        """Return (accuracy, per-sample correctness) for the given data."""
        X_test = FFNN.convert_sparse_if_needed(X_test)
        y_test = FFNN.convert_sparse_if_needed(y_test)
        # One session run yields both values instead of evaluating the
        # network twice.
        accuracy, prediction = self.sess.run([self.accuracy, self.correct_prediction],
                                             feed_dict={self.n_input: X_test, self.n_output: y_test})
        return accuracy, prediction

    @staticmethod
    def convert_sparse_if_needed(mtx):
        """Return ``mtx`` densified if it is a scipy sparse matrix, else unchanged."""
        if issparse(mtx):
            mtx = mtx.todense()
        return mtx
    

class Experiment:
    """One featurize / train / evaluate run described by a config dict.

    The config has a required 'featurizer' section (keyword arguments for
    both ``Featurizer()`` and ``Featurizer.featurize()``), a required 'ffnn'
    section (keyword arguments for ``FFNN()``) and an optional 'global'
    section with 'test_size', 'verbose' and 'comment'.
    """

    def __init__(self, config):
        """Featurize the data and build the network; training starts in run()."""
        self.config = config
        glob_config = config.get('global', {})
        self.featurizer = Featurizer(**config['featurizer'])
        X, y = self.featurizer.featurize(**config['featurizer'])
        self.X = X
        self.y = y
        self.test_size = glob_config.get('test_size', .1)
        verbose = glob_config.get('verbose', False)
        self.ffnn = FFNN(X.shape[1], y.shape[1], verbose=verbose, **config['ffnn'])
        # Filled in by run() / run_and_save(); defined up front so that
        # add_results_to_diary() never fails on a missing attribute.
        self.train_acc = None
        self.test_acc = None
        self.running_time = None

    def run(self):
        """Split the data at random, train, and evaluate on both parts.

        Each sample goes to the test part with probability ``test_size``,
        so the size of the two parts varies from run to run.

        Returns:
            (train_acc, train_pred, test_acc, test_pred)
        """
        self.train_mask = np.random.random(size=self.X.shape[0]) > self.test_size
        test_mask = np.invert(self.train_mask)
        X_train, y_train = self.X[self.train_mask], self.y[self.train_mask]
        X_test, y_test = self.X[test_mask], self.y[test_mask]
        self.ffnn.dotrain(X_train, y_train)
        train_acc, train_pred = self.ffnn.dotest(X_train, y_train)
        test_acc, test_pred = self.ffnn.dotest(X_test, y_test)
        self.test_pred = test_pred
        self.train_acc = train_acc
        self.test_acc = test_acc
        return train_acc, train_pred, test_acc, test_pred

    def run_decision_tree(self):
        """Return 10-fold cross-validation scores of a decision tree on the same data."""
        self.clf = DecisionTreeClassifier()
        return cross_val_score(self.clf, self.X.toarray(), self.y.toarray(), cv=10)

    def get_test_errors(self):
        """Return the (word, tag) pairs of the test samples the network got wrong.

        Requires run() to have been called, since it relies on the train
        mask and the test predictions stored there.
        """
        test_samples = [sample
                        for in_train, sample in zip(self.train_mask, self.featurizer.raw_input)
                        if not in_train]
        return [sample for sample, correct in zip(test_samples, self.test_pred) if not correct]

    def add_results_to_diary(self, diary):
        """Add the settings and results of this experiment to ``diary``."""
        d = {
            'timestamp': dt.datetime.now(),
            # Record the path this experiment was configured with rather
            # than whatever a module-level variable currently holds.
            'data_path': self.config['featurizer'].get('data_path'),
            'last_char': self.featurizer.last_char,
            'N': self.featurizer.N,
            'full_tag': self.featurizer.full_tag,
            'use_padding': self.featurizer.use_padding,
            'include_smaller_ngrams': self.featurizer.include_smaller_ngrams,
            'tag_filter': self.featurizer.tag_filter,
            'min_sample_per_class': self.featurizer.min_sample_per_class,
            'max_sample_per_class': self.featurizer.max_sample_per_class,
            'max_lines': self.featurizer.max_lines,
            'sample_count': self.featurizer.X_mtx.shape[0],
            'feature_count': self.featurizer.X_mtx.shape[1],
            'architecture': 'FFNN',
            'layers': self.ffnn.shape,
            'batch_size': self.ffnn.batch_size,
            'epochs': self.ffnn.epochs,
            'activation': self.ffnn.activation.__name__,
            'gpu_memory_fracion': self.ffnn.gpu_memory_fraction,
            'optimizer': self.ffnn.optimizer.__class__,
            'optimizer_kwargs': self.ffnn.optimizer_kwargs,
            'train_accuracy': self.train_acc,
            'test_accuracy': self.test_acc,
            'theoretical_max': self.featurizer.get_theoretical_max(),
            'running_time': self.running_time,
        }
        comment = self.config.get('global', {}).get('comment')
        if comment is not None:
            d['comment'] = comment
        diary.add_experiment(d)

    def run_and_save(self):
        """Run the experiment, print both accuracies and log the run in the diary."""
        start = dt.datetime.now()
        train_acc, train_pred, test_acc, test_pred = self.run()
        end = dt.datetime.now()
        self.running_time = end - start
        print("Training accuracy: {0}\nTest accuracy: {1}".format(train_acc, test_acc))
        with ResearchDiary() as rd:
            self.add_results_to_diary(rd)

In [6]:
%%time
# Toy run for checking the framework end to end: unigram features of the last 8
# characters, NOUN vs. VERB, at most 100 samples per class, three hidden layers of 40.
# Note that FFNN.dotrain() performs one optimizer step per "epoch", here on a random
# batch of 1000 rows.
e = Experiment({
        'global': {
            'verbose': True,
            'comment': "toy example used for testing the framework"
        },
        'featurizer': {
            'last_char': 8,
            'N': 1,
            'use_padding': True,
            'include_smaller_ngrams': True,
            'tag_filter': ("NOUN", "VERB"),
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 100,
            'max_sample_per_class': 100,
            'max_lines': 2000000,
        },
        'ffnn': {
            'layers': (40, 40, 40),
            'optimizer': 'MomentumOptimizer',
            'batch_size': 1000,
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            'epochs': 50,
        }
})
e.run_and_save()

5 epochs, cvalue: 0.4877081513404846
10 epochs, cvalue: 0.4663439393043518
15 epochs, cvalue: 0.3880554437637329
20 epochs, cvalue: 0.37930601835250854
25 epochs, cvalue: 0.3763933479785919
30 epochs, cvalue: 0.36746475100517273
35 epochs, cvalue: 0.3538150489330292
40 epochs, cvalue: 0.3549867868423462
45 epochs, cvalue: 0.3539010286331177
50 epochs, cvalue: 0.35251584649086
Training accuracy: 0.5105263590812683
Test accuracy: 0.30000001192092896
CPU times: user 3.24 s, sys: 524 ms, total: 3.76 s
Wall time: 3.67 s


In [8]:
e.run_decision_tree()

array([ 0.85,  0.45,  0.7 ,  0.75,  0.75,  0.75,  0.6 ,  0.85,  0.4 ,  0.9 ])

In [24]:
rd = ResearchDiary()
rd.df[['train_accuracy', 'test_accuracy', 'sample_count', 'feature_count', 'last_char', 'N', 'use_padding', 'tag_filter', 'include_smaller_ngrams', 'layers', 'epochs']]

Unnamed: 0,train_accuracy,test_accuracy,sample_count,feature_count,last_char,N,use_padding,tag_filter,include_smaller_ngrams,layers,epochs
0,0.502825,0.478261,200.0,207.0,8.0,1.0,True,"{'VERB', 'NOUN'}",True,"(40, 40, 40)",50.0
1,0.510526,0.3,200.0,207.0,8.0,1.0,True,"{'NOUN', 'VERB'}",True,"(40, 40, 40)",50.0
2,0.955026,0.952151,100000.0,437.0,5.0,1.0,True,"{'NOUN', 'VERB'}",False,"(40, 40)",10000.0
3,0.979138,0.929889,5000.0,1773.0,5.0,2.0,True,"{'NOUN', 'VERB'}",False,"(40, 40, 40)",5000.0
4,0.740301,0.685155,6000.0,2414.0,5.0,5.0,False,"{'NOUN', 'VERB'}",False,"(300,)",5000.0
5,0.966052,0.951476,48882.0,4059.0,5.0,2.0,True,"{'NOUN', 'VERB'}",False,"(40, 40, 40)",10000.0
6,0.421262,0.423828,30454.0,579.0,10.0,1.0,False,,False,"(30, 30)",5000.0
7,0.696985,0.671896,15000.0,4961.0,10.0,2.0,False,"{'ADV', 'VERB', 'NOUN', 'ADJ', 'CONJ'}",True,"(30, 30)",10000.0


In [21]:
rd.df.columns

Index(['train_accuracy', 'test_accuracy', 'timestamp', 'data_path',
       'theoretical_max', 'architecture', 'min_sample_per_class',
       'max_sample_per_class', 'sample_count', 'feature_count', 'max_lines',
       'N', 'last_char', 'full_tag', 'tag_filter', 'include_smaller_ngrams',
       'use_padding', 'epochs', 'layers', 'activation', 'batch_size',
       'optimizer', 'optimizer_kwargs', 'gpu_memory_fracion', 'running_time',
       'comment'],
      dtype='object')

In [9]:
%%time
# NOUN vs. VERB from unigram features of the last 5 characters, up to 50000 samples
# per class, two hidden layers of 40 units, 10000 steps on batches of 1000 rows.
# 'use_padding' is not given, so the Featurizer default (True) applies.
e = Experiment({
        'global': {
            'verbose': True,
        },
        'featurizer': {
            'last_char': 5,
            'N': 1,
            'use_word_as_feature': False,
            'tag_filter': ("NOUN", "VERB"),
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 3,
            'max_sample_per_class': 50000,
            'max_lines': 2000000,
        },
        'ffnn': {
            'layers': (40, 40),
            'batch_size': 1000,
            'optimizer': 'MomentumOptimizer',
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            'epochs': 10000,
        }
})
e.run_and_save()

1000 epochs, cvalue: 0.11534780263900757
2000 epochs, cvalue: 0.08339373767375946
3000 epochs, cvalue: 0.06078900769352913
4000 epochs, cvalue: 0.060510020703077316
5000 epochs, cvalue: 0.061203502118587494
6000 epochs, cvalue: 0.04734017699956894
7000 epochs, cvalue: 0.04871213063597679
8000 epochs, cvalue: 0.04383450001478195
9000 epochs, cvalue: 0.040955979377031326
10000 epochs, cvalue: 0.042818546295166016
Training accuracy: 0.9550263285636902
Test accuracy: 0.9521514773368835
CPU times: user 46.7 s, sys: 2.94 s, total: 49.6 s
Wall time: 36 s


In [10]:
# Misclassified test samples of the experiment above, counted by gold tag.
errors = e.get_test_errors()
d = defaultdict(int)
for _word, tag in errors:
    d[tag] += 1
# Only the last expression of a cell is displayed, so the intermediate
# `errors[:10]` and `d` expressions were dropped.
errors[:20]

[('egésze', 'NOUN'),
 ('fölöttük', 'NOUN'),
 ('köze', 'NOUN'),
 ('mindezek', 'NOUN'),
 ('mellette', 'NOUN'),
 ('bizonyíték', 'NOUN'),
 ('vízum', 'NOUN'),
 ('egybecseng', 'VERB'),
 ('hömpölyög', 'VERB'),
 ('körbeér', 'VERB'),
 ('keveri', 'VERB'),
 ('állattá', 'NOUN'),
 ('besétál', 'VERB'),
 ('ennél', 'NOUN'),
 ('Hérakleitosznál', 'NOUN'),
 ('tűzzel', 'NOUN'),
 ('kár', 'NOUN'),
 ('annak', 'NOUN'),
 ('kivétellel', 'NOUN'),
 ('közéjük', 'NOUN')]

In [11]:
%%time
# NOUN vs. VERB from bigram features of the last 5 characters, up to 2500 samples per
# class, three hidden layers of 40 units, 5000 steps on batches of 100 rows.
e = Experiment({
        'global': {
            'verbose': True,
        },
        'featurizer': {
            'last_char': 5,
            'N': 2,
            'use_word_as_feature': False,
            'tag_filter': ("NOUN", "VERB"),
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 3,
            'max_sample_per_class': 2500,
            'max_lines': 2000000,
        },
        'ffnn': {
            'layers': (40, 40, 40),
            'batch_size': 100,
            'optimizer': 'MomentumOptimizer',
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            'epochs': 5000,
        }
})
e.run_and_save()

500 epochs, cvalue: 0.15165449678897858
1000 epochs, cvalue: 0.08083382248878479
1500 epochs, cvalue: 0.07529635727405548
2000 epochs, cvalue: 0.0778464823961258
2500 epochs, cvalue: 0.06110145524144173
3000 epochs, cvalue: 0.03207963705062866
3500 epochs, cvalue: 0.023041512817144394
4000 epochs, cvalue: 0.022178513929247856
4500 epochs, cvalue: 0.02886219508945942
5000 epochs, cvalue: 0.01383767370134592
Training accuracy: 0.9791384935379028
Test accuracy: 0.9298893213272095
CPU times: user 16.4 s, sys: 720 ms, total: 17.2 s
Wall time: 12.3 s


In [12]:
%%time
# N equals last_char and padding is off, so a word of 5 or more characters yields a
# single feature: its last 5 characters as one 5-gram. Shorter words yield no feature.
# No 'batch_size' is given, so FFNN uses its default of 0 and trains on the whole
# training set at every step.
e = Experiment({
        'global': {
            'verbose': True,
            'comment': "memory",
        },
        'featurizer': {
            'last_char': 5,
            'N': 5,
            'use_padding': False,
            'use_word_as_feature': False,
            'tag_filter': ("NOUN", "VERB"),
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 3,
            'max_sample_per_class': 3000,
            'max_lines': 200000,
        },
        'ffnn': {
            'layers': (300, ),
            'optimizer': 'MomentumOptimizer',
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            'epochs': 5000,
        }
})
e.run_and_save()

500 epochs, cvalue: 0.39113083481788635
1000 epochs, cvalue: 0.3751225173473358
1500 epochs, cvalue: 0.3640808165073395
2000 epochs, cvalue: 0.3556898832321167
2500 epochs, cvalue: 0.34844157099723816
3000 epochs, cvalue: 0.34273603558540344
3500 epochs, cvalue: 0.27323392033576965
4000 epochs, cvalue: 0.22539880871772766
4500 epochs, cvalue: 0.20112769305706024
5000 epochs, cvalue: 0.18312877416610718
Training accuracy: 0.7403007745742798
Test accuracy: 0.6851550340652466
CPU times: user 2min 50s, sys: 50.3 s, total: 3min 41s
Wall time: 3min 41s


In [13]:
%%time
# NOUN vs. VERB from bigram features of the last 5 characters, up to 30000 samples per
# class, three hidden layers of 40 units, 10000 steps.
e = Experiment({
        'global': {
            'verbose': True,
        },
        'featurizer': {
            'last_char': 5,
            'N': 2,
            'use_word_as_feature': False,
            'tag_filter': ("NOUN", "VERB"),
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 3,
            'max_sample_per_class': 30000,
            'max_lines': 200000,
        },
        'ffnn': {
            'layers': (40, 40, 40),
            'optimizer': 'MomentumOptimizer',
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            # The dict used to list 'batch_size' twice (1000, then 500). In a dict
            # literal the later entry wins, so 500 is the value this run really used.
            'batch_size': 500,
            'epochs': 10000,
        }
})

e.run_and_save()

1000 epochs, cvalue: 0.1329595148563385
2000 epochs, cvalue: 0.0919666439294815
3000 epochs, cvalue: 0.07032275199890137
4000 epochs, cvalue: 0.06475789844989777
5000 epochs, cvalue: 0.06308519840240479
6000 epochs, cvalue: 0.044318508356809616
7000 epochs, cvalue: 0.03979392722249031
8000 epochs, cvalue: 0.0352330207824707
9000 epochs, cvalue: 0.02733471244573593
10000 epochs, cvalue: 0.03111211583018303
Training accuracy: 0.9660520553588867
Test accuracy: 0.9514761567115784
CPU times: user 1min 53s, sys: 3.72 s, total: 1min 56s
Wall time: 1min 41s


In [14]:
%%time
# No 'tag_filter': every tag found in the corpus becomes a class, each capped at 3000
# samples. Unigram features of the last 10 characters, no padding, two hidden layers
# of 30 units.
e = Experiment({
        'global': {
            'verbose': True,
            'comment': "no tag filter"
        },
        'featurizer': {
            'last_char': 10,
            'N': 1,
            'use_padding': False,
            'use_word_as_feature': False,
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 3,
            'max_sample_per_class': 3000,
            'max_lines': 200000,
        },
        'ffnn': {
            'layers': (30, 30),
            'batch_size': 1000,
            'optimizer': 'MomentumOptimizer',
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            'epochs': 5000,
        }
})
e.run_and_save()

500 epochs, cvalue: 0.06266084313392639
1000 epochs, cvalue: 0.06152767315506935
1500 epochs, cvalue: 0.059499263763427734
2000 epochs, cvalue: 0.05672406405210495
2500 epochs, cvalue: 0.05543501302599907
3000 epochs, cvalue: 0.05370552837848663
3500 epochs, cvalue: 0.051537346094846725
4000 epochs, cvalue: 0.05039873346686363
4500 epochs, cvalue: 0.04706898331642151
5000 epochs, cvalue: 0.0474327877163887
Training accuracy: 0.4212619960308075
Test accuracy: 0.4238280951976776
CPU times: user 26.9 s, sys: 1.49 s, total: 28.4 s
Wall time: 20.9 s


In [15]:
# Misclassified test samples of the experiment above, counted by gold tag.
errors = e.get_test_errors()
d = defaultdict(int)
for _word, tag in errors:
    d[tag] += 1
print(d)
# Shapes of the feature and label matrices.
print(e.featurizer.X_mtx.shape, e.featurizer.y_vec.shape)
# Number of samples per class in the whole data set; every label dict has the
# single key 'class'.
y = e.featurizer.y
classes = defaultdict(int)
for c in y:
    classes[c['class']] += 1
print(classes)

defaultdict(<class 'int'>, {'PUNCT': 50, 'NOUN': 292, 'POSTP': 109, 'DET': 5, 'ADJ': 263, 'CONJ': 308, 'PREP': 2, 'ONO': 1, 'UTT': 88, 'VERB': 172, 'ART': 11, 'PREV': 130, 'ADV': 221, 'UNKNOWN': 5, 'NUM': 113})
(30454, 579) (30454, 15)
defaultdict(<class 'int'>, {'PUNCT': 3000, 'NOUN': 3000, 'NUM': 3000, 'DET': 31, 'ADJ': 3000, 'CONJ': 3000, 'PREP': 17, 'ONO': 10, 'UTT': 695, 'VERB': 3000, 'ART': 3000, 'PREV': 2657, 'ADV': 3000, 'UNKNOWN': 44, 'POSTP': 3000})


In [16]:
%%time
# Five-way classification (VERB, NOUN, ADJ, CONJ, ADV) from unigram and bigram
# features of the last 10 characters, no padding, up to 3000 samples per class,
# two hidden layers of 30 units.
e = Experiment({
        'global': {
            'verbose': True,
        },
        'featurizer': {
            'last_char': 10,
            'N': 2,
            'use_padding': False,
            'include_smaller_ngrams': True,
            'tag_filter': ["VERB", "NOUN", "ADJ", "CONJ", "ADV"],
            'data_path': data_path,
            'encoding': 'latin2',
            'min_sample_per_class': 300,
            'max_sample_per_class': 3000,
            'max_lines': 200000,
        },
        'ffnn': {
            'layers': (30, 30),
            'batch_size': 1000,
            'optimizer': 'MomentumOptimizer',
            'optimizer_kwargs': {
                'learning_rate': 1,
                'momentum': .1,
            },
            'gpu_memory_fraction': 1,
            'epochs': 10000,
        }
})
e.run_and_save()

1000 epochs, cvalue: 0.13049420714378357
2000 epochs, cvalue: 0.11941518634557724
3000 epochs, cvalue: 0.11810123175382614
4000 epochs, cvalue: 0.10926355421543121
5000 epochs, cvalue: 0.1102619618177414
6000 epochs, cvalue: 0.09652210772037506
7000 epochs, cvalue: 0.09469000995159149
8000 epochs, cvalue: 0.09023628383874893
9000 epochs, cvalue: 0.08500447869300842
10000 epochs, cvalue: 0.0812755599617958
Training accuracy: 0.6969850659370422
Test accuracy: 0.6718963384628296
CPU times: user 3min 40s, sys: 4.48 s, total: 3min 45s
Wall time: 3min 29s


In [17]:
e.run_decision_tree()

array([ 0.78666667,  0.76733333,  0.79266667,  0.75266667,  0.80533333,
        0.81733333,  0.86266667,  0.832     ,  0.862     ,  0.948     ])