In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

# Set the threshold of each named logger: 'jieba' only reports WARN and
# above, while the 'fgclassifier' loggers report INFO and above.
for logger_name, logger_level in (
    ('jieba', logging.WARN),
    ('fgclassifier', logging.INFO),
    ('fgclassifier.features', logging.INFO),
):
    logging.getLogger(logger_name).setLevel(logger_level)

In [2]:
import os

# Work from the parent directory (presumably the project root) so that the
# relative paths used later in the notebook resolve from there.
# The target is resolved only once per kernel: a bare ``os.chdir('..')`` would
# climb one more level every time this cell is re-run.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('..')
os.chdir(PROJECT_ROOT)

In [3]:
from fgclassifier.utils import read_data, get_dataset

# Look up the 'train_en' dataset, then read it with no flavor applied.
train_dataset = get_dataset('train_en')
X_train, y_train = read_data(train_dataset, flavor=None)

# Last expression of the cell: show the shape of the loaded inputs.
X_train.shape

2018-12-04 16:11:18,295 [INFO] Reading /opt/storage/english_train.csv..


(8000,)

Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Process ForkPoolWorker-3:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 93, in run


Examine the usage of `np.linspace` and `np.logspace`

In [4]:
# Print three sample grids, one per line: ten evenly spaced values from 100
# to 1000, nine powers of ten from 1e-4 to 1e4, and eleven evenly spaced
# values from 0 to 1.
for sample_grid in (
    np.linspace(100, 1000, 10),
    np.logspace(-4, 4, 9),
    np.linspace(0, 1, 11),
):
    print(sample_grid)

[ 100.  200.  300.  400.  500.  600.  700.  800.  900. 1000.]
[1.e-04 1.e-03 1.e-02 1.e-01 1.e+00 1.e+01 1.e+02 1.e+03 1.e+04]
[0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


Build the features

In [5]:
from sklearn.pipeline import Pipeline
from fgclassifier.features import Count, Tfidf

# Two-stage feature extraction: the project's Count step (1- to 6-grams,
# filtered by min_df/max_df) followed by its Tfidf step.
feature_steps = [
    ('vect', Count(min_df=0.02, max_df=0.99, ngram_range=(1, 6))),
    ('tfidf', Tfidf()),
]

# `memory` makes the pipeline cache fitted transformers in this directory.
pipeline = Pipeline(feature_steps, memory='data/pipeline_cache')

# Fit on the training inputs and keep the transformed matrix.
Xt = pipeline.fit_transform(X_train)

2018-12-04 16:11:44,773 [INFO] 'pattern' package not found; tag filters are not available for English
2018-12-04 16:12:52,391 [INFO] Fit & Transform TF-IDF...


In [6]:
# Length of the fitted tfidf step's idf_ vector (presumably one entry per
# extracted feature).
tfidf_dim = len(pipeline.named_steps.tfidf.idf_)
print('Original data dimension: ', tfidf_dim)

Original data dimension:  2612


## Optimize classifier 

Reference: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html


In [12]:
from IPython.display import clear_output
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.neural_network import MLPClassifier

from fgclassifier.baseline import Baseline, MultiOutputClassifier

# Length of the fitted tfidf step's idf_ vector.
max_dim = len(pipeline.named_steps.tfidf.idf_)

# Base network, wrapped in the project's MultiOutputClassifier.
base_mlp = MLPClassifier(
    learning_rate='adaptive',
    learning_rate_init=0.001,
    early_stopping=True,
    random_state=1,
)
clf = MultiOutputClassifier(base_mlp)

# Candidate architectures: single hidden layers of 100, 200, ..., 1000 units
# plus two two-layer variants.
single_layer_sizes = [(width, ) for width in range(100, 1001, 100)]
two_layer_sizes = [(500, 100), (1000, 500)]

parameters = {
    'estimator__hidden_layer_sizes': single_layer_sizes + two_layer_sizes,
    'estimator__activation': ['logistic', 'tanh', 'relu'],
}

# Run the 5-fold grid search under a thread-based joblib backend (3 jobs).
with joblib.parallel_backend('threading', n_jobs=3):
    searcher = GridSearchCV(
        clf,
        parameters,
        cv=5,
        verbose=True,
        return_train_score=True,
    )
    searcher.fit(Xt, y_train)

# Wipe the verbose progress output once the search has finished.
clear_output()

In [None]:
# Important training results
# Only the last expression of a cell is rendered automatically, so the best
# estimator is displayed explicitly; as a bare expression it would be
# evaluated and silently discarded.
from IPython.display import display

display(searcher.best_estimator_)
searcher.cv_results_

Plot the results

In [9]:
import seaborn as sns

params = ['estimator__hidden_layer_sizes', 'estimator__activation']

def extract_results(searcher, param_names=params):
    """Reshape a search's ``cv_results_`` into a long table of error rates.

    Parameters
    ----------
    searcher : object
        Anything exposing a ``cv_results_`` mapping that ``pd.DataFrame``
        accepts and that contains ``mean_test_score`` and
        ``mean_train_score`` entries (e.g. a fitted ``GridSearchCV`` created
        with ``return_train_score=True``).
    param_names : list of str
        Names of the searched parameters. Each name is looked up among the
        ``cv_results_`` columns as given and, failing that, with the
        ``param_`` prefix under which ``GridSearchCV`` stores parameters.

    Returns
    -------
    pandas.DataFrame
        Two rows per candidate (test score first, then train score) with the
        columns ``train_or_test``, ``split`` and ``error``, where ``error``
        is ``1 - score``.

    Raises
    ------
    KeyError
        Raised by pandas when a parameter or score column cannot be found.
    """
    cv_table = pd.DataFrame(searcher.cv_results_)

    # GridSearchCV exposes each searched parameter as 'param_<name>', so the
    # bare names cannot be used as id columns directly.
    id_columns = [
        name if name in cv_table.columns else 'param_' + name
        for name in param_names
    ]

    results = pd.melt(
        cv_table,
        id_vars=id_columns,
        value_vars=['mean_test_score', 'mean_train_score'],
        value_name='accuracy',
    )
    # NOTE(review): the parameter columns are dropped again to keep the
    # returned columns unchanged; confirm whether plotting needs them.
    results = results.drop(id_columns, axis=1)

    results['train_or_test'] = np.where(
        results['variable'].str.contains('test'), 'test', 'train')
    # First digit found in the score name; missing when the name has none.
    results['split'] = results['variable'].str.extract(r'(\d)', expand=False)
    results['error'] = 1 - results['accuracy']
    return results.drop(['variable', 'accuracy'], axis=1)

# results = extract_results(searcher)
# results.sample(3)