In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)
logging.getLogger('fgclassifier.features').setLevel(logging.INFO)

In [2]:
import os

os.chdir('..')

In [3]:
from fgclassifier.utils import read_data, get_dataset

X_train, y_train = read_data(get_dataset('train_en'), flavor=None)
X_train.shape

2018-12-04 15:56:52,646 [INFO] Reading /opt/storage/english_train.csv..


(8000,)

Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Process ForkPoolWorker-4:
Process ForkPoolWorker-2:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/jesse/anaconda3/envs/idp/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._

Exam the usage of np.linspace/logspace

In [13]:
np.linspace(100, 1000, 10)

array([ 100.,  325.,  550.,  775., 1000.])

In [5]:
np.logspace(-4, 4, 9)

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04])

In [6]:
np.linspace(0, 1, 11)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

Build the features

In [7]:
from sklearn.pipeline import Pipeline
from fgclassifier.features import Count, Tfidf, SVD, SparseToDense

pipeline = Pipeline([
    ('vect', Count(min_df=0.02, max_df=0.99, ngram_range=(1, 6))),
    ('tfidf', Tfidf()),
    # LDA needs dense matrix, this is not needed
    ('densify', SparseToDense())
], memory='data/pipeline_cache')

Xt = pipeline.fit_transform(X_train)

2018-12-04 15:56:53,737 [INFO] 'pattern' package not found; tag filters are not available for English


In [11]:
len(pipeline.named_steps.vect.vocabulary_)

2612

Optimize classifier 

In [12]:
from IPython.display import clear_output
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

from fgclassifier.baseline import Baseline, MultiOutputClassifier
from fgclassifier.classifiers import LinearDiscriminantAnalysis as LDA

clf = MultiOutputClassifier(LDA())
choices = []

for solver in ['svd', 'lsgr', 'eigen']:
    for n_components in np.linspace(100, 1000, 10):
        choices.append(LDA(solver=solver,
                           n_components=n_components))
        # Add shrinkage
        if solver != 'svd':
            choices.append(LDA(solver=solver,
                               n_components=n_components,
                               shrinkage='auto'))
            for shrinkage in np.linspace(0, 1, 11):
                choices.append(LDA(solver=solver,
                                   n_components=n_components,
                                   shrinkage=shrinkage))
        
parameters = {'estimator': choices}

with joblib.parallel_backend('threading', n_jobs=5):
    searcher = GridSearchCV(
        clf, parameters, cv=5,
        verbose=True, return_train_score=True
    )
    searcher.fit(Xt, y_train)
    
clear_output()

Fitting 5 folds for each of 270 candidates, totalling 1350 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KeyboardInterrupt: 

In [None]:
grid_search.best_estimator_

In [78]:
import seaborn as sns

def extract_results(searcher):
    """Process the results for display"""
    results =  pd.melt(
        pd.DataFrame(searcher.cv_results_),
        id_vars=['param_estimator'],
        value_vars=[
            'mean_test_score',
            'mean_train_score',
        ],
        value_name='accuracy'
    )
    results['lda_n_components'] = [x.n_components for x in results['param_estimator']]
    results = results.drop(['param_estimator'], axis=1)
    print(results)
    results['train_or_test'] = np.where(results['variable'].str.contains('test'),
                                        'test', 'train')
    results['split'] = results['variable'].str.extract(r'(\d)')
    results['error'] = 1 - results['accuracy']
    results = results.drop(['variable', 'accuracy'], axis=1)
    return results

results = extract_results(grid_search)
results.sample(3)

            variable  accuracy  lda_n_components
0    mean_test_score  0.465300             100.0
1    mean_test_score  0.465300             200.0
2    mean_test_score  0.465300             300.0
3    mean_test_score  0.465300             400.0
4    mean_test_score  0.465300             500.0
5    mean_test_score  0.465300               NaN
6   mean_train_score  0.777689             100.0
7   mean_train_score  0.777689             200.0
8   mean_train_score  0.777689             300.0
9   mean_train_score  0.777689             400.0
10  mean_train_score  0.777689             500.0
11  mean_train_score  0.777689               NaN




Unnamed: 0,lda_n_components,train_or_test,split,error
11,,train,,0.222311
4,500.0,test,,0.5347
9,400.0,train,,0.222311


In [80]:
grid_search.cv_results_



{'mean_fit_time': array([62.54476374, 69.18490821, 69.82223725, 69.19842786, 70.91524029,
        63.7616508 ]),
 'std_fit_time': array([3.26797614, 3.14853783, 9.78655297, 4.76832626, 3.98148706,
        1.20111508]),
 'mean_score_time': array([0.21988481, 0.18551934, 0.22027987, 0.21431428, 0.20507872,
        0.1853742 ]),
 'std_score_time': array([0.06669929, 0.02336994, 0.05080801, 0.03288661, 0.03675586,
        0.02028173]),
 'param_estimator': masked_array(data=[LinearDiscriminantAnalysis(n_components=100, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=200, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=300, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=400, 

Looks like the LDA paramater does not matter for LDA.