In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)
logging.getLogger('fgclassifier.features').setLevel(logging.INFO)

In [2]:
import os

os.chdir('..')

In [3]:
from fgclassifier.utils import read_data, get_dataset

X_train, y_train = read_data(get_dataset('train_en'), flavor=None)
X_train.shape

2018-12-04 16:04:08,953 [INFO] Reading /opt/storage/english_train.csv..


(8000,)

Exam the usage of np.linspace/logspace

In [4]:
np.linspace(100, 1000, 10)

array([ 100.,  200.,  300.,  400.,  500.,  600.,  700.,  800.,  900.,
       1000.])

In [5]:
np.logspace(-4, 4, 9)

array([1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03,
       1.e+04])

In [6]:
np.linspace(0, 1, 11)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

Build the features

In [7]:
from sklearn.pipeline import Pipeline
from fgclassifier.features import Count, Tfidf

pipeline = Pipeline([
    ('vect', Count(min_df=0.02, max_df=0.99, ngram_range=(1, 6))),
    ('tfidf', Tfidf()),
], memory='data/pipeline_cache')

Xt = pipeline.fit_transform(X_train)

2018-12-04 16:04:44,306 [INFO] 'pattern' package not found; tag filters are not available for English
2018-12-04 16:05:07,442 [INFO] Fit & Transform TF-IDF...


In [11]:
print('Original data dimension: ',
      len(pipeline.named_steps.tfidf.idf_))

Original data dimension:  2612


## Optimize classifier 

We use TruncatedSVD for dimension reduction, then pass the reduced dataset
to LDA. This is more efficient as this way we can handle sparse matrix whereas
LDA requires dense matrix input.

In [None]:
from IPython.display import clear_output
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD

from fgclassifier.baseline import Baseline, MultiOutputClassifier
from fgclassifier.classifiers import LinearDiscriminantAnalysis as LDA

max_dim = len(pipeline.named_steps.tfidf.idf_)
clf = Pipeline([
    ('svd', TruncatedSVD(n_components=100)),
    ('clf', MultiOutputClassifier(LDA()))
])

parameters = {
    'svd__n_components': np.linspace(100, max_dim, 10, dtype=int)
}

with joblib.parallel_backend('threading', n_jobs=3):
    searcher = GridSearchCV(
        clf, parameters, cv=5,
        verbose=True,
        return_train_score=True
    )
    searcher.fit(Xt, y_train)
    
clear_output()

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=3)]: Using backend ThreadingBackend with 3 concurrent workers.
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
2018-12-04 16:10:06,440 [INFO]  F1 Score: 0.37129504517796275
  'precision', 'predicted', average, warn_for)
2018-12-04 16:10:06,515 [INFO]  F1 Score: 0.382266396282607
  'precision', 'predicted', average, warn_for)
2018-12-04 16:10:06,779 [INFO]  F1 Score: 0.3723449020129853
2018-12-04 16:10:06,787 [INFO]  F1 Score: 0.41948139230218456
2018-12-04 16:10:06,815 [INFO]  F1 Score: 0.42007612453371956
2018-12-04 16:10:07,019 [INFO]  F1 Score: 0.4160763902226594
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
2018-12-04 16:10:12,459 [INFO]  F1 Score: 0.3806552386446982
  'precision', 'predicted', average, warn_for)
2018-12-04 16:10:12,530 [INFO]  F1 Score: 0.3785826356210036
  'precision', 'predicted', average, warn_for)
2018-12

2018-12-04 16:24:49,442 [INFO]  F1 Score: 0.8382834440962956
  'precision', 'predicted', average, warn_for)
2018-12-04 16:26:59,344 [INFO]  F1 Score: 0.46440418605427836
2018-12-04 16:27:08,616 [INFO]  F1 Score: 0.8392221401724609
  'precision', 'predicted', average, warn_for)
2018-12-04 16:27:45,169 [INFO]  F1 Score: 0.4583670648414281
2018-12-04 16:27:48,043 [INFO]  F1 Score: 0.8388945046232917


In [None]:
grid_search.best_estimator_

In [78]:
import seaborn as sns

def extract_results(searcher):
    """Process the results for display"""
    results =  pd.melt(
        pd.DataFrame(searcher.cv_results_),
        id_vars=['param_estimator'],
        value_vars=[
            'mean_test_score',
            'mean_train_score',
        ],
        value_name='accuracy'
    )
    results['lda_n_components'] = [x.n_components for x in results['param_estimator']]
    results = results.drop(['param_estimator'], axis=1)
    print(results)
    results['train_or_test'] = np.where(results['variable'].str.contains('test'),
                                        'test', 'train')
    results['split'] = results['variable'].str.extract(r'(\d)')
    results['error'] = 1 - results['accuracy']
    results = results.drop(['variable', 'accuracy'], axis=1)
    return results

results = extract_results(grid_search)
results.sample(3)

            variable  accuracy  lda_n_components
0    mean_test_score  0.465300             100.0
1    mean_test_score  0.465300             200.0
2    mean_test_score  0.465300             300.0
3    mean_test_score  0.465300             400.0
4    mean_test_score  0.465300             500.0
5    mean_test_score  0.465300               NaN
6   mean_train_score  0.777689             100.0
7   mean_train_score  0.777689             200.0
8   mean_train_score  0.777689             300.0
9   mean_train_score  0.777689             400.0
10  mean_train_score  0.777689             500.0
11  mean_train_score  0.777689               NaN




Unnamed: 0,lda_n_components,train_or_test,split,error
11,,train,,0.222311
4,500.0,test,,0.5347
9,400.0,train,,0.222311


In [80]:
grid_search.cv_results_



{'mean_fit_time': array([62.54476374, 69.18490821, 69.82223725, 69.19842786, 70.91524029,
        63.7616508 ]),
 'std_fit_time': array([3.26797614, 3.14853783, 9.78655297, 4.76832626, 3.98148706,
        1.20111508]),
 'mean_score_time': array([0.21988481, 0.18551934, 0.22027987, 0.21431428, 0.20507872,
        0.1853742 ]),
 'std_score_time': array([0.06669929, 0.02336994, 0.05080801, 0.03288661, 0.03675586,
        0.02028173]),
 'param_estimator': masked_array(data=[LinearDiscriminantAnalysis(n_components=100, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=200, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=300, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=400, 

Looks like the LDA paramater does not matter for LDA.