In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [2]:
import os

os.chdir('..')

In [68]:
from fgclassifier.utils import read_data, get_dataset

X_train, y_train = read_data(get_dataset('train_en'), flavor=None)
X_train.shape

(8000,)

In [66]:
np.linspace(0.01, 0.1, 10)

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ])

In [70]:
from IPython.display import clear_output

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

from fgclassifier.baseline import Baseline, MultiOutputClassifier
from fgclassifier.features import Count, Tfidf, SVD, SparseToDense
from fgclassifier.classifiers import LinearDiscriminantAnalysis

pipeline = Pipeline([
    ('vect', Count(min_df=0.02, max_df=0.99, ngram_range=(1, 6))),
    ('tfidf', Tfidf()),
    ('svd', SVD(n_components=1000)),
], memory='data/pipeline_cache')

Xt = pipeline.fit_transform(X_train)

clf = MultiOutputClassifier(LinearDiscriminantAnalysis())
parameters = {
    'estimator': [
        LinearDiscriminantAnalysis(n_components=n)
        for n in [100, 200, 300, 400, 500, None]
    ]
}

with joblib.parallel_backend('threading', n_jobs=1):
    grid_search = GridSearchCV(
        clf, parameters, cv=4,
        verbose=True, return_train_score=True
    )
    grid_search.fit(Xt, y_train)
    
clear_output()

2018-12-03 15:42:14,179 [INFO] Fit & Transform CountVectorizer...
2018-12-03 15:43:41,042 [INFO] Vocab Size: 2612
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
2018-12-03 15:46:23,434 [INFO] Fit & Transform TF-IDF...
2018-12-03 15:46:23,633 [INFO] Fit & Transform TruncatedSVD...


Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
2018-12-03 15:47:52,118 [INFO] [Validate]: F1 Scores
2018-12-03 15:47:52,126 [INFO]   location_traffic_convenience            	0.4413
2018-12-03 15:47:52,130 [INFO]   location_distance_from_business_district	0.3433
2018-12-03 15:47:52,133 [INFO]   location_easy_to_find                   	0.4572
2018-12-03 15:47:52,137 [INFO]   service_wait_time                       	0.4603
2018-12-03 15:47:52,141 [INFO]   service_waiters_attitude                	0.5457
2018-12-03 15:47:52,146 [INFO]   service_parking_convenience             	0.4578
2018-12-03 15:47:52,151 [INFO]   service_serving_speed                   	0.5353
2018-12-03 15:47:52,157 [INFO]   price_level                             	0.5415
2018-12-03 15:47:52,163 [INFO]   price_cost_effective                    	0.4082
2018-12-03 15:47:52,167 [INFO]   price_discount                          	0.4850
2018-12-03 15:47:52,172 [INFO]   environment_decoration 

2018-12-03 15:49:56,468 [INFO]   dish_portion                            	0.4342
2018-12-03 15:49:56,471 [INFO]   dish_taste                              	0.4724
2018-12-03 15:49:56,473 [INFO]   dish_look                               	0.3531
2018-12-03 15:49:56,477 [INFO]   dish_recommendation                     	0.4859
2018-12-03 15:49:56,479 [INFO]   others_overall_experience               	0.4825
2018-12-03 15:49:56,482 [INFO]   others_willing_to_consume_again         	0.4726
2018-12-03 15:49:56,737 [INFO] [Validate]: F1 Scores
2018-12-03 15:49:56,740 [INFO]   location_traffic_convenience            	0.8920
2018-12-03 15:49:56,744 [INFO]   location_distance_from_business_district	0.8593
2018-12-03 15:49:56,747 [INFO]   location_easy_to_find                   	0.7971
2018-12-03 15:49:56,750 [INFO]   service_wait_time                       	0.7548
2018-12-03 15:49:56,753 [INFO]   service_waiters_attitude                	0.7672
2018-12-03 15:49:56,757 [INFO]   service_parking_conveni

2018-12-03 15:52:06,827 [INFO]   environment_space                       	0.7262
2018-12-03 15:52:06,832 [INFO]   environment_cleaness                    	0.7556
2018-12-03 15:52:06,840 [INFO]   dish_portion                            	0.6824
2018-12-03 15:52:06,844 [INFO]   dish_taste                              	0.7671
2018-12-03 15:52:06,847 [INFO]   dish_look                               	0.6795
2018-12-03 15:52:06,852 [INFO]   dish_recommendation                     	0.7853
2018-12-03 15:52:06,855 [INFO]   others_overall_experience               	0.7752
2018-12-03 15:52:06,859 [INFO]   others_willing_to_consume_again         	0.7809
2018-12-03 15:53:15,817 [INFO] [Validate]: F1 Scores
2018-12-03 15:53:15,824 [INFO]   location_traffic_convenience            	0.4354
2018-12-03 15:53:15,827 [INFO]   location_distance_from_business_district	0.3381
2018-12-03 15:53:15,830 [INFO]   location_easy_to_find                   	0.4807
2018-12-03 15:53:15,833 [INFO]   service_wait_time      

2018-12-03 15:55:40,569 [INFO]   price_level                             	0.5620
2018-12-03 15:55:40,572 [INFO]   price_cost_effective                    	0.4396
2018-12-03 15:55:40,576 [INFO]   price_discount                          	0.4949
2018-12-03 15:55:40,578 [INFO]   environment_decoration                  	0.4506
2018-12-03 15:55:40,581 [INFO]   environment_noise                       	0.5520
2018-12-03 15:55:40,584 [INFO]   environment_space                       	0.4969
2018-12-03 15:55:40,587 [INFO]   environment_cleaness                    	0.4580
2018-12-03 15:55:40,590 [INFO]   dish_portion                            	0.4137
2018-12-03 15:55:40,593 [INFO]   dish_taste                              	0.4819
2018-12-03 15:55:40,596 [INFO]   dish_look                               	0.3355
2018-12-03 15:55:40,599 [INFO]   dish_recommendation                     	0.4476
2018-12-03 15:55:40,602 [INFO]   others_overall_experience               	0.4694
2018-12-03 15:55:40,606 [INF

2018-12-03 15:57:48,555 [INFO]   service_parking_convenience             	0.8965
2018-12-03 15:57:48,559 [INFO]   service_serving_speed                   	0.7864
2018-12-03 15:57:48,564 [INFO]   price_level                             	0.7250
2018-12-03 15:57:48,569 [INFO]   price_cost_effective                    	0.7820
2018-12-03 15:57:48,573 [INFO]   price_discount                          	0.7661
2018-12-03 15:57:48,578 [INFO]   environment_decoration                  	0.7725
2018-12-03 15:57:48,584 [INFO]   environment_noise                       	0.7676
2018-12-03 15:57:48,589 [INFO]   environment_space                       	0.7113
2018-12-03 15:57:48,593 [INFO]   environment_cleaness                    	0.7606
2018-12-03 15:57:48,597 [INFO]   dish_portion                            	0.6750
2018-12-03 15:57:48,601 [INFO]   dish_taste                              	0.7722
2018-12-03 15:57:48,606 [INFO]   dish_look                               	0.6955
2018-12-03 15:57:48,610 [INF

2018-12-03 16:01:36,869 [INFO]   location_traffic_convenience            	0.4413
2018-12-03 16:01:36,872 [INFO]   location_distance_from_business_district	0.3433
2018-12-03 16:01:36,876 [INFO]   location_easy_to_find                   	0.4572
2018-12-03 16:01:36,879 [INFO]   service_wait_time                       	0.4603
2018-12-03 16:01:36,882 [INFO]   service_waiters_attitude                	0.5457
2018-12-03 16:01:36,885 [INFO]   service_parking_convenience             	0.4578
2018-12-03 16:01:36,890 [INFO]   service_serving_speed                   	0.5353
2018-12-03 16:01:36,894 [INFO]   price_level                             	0.5415
2018-12-03 16:01:36,900 [INFO]   price_cost_effective                    	0.4082
2018-12-03 16:01:36,905 [INFO]   price_discount                          	0.4850
2018-12-03 16:01:36,908 [INFO]   environment_decoration                  	0.4371
2018-12-03 16:01:36,911 [INFO]   environment_noise                       	0.5254
2018-12-03 16:01:36,914 [INF

2018-12-03 16:03:48,936 [INFO]   dish_look                               	0.3531
2018-12-03 16:03:48,940 [INFO]   dish_recommendation                     	0.4859
2018-12-03 16:03:48,944 [INFO]   others_overall_experience               	0.4825
2018-12-03 16:03:48,947 [INFO]   others_willing_to_consume_again         	0.4726
2018-12-03 16:03:49,292 [INFO] [Validate]: F1 Scores
2018-12-03 16:03:49,296 [INFO]   location_traffic_convenience            	0.8920
2018-12-03 16:03:49,300 [INFO]   location_distance_from_business_district	0.8593
2018-12-03 16:03:49,305 [INFO]   location_easy_to_find                   	0.7971
2018-12-03 16:03:49,309 [INFO]   service_wait_time                       	0.7548
2018-12-03 16:03:49,315 [INFO]   service_waiters_attitude                	0.7672
2018-12-03 16:03:49,320 [INFO]   service_parking_convenience             	0.8946
2018-12-03 16:03:49,323 [INFO]   service_serving_speed                   	0.7935
2018-12-03 16:03:49,328 [INFO]   price_level            

2018-12-03 16:06:16,858 [INFO]   dish_portion                            	0.6824
2018-12-03 16:06:16,862 [INFO]   dish_taste                              	0.7671
2018-12-03 16:06:16,865 [INFO]   dish_look                               	0.6795
2018-12-03 16:06:16,870 [INFO]   dish_recommendation                     	0.7853
2018-12-03 16:06:16,883 [INFO]   others_overall_experience               	0.7752
2018-12-03 16:06:16,892 [INFO]   others_willing_to_consume_again         	0.7809
2018-12-03 16:07:29,536 [INFO] [Validate]: F1 Scores
2018-12-03 16:07:29,540 [INFO]   location_traffic_convenience            	0.4354
2018-12-03 16:07:29,543 [INFO]   location_distance_from_business_district	0.3381
2018-12-03 16:07:29,546 [INFO]   location_easy_to_find                   	0.4807
2018-12-03 16:07:29,549 [INFO]   service_wait_time                       	0.4355
2018-12-03 16:07:29,552 [INFO]   service_waiters_attitude                	0.5465
2018-12-03 16:07:29,555 [INFO]   service_parking_conveni

2018-12-03 16:09:49,014 [INFO]   price_discount                          	0.4949
2018-12-03 16:09:49,017 [INFO]   environment_decoration                  	0.4506
2018-12-03 16:09:49,020 [INFO]   environment_noise                       	0.5520
2018-12-03 16:09:49,023 [INFO]   environment_space                       	0.4969
2018-12-03 16:09:49,025 [INFO]   environment_cleaness                    	0.4580
2018-12-03 16:09:49,029 [INFO]   dish_portion                            	0.4137
2018-12-03 16:09:49,032 [INFO]   dish_taste                              	0.4819
2018-12-03 16:09:49,035 [INFO]   dish_look                               	0.3355
2018-12-03 16:09:49,038 [INFO]   dish_recommendation                     	0.4476
2018-12-03 16:09:49,041 [INFO]   others_overall_experience               	0.4694
2018-12-03 16:09:49,044 [INFO]   others_willing_to_consume_again         	0.4603
2018-12-03 16:09:49,350 [INFO] [Validate]: F1 Scores
2018-12-03 16:09:49,354 [INFO]   location_traffic_conven

2018-12-03 16:11:59,202 [INFO]   price_level                             	0.7250
2018-12-03 16:11:59,205 [INFO]   price_cost_effective                    	0.7820
2018-12-03 16:11:59,209 [INFO]   price_discount                          	0.7661
2018-12-03 16:11:59,213 [INFO]   environment_decoration                  	0.7725
2018-12-03 16:11:59,216 [INFO]   environment_noise                       	0.7676
2018-12-03 16:11:59,219 [INFO]   environment_space                       	0.7113
2018-12-03 16:11:59,223 [INFO]   environment_cleaness                    	0.7606
2018-12-03 16:11:59,227 [INFO]   dish_portion                            	0.6750
2018-12-03 16:11:59,232 [INFO]   dish_taste                              	0.7722
2018-12-03 16:11:59,236 [INFO]   dish_look                               	0.6955
2018-12-03 16:11:59,239 [INFO]   dish_recommendation                     	0.8197
2018-12-03 16:11:59,244 [INFO]   others_overall_experience               	0.7790
2018-12-03 16:11:59,248 [INF

In [71]:
grid_search.best_estimator_

MultiOutputClassifier(estimator=LinearDiscriminantAnalysis(n_components=100, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001),
           n_jobs=None)

In [78]:
import seaborn as sns

def extract_results(searcher):
    """Process the results for display"""
    results =  pd.melt(
        pd.DataFrame(searcher.cv_results_),
        id_vars=['param_estimator'],
        value_vars=[
            'mean_test_score',
            'mean_train_score',
        ],
        value_name='accuracy'
    )
    results['lda_n_components'] = [x.n_components for x in results['param_estimator']]
    results = results.drop(['param_estimator'], axis=1)
    print(results)
    results['train_or_test'] = np.where(results['variable'].str.contains('test'),
                                        'test', 'train')
    results['split'] = results['variable'].str.extract(r'(\d)')
    results['error'] = 1 - results['accuracy']
    results = results.drop(['variable', 'accuracy'], axis=1)
    return results

results = extract_results(grid_search)
results.sample(3)

            variable  accuracy  lda_n_components
0    mean_test_score  0.465300             100.0
1    mean_test_score  0.465300             200.0
2    mean_test_score  0.465300             300.0
3    mean_test_score  0.465300             400.0
4    mean_test_score  0.465300             500.0
5    mean_test_score  0.465300               NaN
6   mean_train_score  0.777689             100.0
7   mean_train_score  0.777689             200.0
8   mean_train_score  0.777689             300.0
9   mean_train_score  0.777689             400.0
10  mean_train_score  0.777689             500.0
11  mean_train_score  0.777689               NaN




Unnamed: 0,lda_n_components,train_or_test,split,error
11,,train,,0.222311
4,500.0,test,,0.5347
9,400.0,train,,0.222311


In [80]:
grid_search.cv_results_



{'mean_fit_time': array([62.54476374, 69.18490821, 69.82223725, 69.19842786, 70.91524029,
        63.7616508 ]),
 'std_fit_time': array([3.26797614, 3.14853783, 9.78655297, 4.76832626, 3.98148706,
        1.20111508]),
 'mean_score_time': array([0.21988481, 0.18551934, 0.22027987, 0.21431428, 0.20507872,
        0.1853742 ]),
 'std_score_time': array([0.06669929, 0.02336994, 0.05080801, 0.03288661, 0.03675586,
        0.02028173]),
 'param_estimator': masked_array(data=[LinearDiscriminantAnalysis(n_components=100, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=200, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=300, priors=None, shrinkage=None,
               solver='svd', store_covariance=False, tol=0.0001),
                    LinearDiscriminantAnalysis(n_components=400, 

Looks like the LDA paramater does not matter for LDA.