In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [7]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

from fgclassifier.features import FeaturePipeline, logger
from fgclassifier.utils import read_data

X, y = read_data('data/english.csv', seg_words=False, sample_n=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# feature models with dependency specs
fm_spec = {
   'count': CountVectorizer(ngram_range=(1, 4), min_df=0.02, max_df=0.9),
   'tfidf': ['count', TfidfTransformer],
   'lsa_100': ['tfidf', TruncatedSVD(n_components=100)],
   'lsa_1k': ['tfidf', TruncatedSVD(n_components=1000)],
#    'lda_count_100': ['count', LatentDirichletAllocation(n_components=100)],
#    'lda_count_200': ['count', LatentDirichletAllocation(n_components=200)],
#    'lda_tfidf_100': ['tfidf', LatentDirichletAllocation(n_components=100)],
#    'lda_tfidf_200': ['tfidf', LatentDirichletAllocation(n_components=200)],
    
    # small vocabulary (removed more stop words)
#    'count_sv': CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.85),
#    'tfidf_sv': ['count_sv', TfidfTransformer],
#    'lsa_100_sv': ['tfidf_sv', TruncatedSVD(n_components=100)],
#    'lsa_1k_sv': ['tfidf_sv', TruncatedSVD(n_components=1000)],
#    'lda_count_100_sv': ['count_sv', LatentDirichletAllocation(n_components=100)],
#    'lda_count_200_sv': ['count_sv', LatentDirichletAllocation(n_components=200)],
#    'lda_tfidf_100_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=100)],
#    'lda_tfidf_200_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=200)],
}

# Cache trained fetures, we make this cache object
# so different steps can reuse previously trained features
fm = defaultdict(dict)

for name in fm_spec.keys():
    logger.info(f'Building features for {name}...')
    model = FeaturePipeline(name, spec=fm_spec, cache=fm)
    model.fit_transform(X_train)
    model.transform(X_test)

2018-11-20 15:24:08,573 [INFO] Reading data/english.csv..
2018-11-20 15:24:08,761 [INFO] Building features for count...
2018-11-20 15:24:32,314 [INFO] Building features for tfidf...
2018-11-20 15:24:32,315 [INFO]   count: fit_transform use cache.
2018-11-20 15:24:32,415 [INFO]   count: transform use cache.
2018-11-20 15:24:32,429 [INFO] Building features for lsa_100...
2018-11-20 15:24:32,430 [INFO]   tfidf: fit_transform use cache.
2018-11-20 15:24:33,573 [INFO]   tfidf: transform use cache.
2018-11-20 15:24:33,602 [INFO] Building features for lsa_1k...
2018-11-20 15:24:33,603 [INFO]   tfidf: fit_transform use cache.
2018-11-20 15:24:48,013 [INFO]   tfidf: transform use cache.


Exam the quality of the top terms:

In [15]:
from collections import Counter

print(X_train.shape, X_test.shape)

counter = fm['count']['model'].named_steps.count
counter_sv = fm['count_sv']['model'].named_steps.count_sv
for model in [counter, counter_sv]:
    print('\nmin_df: %.2f, max_df: %.2f, ngram_range: %s' % (
        model.min_df, model.max_df, model.ngram_range
    ))
    print('vocab size: %s' % len(model.vocabulary_))
    print('\n'.join(['%s \t %s' % (k, v) for k, v in
                     Counter(model.vocabulary_).most_common()[:10]]))

(7500,) (2500,)

min_df: 0.01, max_df: 0.99, ngram_range: (1, 4)
vocab size: 5306
yuan to 	 5305
yuan the 	 5304
yuan it 	 5303
yuan is 	 5302
yuan for 	 5301
yuan and 	 5300
yuan 	 5299
yourself 	 5298
your own 	 5297
your 	 5296

min_df: 0.01, max_df: 0.85, ngram_range: (1, 4)
vocab size: 5297
yuan to 	 5296
yuan the 	 5295
yuan it 	 5294
yuan is 	 5293
yuan for 	 5292
yuan and 	 5291
yuan 	 5290
yourself 	 5289
your own 	 5288
your 	 5287


In [3]:
fm['tfidf']['model'].named_steps

{'count': FeaturePipeline(cache={'model': FeaturePipeline(cache={...}, spec=None,
         steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.99, max_features=None, min_df=0.01,
         ngram_range=(1, 4), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))])},
         spec=None,
         steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.99, max_features=None, min_df=0.01,
         ngram_range=(1, 4), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))]),
 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=F

## The Very Basic TF-IDF + LDA classifier

In [16]:
from fgclassifier.baseline import Baseline
from fgclassifier.classifiers import LDA

# Linear Discriminant Analysis
model = Baseline(classifier=LDA)
model.fit(fm['lsa_1k']['train'], y_train)
model.score(fm['lsa_1k']['test'], y_test)

2018-11-19 23:51:59,870 [INFO] [Validate]: F1 Scores
2018-11-19 23:51:59,874 [INFO]   location_traffic_convenience            	0.4409
  'precision', 'predicted', average, warn_for)
2018-11-19 23:51:59,880 [INFO]   location_distance_from_business_district	0.3275
2018-11-19 23:51:59,883 [INFO]   location_easy_to_find                   	0.4765
2018-11-19 23:51:59,886 [INFO]   service_wait_time                       	0.4311
2018-11-19 23:51:59,889 [INFO]   service_waiters_attitude                	0.5667
2018-11-19 23:51:59,897 [INFO]   service_parking_convenience             	0.4789
2018-11-19 23:51:59,901 [INFO]   service_serving_speed                   	0.5168
2018-11-19 23:51:59,908 [INFO]   price_level                             	0.5529
2018-11-19 23:51:59,912 [INFO]   price_cost_effective                    	0.4438
2018-11-19 23:51:59,924 [INFO]   price_discount                          	0.4932
2018-11-19 23:51:59,928 [INFO]   environment_decoration                  	0.4533
2018-11-1

0.4672456343211369

## Search for the Best Feature + Classifier Combination

In [3]:
# Run for all classifiers and feature builders
all_avg_scores, all_scores = defaultdict(dict), defaultdict(dict)

In [10]:
from fgclassifier.train import fm_cross_check

conf = {
    'fm_cache': fm,
    'y_train': y_train,
    'y_test': y_test,
    'results': {
        'avg': all_avg_scores,
        'all': all_scores
    }
}

# We'd only need to run the dummy models on one feature model,
# as they do not care about the features
fm_cross_check(
    ['tfidf'],
    ['DummyStratified', 'DummyMostFrequent'], **conf)

# Naive Bayes models cannot handle negative values, so we pass
# in only tfidf features
fm_cross_check(
    ['tfidf', # 'tfidf_sv'
    ],
    ['MultinomialNB', 'ComplementNB'], **conf)

# All other models can run on many classifiers
results = fm_cross_check(
    ['lsa_100', 'lsa_1k',
     'lda_count_100', 'lda_count_200',
     'lda_tfidf_100', 'lda_tfidf_200',
     'lsa_100_sv', 'lsa_1k_sv',
     'lda_count_100_sv', 'lda_count_200_sv',
     'lda_tfidf_100_sv', 'lda_tfidf_200_sv',
    ],
    ['LDA', 'LinearSVC', 'Logistic', 'Ridge', 'ExtraTree'], **conf)

2018-11-20 15:37:01,944 [INFO] Train for tfidf -> DummyStratified...
2018-11-20 15:37:01,971 [INFO] [Validate]: F1 Scores
2018-11-20 15:37:01,974 [INFO]   location_traffic_convenience            	0.2433
2018-11-20 15:37:01,978 [INFO]   location_distance_from_business_district	0.2396
2018-11-20 15:37:01,982 [INFO]   location_easy_to_find                   	0.2630
2018-11-20 15:37:01,987 [INFO]   service_wait_time                       	0.2516
2018-11-20 15:37:01,993 [INFO]   service_waiters_attitude                	0.2512
2018-11-20 15:37:01,996 [INFO]   service_parking_convenience             	0.2368
2018-11-20 15:37:02,000 [INFO]   service_serving_speed                   	0.2504
2018-11-20 15:37:02,004 [INFO]   price_level                             	0.2446
2018-11-20 15:37:02,007 [INFO]   price_cost_effective                    	0.2555
2018-11-20 15:37:02,011 [INFO]   price_discount                          	0.2536
2018-11-20 15:37:02,014 [INFO]   environment_decoration             

2018-11-20 15:37:02,743 [INFO]   dish_look                               	0.3400
2018-11-20 15:37:02,745 [INFO]   dish_recommendation                     	0.3600
2018-11-20 15:37:02,747 [INFO]   others_overall_experience               	0.4303
2018-11-20 15:37:02,749 [INFO]   others_willing_to_consume_again         	0.4011
2018-11-20 15:37:02,750 [INFO] ---------------------------------------------------
2018-11-20 15:37:02,751 [INFO] 【tfidf -> ComplementNB】: 0.3985
2018-11-20 15:37:02,752 [INFO] ---------------------------------------------------
2018-11-20 15:37:02,755 [INFO] Train for lsa_100 -> LDA...
2018-11-20 15:37:05,820 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-20 15:37:05,822 [INFO]   location_traffic_convenience            	0.3891
2018-11-20 15:37:05,825 [INFO]   location_distance_from_business_district	0.2679
2018-11-20 15:37:05,827 [INFO]   location_easy_to_find                   	0.3810
2018-11-20 15:37:05,830 [INFO]   service_wait

2018-11-20 15:37:22,522 [INFO]   price_level                             	0.3543
2018-11-20 15:37:22,524 [INFO]   price_cost_effective                    	0.2893
2018-11-20 15:37:22,526 [INFO]   price_discount                          	0.4171
2018-11-20 15:37:22,528 [INFO]   environment_decoration                  	0.3586
2018-11-20 15:37:22,530 [INFO]   environment_noise                       	0.3325
2018-11-20 15:37:22,532 [INFO]   environment_space                       	0.3188
2018-11-20 15:37:22,534 [INFO]   environment_cleaness                    	0.3482
2018-11-20 15:37:22,536 [INFO]   dish_portion                            	0.3084
2018-11-20 15:37:22,538 [INFO]   dish_taste                              	0.3364
2018-11-20 15:37:22,540 [INFO]   dish_look                               	0.2408
2018-11-20 15:37:22,543 [INFO]   dish_recommendation                     	0.2449
2018-11-20 15:37:22,544 [INFO]   others_overall_experience               	0.3829
2018-11-20 15:37:22,546 [INF

2018-11-20 15:39:41,733 [INFO]   location_distance_from_business_district	0.2926
2018-11-20 15:39:41,735 [INFO]   location_easy_to_find                   	0.3760
2018-11-20 15:39:41,737 [INFO]   service_wait_time                       	0.3094
2018-11-20 15:39:41,740 [INFO]   service_waiters_attitude                	0.5399
2018-11-20 15:39:41,742 [INFO]   service_parking_convenience             	0.2944
2018-11-20 15:39:41,744 [INFO]   service_serving_speed                   	0.4010
2018-11-20 15:39:41,746 [INFO]   price_level                             	0.4941
2018-11-20 15:39:41,748 [INFO]   price_cost_effective                    	0.3364
2018-11-20 15:39:41,751 [INFO]   price_discount                          	0.4509
2018-11-20 15:39:41,753 [INFO]   environment_decoration                  	0.3948
2018-11-20 15:39:41,756 [INFO]   environment_noise                       	0.3630
2018-11-20 15:39:41,759 [INFO]   environment_space                       	0.4053
2018-11-20 15:39:41,761 [INF

KeyError: 'train'

In [5]:
all_avg_scores

defaultdict(dict, {})

## Conclusion

- `ComplementNB` performs much better than a simple MultinomialNB, because our class labels are mostly unbalanced.
- `LatentDirichletAllocation` topics as features are not suitable for our classification problem, as features are often collinear. They often fare no better than the dummy classifier where we simply return the most frequent labels.
- LSA (Latent Semantic Analysis) shows a much more promising outcome, especially when combined with Linear Discriminant Analysis or SVC.
- A smaller vocabulary had marginal impact on the performance, what matters more is the number of SVD components in LSA. The higher the better, as more compoents will capture more information. LDA and SVC both perform well in high dimensional space.
- Basically SVD makes each feature (component) more indendent with each other, making LDA and SVC easier to come up with good fittings.
- Tree based models are not particularly useful. But the results may be different had we tuned the tree structure more.

## Next Steps

Required:

- Tune hyperparamters for `ComplementNB`, `TruncatedSVD`, `LinearDiscriminantAnalysis` and `SVC`/`LinearSVC`. Try different kernel functions.
- Test some boosting methods, especially [xgboost](https://xgboost.readthedocs.io/en/latest/).
- Test word embedding as features.

Optional:

- Possibly use different classifier for different labels.
- Test two step predictions: first run binary prediction for "mentioned" vs "not mentioned", i.e., -2 vs (-1, 0, 1), then predict (-1, 0, 1).
    - This could happen as either [ClassifierChain](https://scikit-learn.org/stable/modules/multiclass.html#classifierchain) or separate steps.