In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [3]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

from fgclassifier.features import FeaturePipeline, logger
from fgclassifier.utils import read_data

X, y = read_data('data/english.csv', seg_words=False, sample_n=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# feature models with dependency specs
fm_spec = {
   'count': CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.99),
   'tfidf': ['count', TfidfTransformer],
   'lsa_100': ['tfidf', TruncatedSVD(n_components=100)],
   'lsa_1k': ['tfidf', TruncatedSVD(n_components=1000)],
   'lda_count_100': ['count', LatentDirichletAllocation(n_components=100)],
   'lda_count_200': ['count', LatentDirichletAllocation(n_components=200)],
   'lda_tfidf_100': ['tfidf', LatentDirichletAllocation(n_components=100)],
   'lda_tfidf_200': ['tfidf', LatentDirichletAllocation(n_components=200)],
    
    # small vocabulary (removed more stop words)
   'count_sv': [CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.85)],
   'tfidf_sv': ['count_sv', TfidfTransformer],
   'lsa_100_sv': ['tfidf_sv', TruncatedSVD(n_components=100)],
   'lsa_1k_sv': ['tfidf_sv', TruncatedSVD(n_components=1000)],
   'lda_count_100_sv': ['count_sv', LatentDirichletAllocation(n_components=100)],
   'lda_count_200_sv': ['count_sv', LatentDirichletAllocation(n_components=200)],
   'lda_tfidf_100_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=100)],
   'lda_tfidf_200_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=200)],
}

# Cache trained fetures, we make this cache object
# so different steps can reuse previously trained features
fm = defaultdict(dict)

for name in fm_spec.keys():
    logger.info(f'Building features for {name}...')
    model = FeaturePipeline(name, spec=fm_spec, cache=fm)
    model.fit_transform(X_train)
    model.transform(X_test)

2018-11-19 23:15:38,945 [INFO] Reading data/english.csv..
2018-11-19 23:15:39,167 [INFO] Building features for count...
2018-11-19 23:16:06,107 [INFO] Building features for tfidf...
2018-11-19 23:16:06,108 [INFO]   count: fit_transform use cache.
2018-11-19 23:16:06,239 [INFO]   count: transform use cache.
2018-11-19 23:16:06,262 [INFO] Building features for lsa_100...
2018-11-19 23:16:06,263 [INFO]   tfidf: fit_transform use cache.
2018-11-19 23:16:08,077 [INFO]   tfidf: transform use cache.
2018-11-19 23:16:08,130 [INFO] Building features for lsa_1k...
2018-11-19 23:16:08,131 [INFO]   tfidf: fit_transform use cache.
2018-11-19 23:16:28,601 [INFO]   tfidf: transform use cache.
2018-11-19 23:16:28,964 [INFO] Building features for lda_count_100...
2018-11-19 23:16:28,965 [INFO]   count: fit_transform use cache.
2018-11-19 23:21:06,303 [INFO]   count: transform use cache.
2018-11-19 23:21:11,277 [INFO] Building features for lda_count_200...
2018-11-19 23:21:11,279 [INFO]   count: fit_tra

Exam the quality of the top terms:

In [15]:
from collections import Counter

print(X_train.shape, X_test.shape)

counter = fm['count']['model'].named_steps.count
counter_sv = fm['count_sv']['model'].named_steps.count_sv
for model in [counter, counter_sv]:
    print('\nmin_df: %.2f, max_df: %.2f, ngram_range: %s' % (
        model.min_df, model.max_df, model.ngram_range
    ))
    print('vocab size: %s' % len(model.vocabulary_))
    print('\n'.join(['%s \t %s' % (k, v) for k, v in
                     Counter(model.vocabulary_).most_common()[:10]]))

(7500,) (2500,)

min_df: 0.01, max_df: 0.99, ngram_range: (1, 4)
vocab size: 5306
yuan to 	 5305
yuan the 	 5304
yuan it 	 5303
yuan is 	 5302
yuan for 	 5301
yuan and 	 5300
yuan 	 5299
yourself 	 5298
your own 	 5297
your 	 5296

min_df: 0.01, max_df: 0.85, ngram_range: (1, 4)
vocab size: 5297
yuan to 	 5296
yuan the 	 5295
yuan it 	 5294
yuan is 	 5293
yuan for 	 5292
yuan and 	 5291
yuan 	 5290
yourself 	 5289
your own 	 5288
your 	 5287


## The Very Basic TF-IDF + LDA classifier

In [16]:
from fgclassifier.baseline import Baseline
from fgclassifier.classifiers import LDA

# Linear Discriminant Analysis
model = Baseline(classifier=LDA)
model.fit(fm['lsa_1k']['train'], y_train)
model.score(fm['lsa_1k']['test'], y_test)

2018-11-19 23:51:59,870 [INFO] [Validate]: F1 Scores
2018-11-19 23:51:59,874 [INFO]   location_traffic_convenience            	0.4409
  'precision', 'predicted', average, warn_for)
2018-11-19 23:51:59,880 [INFO]   location_distance_from_business_district	0.3275
2018-11-19 23:51:59,883 [INFO]   location_easy_to_find                   	0.4765
2018-11-19 23:51:59,886 [INFO]   service_wait_time                       	0.4311
2018-11-19 23:51:59,889 [INFO]   service_waiters_attitude                	0.5667
2018-11-19 23:51:59,897 [INFO]   service_parking_convenience             	0.4789
2018-11-19 23:51:59,901 [INFO]   service_serving_speed                   	0.5168
2018-11-19 23:51:59,908 [INFO]   price_level                             	0.5529
2018-11-19 23:51:59,912 [INFO]   price_cost_effective                    	0.4438
2018-11-19 23:51:59,924 [INFO]   price_discount                          	0.4932
2018-11-19 23:51:59,928 [INFO]   environment_decoration                  	0.4533
2018-11-1

0.4672456343211369

## Search for the Best Feature + Classifier Combination

In [17]:
# Run for all classifiers and feature builders
all_avg_scores, all_scores = defaultdict(dict), defaultdict(dict)

In [27]:
from IPython.display import clear_output

from sklearn.preprocessing import Normalizer

from collections import defaultdict
from fgclassifier.baseline import logger
from fgclassifier.features import FeaturePipeline, SparseToSense
from fgclassifier.features import LatentDirichletAllocation, SVD
from fgclassifier import classifiers

def fm_cross_check(fmns, clss):
    """Feature Model Cross Check"""
    # Test for all Feature models
    for fmn in fmns:
        logger.info(f'======== Feature Model: {fmn} =========')
        cache = fm[fmn]
        Xtrain, Xtest = cache['train'], cache['test']
        # Test on all major classifiers
        for cls in clss:
            logger.info(f'Train for {fmn} -> {cls}...')
            Classifier = getattr(classifiers, cls)
            model = Baseline(name=cls, classifier=Classifier)
            model.fit(Xtrain, y_train)
            all_scores[fmn][cls] = model.scores(Xtest, y_test)
            f1 = all_avg_scores[fmn][cls] = np.mean(all_scores[fmn][cls])
            logger.info('---------------------------------------------------')
            logger.info(f'【{fmn} -> {cls}】: {f1:.4f}')
            logger.info('---------------------------------------------------')
            

# We'd only need to run the dummy models on one feature model,
# as they do not care about the features
fm_cross_check(
    ['tfidf'],
    ['DummyStratified', 'DummyMostFrequent']
)

# Naive Bayes models cannot handle negative values, so we pass
# in only tfidf features
fm_cross_check(
    ['tfidf', 'tfidf_sv'],
    ['MultinomialNB', 'ComplementNB']
)

# All other models can run on many classifiers
fm_cross_check(
    ['lsa_100', 'lsa_1k',
     'lda_count_100', 'lda_count_200',
     'lda_tfidf_100', 'lda_tfidf_200',
     'lsa_100_sv', 'lsa_1k_sv',
     'lda_count_100_sv', 'lda_count_200_sv',
     'lda_tfidf_100_sv', 'lda_tfidf_200_sv',
    ],
    
    ['LDA', 'LinearSVC', 'Logistic', 'Ridge', 'ExtraTree']
)

# clear_output()

2018-11-20 00:28:32,134 [INFO] Train for tfidf -> DummyStratified...
2018-11-20 00:28:32,163 [INFO] [Validate]: F1 Scores
2018-11-20 00:28:32,166 [INFO]   location_traffic_convenience            	0.2476
2018-11-20 00:28:32,168 [INFO]   location_distance_from_business_district	0.2610
2018-11-20 00:28:32,172 [INFO]   location_easy_to_find                   	0.2475
2018-11-20 00:28:32,176 [INFO]   service_wait_time                       	0.2546
2018-11-20 00:28:32,179 [INFO]   service_waiters_attitude                	0.2424
2018-11-20 00:28:32,182 [INFO]   service_parking_convenience             	0.2668
2018-11-20 00:28:32,185 [INFO]   service_serving_speed                   	0.2485
2018-11-20 00:28:32,189 [INFO]   price_level                             	0.2424
2018-11-20 00:28:32,191 [INFO]   price_cost_effective                    	0.2600
2018-11-20 00:28:32,195 [INFO]   price_discount                          	0.2300
2018-11-20 00:28:32,198 [INFO]   environment_decoration             

2018-11-20 00:28:33,135 [INFO]   dish_look                               	0.3417
2018-11-20 00:28:33,138 [INFO]   dish_recommendation                     	0.3483
2018-11-20 00:28:33,140 [INFO]   others_overall_experience               	0.4336
2018-11-20 00:28:33,143 [INFO]   others_willing_to_consume_again         	0.4220
2018-11-20 00:28:33,144 [INFO] ---------------------------------------------------
2018-11-20 00:28:33,148 [INFO] 【tfidf -> ComplementNB】: 0.4070
2018-11-20 00:28:33,150 [INFO] ---------------------------------------------------
2018-11-20 00:28:33,155 [INFO] Train for tfidf_sv -> MultinomialNB...
2018-11-20 00:28:33,536 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-20 00:28:33,539 [INFO]   location_traffic_convenience            	0.3315
2018-11-20 00:28:33,541 [INFO]   location_distance_from_business_district	0.2434
2018-11-20 00:28:33,544 [INFO]   location_easy_to_find                   	0.3108
2018-11-20 00:28:33,546 [INFO]   s

In [30]:
all_avg_scores

defaultdict(dict,
            {'tfidf': {'DummyStratified': 0.24850453508887155,
              'DummyMostFrequent': 0.20004133859221804,
              'MultinomialNB': 0.2851317119840156,
              'ComplementNB': 0.4070130652449275},
             'tfidf_sv': {'MultinomialNB': 0.2881810332309514,
              'ComplementNB': 0.40593509859996635},
             'lsa_100': {'LDA': 0.37994477253680514,
              'LinearSVC': 0.3380442087162074,
              'Logistic': 0.33412824892379456,
              'Ridge': 0.323475620903369,
              'ExtraTree': 0.2193105117571359},
             'lsa_1k': {'LDA': 0.4672456343211369,
              'LinearSVC': 0.4170892428737959,
              'Logistic': 0.37443974704251676,
              'Ridge': 0.363214429438162,
              'ExtraTree': 0.20935807927874767},
             'lda_count_100': {'LDA': 0.3205497662606225,
              'LinearSVC': 0.2849517453749279,
              'Logistic': 0.27708563669736763,
              'Ridge'

## Conclusion

- `ComplementNB` performs much better than a simple MultinomialNB, because our class labels are mostly unbalanced.
- `LatentDirichletAllocation` topics as features are not suitable for our classification problem, as features are often collinear. They often fare no better than the dummy classifier where we simply return the most frequent labels.
- LSA (Latent Semantic Analysis) shows a much more promising outcome, especially when combined with Linear Discriminant Analysis or SVC.
- A smaller vocabulary had marginal impact on the performance, what matters more is the number of SVD components in LSA. The higher the better, as more compoents will capture more information. LDA and SVC both perform well in high dimensional space.
- Basically SVD makes each feature (component) more indendent with each other, making LDA and SVC easier to come up with good fittings.
- Tree based models are not particularly useful. But the results may be different had we tuned the tree structure more.

## Next Steps

- Tune hyperparamters for `ComplementNB`, `TruncatedSVD`, `LinearDiscriminantAnalysis` and `SVC`/`LinearSVC`. Try different kernel functions.
- Test some boosting methods, especially [xgboost](https://xgboost.readthedocs.io/en/latest/).
- Possibly use different classifier for different labels.
- Test two step predictions: first run binary prediction for "mentioned" vs "not mentioned", i.e., -2 vs (-1, 0, 1), then predict (-1, 0, 1).
    - This could happen as either [ClassifierChain](https://scikit-learn.org/stable/modules/multiclass.html#classifierchain) or separate steps.
- Test word embedding as features.