In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

# Per-logger verbosity: keep jieba quiet (warnings and above only) while
# still showing the informational messages of our own package.
for logger_name, level in (('jieba', logging.WARN),
                           ('fgclassifier', logging.INFO)):
    logging.getLogger(logger_name).setLevel(level)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


This notebook shows how to use our baseline model.
It also demonstrates how to test different feature models (i.e.,
different ways of building the features) at the same time.

We will mostly use the Google-translated English dataset for this
demonstration.

In [57]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

from fgclassifier.features import FeaturePipeline, logger, SparseToDense 
from fgclassifier.utils import read_data

# One seed for every stochastic step in this cell (the train/test split and
# the randomized SVD solvers), so that re-running the notebook from a fresh
# kernel yields the same split and the same LSA features.
RANDOM_STATE = 1

X, y = read_data('data/english.csv', flavor=None)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=RANDOM_STATE)

# Feature models with dependency specs.
# A bare estimator is a root feature model built directly from the text.
# A list is [name of the feature model it depends on, step to apply on top].
# NOTE(review): TfidfTransformer is passed as a class rather than an instance;
# presumably FeaturePipeline instantiates it -- confirm in fgclassifier.features.
fm_spec = {
    'count': CountVectorizer(ngram_range=(1, 4), min_df=0.001, stop_words='english'),
    'tfidf': ['count', TfidfTransformer],
    'lsa_200': ['tfidf', TruncatedSVD(n_components=200, random_state=RANDOM_STATE)],
    'lsa_500': ['tfidf', TruncatedSVD(n_components=500, random_state=RANDOM_STATE)],
    'lsa_1k': ['tfidf', TruncatedSVD(n_components=1000, random_state=RANDOM_STATE)],
#    'lda_count_100': ['count', LatentDirichletAllocation(n_components=100)],
#    'lda_tfidf_100': ['tfidf', LatentDirichletAllocation(n_components=100)],

    # small vocabulary (removed more stop words)
    'count_sv': CountVectorizer(ngram_range=(1, 5), min_df=0.02, max_df=0.99,
                                stop_words='english'),
    'tfidf_sv': ['count_sv', TfidfTransformer],
    'tfidf_sv_dense': ['tfidf_sv', SparseToDense()],
    'lsa_100_sv': ['tfidf_sv', TruncatedSVD(n_components=100, random_state=RANDOM_STATE)],
    'lsa_500_sv': ['tfidf_sv', TruncatedSVD(n_components=500, random_state=RANDOM_STATE)],
#    'lda_count_100_sv': ['count_sv', LatentDirichletAllocation(n_components=100)],
#    'lda_tfidf_100_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=100)],
}

2018-11-20 22:54:56,383 [INFO] Reading data/english.csv..


In [None]:
# Cache of trained features. We create this cache object once so that
# different steps can reuse previously trained features.
# Note: re-running this cell rebinds `fm` to a new, empty cache.
fm = defaultdict(dict)

In [58]:
# Build every feature model declared in fm_spec, on both data splits.
for name in fm_spec:
    logger.info(f'Building features for {name}...')
    model = FeaturePipeline(name, spec=fm_spec, cache=fm)
    # The return values are not kept here; later cells read the features
    # back from the shared cache (fm[name]['train'] / fm[name]['test']).
    model.fit_transform(X_train)
    model.transform(X_test)

2018-11-20 22:55:02,996 [INFO] Building features for count...
2018-11-20 22:55:02,998 [INFO]   count: fit_transform use cache.
2018-11-20 22:55:02,999 [INFO]   count: transform use cache.
2018-11-20 22:55:03,001 [INFO] Building features for tfidf...
2018-11-20 22:55:03,002 [INFO]   tfidf: fit_transform use cache.
2018-11-20 22:55:03,003 [INFO]   tfidf: transform use cache.
2018-11-20 22:55:03,004 [INFO] Building features for lsa_200...
2018-11-20 22:55:03,006 [INFO]   lsa_200: fit_transform use cache.
2018-11-20 22:55:03,007 [INFO]   lsa_200: transform use cache.
2018-11-20 22:55:03,008 [INFO] Building features for lsa_500...
2018-11-20 22:55:03,010 [INFO]   tfidf: fit_transform use cache.
2018-11-20 22:55:12,918 [INFO]   tfidf: transform use cache.
2018-11-20 22:55:13,083 [INFO] Building features for lsa_1k...
2018-11-20 22:55:13,085 [INFO]   lsa_1k: fit_transform use cache.
2018-11-20 22:55:13,087 [INFO]   lsa_1k: transform use cache.
2018-11-20 22:55:13,091 [INFO] Building features 

Examine the quality of the top terms:

In [49]:
from collections import Counter

print('Data Shape:', X_train.shape, X_test.shape)

# Compare the full vocabulary ('count') with the reduced one ('count_sv').
for mn in ['count', 'count_sv']:
    # The vectorizer is stored as the step named after the feature model.
    model = fm[mn]['model'].named_steps[mn]
    x_train = fm[mn]['train']
    # Column sums: total number of occurrences of each vocabulary term in
    # the training features; `.flat` allows indexing by column position.
    counts = np.sum(x_train, axis=0).flat
    # vocabulary_ maps term -> column index; turn that into term -> count.
    counts = {k: counts[v] for k, v in model.vocabulary_.items()}
    # %g keeps small thresholds readable: with %.2f a min_df of 0.001 was
    # printed as the misleading "0.00".
    print('\nmin_df: %g, max_df: %g, ngram_range: %s' % (
        model.min_df, model.max_df, model.ngram_range
    ))
    print('\nvocab size: %s\n' % len(model.vocabulary_))
    # Ten most frequent terms, one "term <tab> count" pair per line.
    print('\n'.join([
        '%s \t %s' % (k, v)
        for k, v in Counter(counts).most_common(10)]))

Data Shape: (7500,) (2500,)

min_df: 0.00, max_df: 1.00, ngram_range: (1, 4)

vocab size: 18512

good 	 17818
taste 	 12112
eat 	 11203
time 	 6935
delicious 	 6583
store 	 6551
like 	 6169
people 	 5905
dishes 	 5507
really 	 5364

min_df: 0.02, max_df: 0.99, ngram_range: (1, 5)

vocab size: 945

good 	 17818
taste 	 12112
eat 	 11203
time 	 6935
delicious 	 6583
store 	 6551
like 	 6169
people 	 5905
dishes 	 5507
really 	 5364


In [50]:
# Inspect the steps that make up the cached 'tfidf' pipeline.
fm['tfidf']['model'].named_steps

{'count': FeaturePipeline(cache={'model': FeaturePipeline(cache={...}, spec=None,
         steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=0.001,
         ...atrix of type '<class 'numpy.int64'>'
 	with 287551 stored elements in Compressed Sparse Row format>},
         spec=None,
         steps=[('count', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=0.001,
         ngram_range=(1, 4), preprocessor=None, stop_words='english',
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None))]),
 'tfidf': TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)}

## The Very Basic TF-IDF + LSA + LDA (Linear Discriminant Analysis) Classifier

In [51]:
from fgclassifier.baseline import Baseline
from fgclassifier.classifiers import LDA

# Baseline with a Linear Discriminant Analysis classifier, trained and
# scored on the cached 'lsa_1k' features.
lsa_1k_features = fm['lsa_1k']
model = Baseline(classifier=LDA)
model.fit(lsa_1k_features['train'], y_train)
# Last expression of the cell: the score on the held-out split is displayed.
model.score(lsa_1k_features['test'], y_test)

2018-11-20 22:47:06,290 [INFO] [Validate]: F1 Scores
2018-11-20 22:47:06,297 [INFO]   location_traffic_convenience            	0.4290
  'precision', 'predicted', average, warn_for)
2018-11-20 22:47:06,301 [INFO]   location_distance_from_business_district	0.3449
2018-11-20 22:47:06,304 [INFO]   location_easy_to_find                   	0.3998
2018-11-20 22:47:06,308 [INFO]   service_wait_time                       	0.4152
2018-11-20 22:47:06,311 [INFO]   service_waiters_attitude                	0.5634
2018-11-20 22:47:06,315 [INFO]   service_parking_convenience             	0.4753
2018-11-20 22:47:06,318 [INFO]   service_serving_speed                   	0.4981
2018-11-20 22:47:06,322 [INFO]   price_level                             	0.5377
2018-11-20 22:47:06,326 [INFO]   price_cost_effective                    	0.4568
2018-11-20 22:47:06,334 [INFO]   price_discount                          	0.4942
2018-11-20 22:47:06,340 [INFO]   environment_decoration                  	0.4549
2018-11-2

0.45177582066396765

## Search for the Best Feature + Classifier Combination

In [53]:
# Result containers shared by all classifier / feature-model runs.
# Both are nested dicts that create their inner dict on first access.
all_avg_scores = defaultdict(dict)
all_scores = defaultdict(dict)

In [60]:
from fgclassifier.train import fm_cross_check

# Keyword arguments shared by every fm_cross_check call below: the feature
# cache, the labels of both splits, and the dicts that collect the scores.
conf = dict(
    fm_cache=fm,
    y_train=y_train,
    y_test=y_test,
    results={'avg': all_avg_scores, 'all': all_scores},
)

# The dummy models do not care about the features, so running them on a
# single feature model would be enough (currently disabled):
# fm_cross_check(
#     ['tfidf'],
#     ['DummyStratified', 'DummyMostFrequent'], **conf)

# Naive Bayes models cannot handle negative values, so they are only given
# tfidf features.
# Currently disabled: the 'tfidf' feature model and the 'MultinomialNB'
# classifier.
fm_cross_check(['tfidf_sv'], ['ComplementNB'], **conf)

# All other classifiers can be run on many feature models.
# Currently disabled feature models: 'lsa_200', 'lsa_500', 'lsa_1k',
# 'tfidf_sv_dense'.
results = fm_cross_check(
    ['lsa_100_sv', 'lsa_500_sv'],
    ['LDA', 'LinearSVC', 'Logistic', 'Ridge'],
    **conf)

2018-11-20 23:00:24,329 [INFO] Train for tfidf_sv -> ComplementNB...
2018-11-20 23:00:24,505 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-20 23:00:24,510 [INFO]   location_traffic_convenience            	0.3896
2018-11-20 23:00:24,514 [INFO]   location_distance_from_business_district	0.3193
2018-11-20 23:00:24,516 [INFO]   location_easy_to_find                   	0.3192
2018-11-20 23:00:24,519 [INFO]   service_wait_time                       	0.3564
2018-11-20 23:00:24,525 [INFO]   service_waiters_attitude                	0.4596
2018-11-20 23:00:24,533 [INFO]   service_parking_convenience             	0.2840
2018-11-20 23:00:24,537 [INFO]   service_serving_speed                   	0.4255
2018-11-20 23:00:24,541 [INFO]   price_level                             	0.4902
2018-11-20 23:00:24,547 [INFO]   price_cost_effective                    	0.4091
2018-11-20 23:00:24,552 [INFO]   price_discount                          	0.4705
2018-11-20 23:00:24,5

2018-11-20 23:00:45,136 [INFO]   environment_cleaness                    	0.3025
2018-11-20 23:00:45,139 [INFO]   dish_portion                            	0.2840
2018-11-20 23:00:45,142 [INFO]   dish_taste                              	0.4090
2018-11-20 23:00:45,145 [INFO]   dish_look                               	0.2588
2018-11-20 23:00:45,152 [INFO]   dish_recommendation                     	0.2790
2018-11-20 23:00:45,156 [INFO]   others_overall_experience               	0.3903
2018-11-20 23:00:45,161 [INFO]   others_willing_to_consume_again         	0.2907
2018-11-20 23:00:45,163 [INFO] ---------------------------------------------------
2018-11-20 23:00:45,165 [INFO] 【lsa_100_sv -> Logistic】: 0.3341
2018-11-20 23:00:45,167 [INFO] ---------------------------------------------------
2018-11-20 23:00:45,169 [INFO] Train for lsa_100_sv -> Ridge...
2018-11-20 23:00:47,829 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-20 23:00:47,833 [INFO]   locati

2018-11-20 23:02:12,755 [INFO]   price_level                             	0.4857
2018-11-20 23:02:12,758 [INFO]   price_cost_effective                    	0.3557
2018-11-20 23:02:12,761 [INFO]   price_discount                          	0.4500
2018-11-20 23:02:12,764 [INFO]   environment_decoration                  	0.4053
2018-11-20 23:02:12,767 [INFO]   environment_noise                       	0.3676
2018-11-20 23:02:12,771 [INFO]   environment_space                       	0.3869
2018-11-20 23:02:12,775 [INFO]   environment_cleaness                    	0.3924
2018-11-20 23:02:12,778 [INFO]   dish_portion                            	0.3273
2018-11-20 23:02:12,782 [INFO]   dish_taste                              	0.4381
2018-11-20 23:02:12,785 [INFO]   dish_look                               	0.2842
2018-11-20 23:02:12,788 [INFO]   dish_recommendation                     	0.3101
2018-11-20 23:02:12,791 [INFO]   others_overall_experience               	0.4204
2018-11-20 23:02:12,794 [INF

In [76]:
# One entry per "<feature model>.<classifier>" run: the average score first,
# followed by the individual per-label scores.
rows = {
    f'{fm_name}.{clf_name}': [all_avg_scores[fm_name][clf_name], *label_scores]
    for fm_name, clf_scores in all_scores.items()
    for clf_name, label_scores in clf_scores.items()
}
df = pd.DataFrame(rows)
# Rows of `df` are the metrics; label names are taken from the y_train columns.
df.index = ['average', *y_train.columns]
# Display one row per run, best average score on top.
df.T.sort_values('average', ascending=False)

Unnamed: 0,average,location_traffic_convenience,location_distance_from_business_district,location_easy_to_find,service_wait_time,service_waiters_attitude,service_parking_convenience,service_serving_speed,price_level,price_cost_effective,...,environment_decoration,environment_noise,environment_space,environment_cleaness,dish_portion,dish_taste,dish_look,dish_recommendation,others_overall_experience,others_willing_to_consume_again
tfidf_sv_dense.LDA,0.459859,0.466292,0.340962,0.410272,0.412732,0.572884,0.515427,0.53013,0.54845,0.45529,...,0.458143,0.572426,0.498306,0.484644,0.370326,0.483179,0.367013,0.367459,0.470756,0.383674
lsa_1k.LDA,0.451776,0.428954,0.344891,0.39977,0.415248,0.56341,0.475317,0.498066,0.537697,0.456815,...,0.454947,0.544886,0.477421,0.478036,0.38564,0.514141,0.342406,0.358052,0.463108,0.402531
lsa_500_sv.LDA,0.446607,0.459354,0.33191,0.373216,0.411363,0.569089,0.498109,0.516629,0.526618,0.430554,...,0.442681,0.532172,0.456488,0.475279,0.376105,0.492761,0.351356,0.373592,0.446095,0.383864
tfidf_sv_dense.LinearSVC,0.421411,0.442215,0.336579,0.361738,0.379089,0.554437,0.411494,0.483994,0.527506,0.383646,...,0.451426,0.506178,0.470011,0.417699,0.343321,0.465163,0.305578,0.325279,0.443375,0.349842
lsa_200.LDA,0.408699,0.44,0.303148,0.335515,0.343845,0.543946,0.513407,0.427153,0.485807,0.400839,...,0.396346,0.401883,0.397675,0.398892,0.339981,0.494261,0.310214,0.346392,0.442764,0.371712
lsa_1k.LinearSVC,0.398814,0.428698,0.332816,0.34354,0.324191,0.542338,0.357451,0.442308,0.530034,0.368271,...,0.426472,0.448942,0.407851,0.407844,0.345914,0.444789,0.296249,0.312868,0.433277,0.318851
lsa_500_sv.LinearSVC,0.398021,0.431651,0.319914,0.342465,0.304075,0.541149,0.41176,0.462675,0.502496,0.382424,...,0.416187,0.436649,0.392652,0.401811,0.331832,0.457851,0.289214,0.322134,0.430498,0.324592
tfidf_sv.ComplementNB,0.392122,0.38963,0.319275,0.319162,0.356397,0.459568,0.283975,0.425481,0.490206,0.409127,...,0.382232,0.437925,0.442779,0.382203,0.357981,0.480247,0.349911,0.302559,0.423626,0.359624
tfidf_sv_dense.Logistic,0.389897,0.425154,0.312505,0.334288,0.295214,0.545828,0.346813,0.429196,0.504688,0.356294,...,0.417671,0.405237,0.42075,0.396754,0.33934,0.450403,0.293099,0.314696,0.427298,0.328747
lsa_500_sv.Logistic,0.381847,0.424717,0.308133,0.329628,0.294252,0.544164,0.343861,0.433497,0.485729,0.355709,...,0.405342,0.367622,0.386928,0.392433,0.327336,0.438108,0.284163,0.310072,0.420445,0.33484


## Conclusion

- `ComplementNB` performs much better than a simple MultinomialNB, because our class labels are mostly unbalanced.
- `LatentDirichletAllocation` topics as features are not suitable for our classification problem, as features are often collinear. They often fare no better than the dummy classifier where we simply return the most frequent labels.
- LSA (Latent Semantic Analysis) shows a much more promising outcome, especially when combined with Linear Discriminant Analysis or SVC.
- A smaller vocabulary had only a marginal impact on performance; what matters more is the number of SVD components in LSA. The higher the better, as more components capture more information. LDA and SVC both perform well in high-dimensional space.
- Basically, SVD makes the features (components) more independent of each other, making it easier for LDA and SVC to find a good fit.
- Tree based models are not particularly useful. But the results may be different had we tuned the tree structure more.

## Next Steps

Required:

- Tune hyperparameters for `ComplementNB`, `TruncatedSVD`, `LinearDiscriminantAnalysis` and `SVC`/`LinearSVC`. Try different kernel functions.
- Test some boosting methods, especially [xgboost](https://xgboost.readthedocs.io/en/latest/).
- Test word embedding as features.

Optional:

- Possibly use different classifiers for different labels.
- Test two step predictions: first run binary prediction for "mentioned" vs "not mentioned", i.e., -2 vs (-1, 0, 1), then predict (-1, 0, 1).
    - This could happen as either [ClassifierChain](https://scikit-learn.org/stable/modules/multiclass.html#classifierchain) or separate steps.