In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [3]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

from fgclassifier.features import FeaturePipeline, logger
from fgclassifier.utils import read_data

X, y = read_data('data/english.csv', seg_words=False, sample_n=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# feature models with dependency specs
fm_spec = {
   'count': CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.99),
   'tfidf': ['count', TfidfTransformer],
   'lsa_100': ['tfidf', TruncatedSVD(n_components=100)],
   'lsa_1k': ['tfidf', TruncatedSVD(n_components=1000)],
   'lda_count_100': ['count', LatentDirichletAllocation(n_components=100)],
   'lda_count_200': ['count', LatentDirichletAllocation(n_components=200)],
   'lda_tfidf_100': ['tfidf', LatentDirichletAllocation(n_components=100)],
   'lda_tfidf_200': ['tfidf', LatentDirichletAllocation(n_components=200)],
    
    # small vocabulary (removed more stop words)
   'count_sv': [CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.85)],
   'tfidf_sv': ['count_sv', TfidfTransformer],
   'lsa_100_sv': ['tfidf_sv', TruncatedSVD(n_components=100)],
   'lsa_1k_sv': ['tfidf_sv', TruncatedSVD(n_components=1000)],
   'lda_count_100_sv': ['count_sv', LatentDirichletAllocation(n_components=100)],
   'lda_count_200_sv': ['count_sv', LatentDirichletAllocation(n_components=200)],
   'lda_tfidf_100_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=100)],
   'lda_tfidf_200_sv': ['tfidf_sv', LatentDirichletAllocation(n_components=200)],
}

# Cache trained fetures, we make this cache object
# so different steps can reuse previously trained features
fm = defaultdict(dict)

for name in fm_spec.keys():
    logger.info(f'Building features for {name}...')
    model = FeaturePipeline(name, spec=fm_spec, cache=fm)
    model.fit_transform(X_train)
    model.transform(X_test)

2018-11-19 23:15:38,945 [INFO] Reading data/english.csv..
2018-11-19 23:15:39,167 [INFO] Building features for count...
2018-11-19 23:16:06,107 [INFO] Building features for tfidf...
2018-11-19 23:16:06,108 [INFO]   count: fit_transform use cache.
2018-11-19 23:16:06,239 [INFO]   count: transform use cache.
2018-11-19 23:16:06,262 [INFO] Building features for lsa_100...
2018-11-19 23:16:06,263 [INFO]   tfidf: fit_transform use cache.
2018-11-19 23:16:08,077 [INFO]   tfidf: transform use cache.
2018-11-19 23:16:08,130 [INFO] Building features for lsa_1k...
2018-11-19 23:16:08,131 [INFO]   tfidf: fit_transform use cache.
2018-11-19 23:16:28,601 [INFO]   tfidf: transform use cache.
2018-11-19 23:16:28,964 [INFO] Building features for lda_count_100...
2018-11-19 23:16:28,965 [INFO]   count: fit_transform use cache.
2018-11-19 23:21:06,303 [INFO]   count: transform use cache.
2018-11-19 23:21:11,277 [INFO] Building features for lda_count_200...
2018-11-19 23:21:11,279 [INFO]   count: fit_tra

Exam the quality of the top terms:

In [15]:
from collections import Counter

print(X_train.shape, X_test.shape)

counter = fm['count']['model'].named_steps.count
counter_sv = fm['count_sv']['model'].named_steps.count_sv
for model in [counter, counter_sv]:
    print('\nmin_df: %.2f, max_df: %.2f, ngram_range: %s' % (
        model.min_df, model.max_df, model.ngram_range
    ))
    print('vocab size: %s' % len(model.vocabulary_))
    print('\n'.join(['%s \t %s' % (k, v) for k, v in
                     Counter(model.vocabulary_).most_common()[:10]]))

(7500,) (2500,)

min_df: 0.01, max_df: 0.99, ngram_range: (1, 4)
vocab size: 5306
yuan to 	 5305
yuan the 	 5304
yuan it 	 5303
yuan is 	 5302
yuan for 	 5301
yuan and 	 5300
yuan 	 5299
yourself 	 5298
your own 	 5297
your 	 5296

min_df: 0.01, max_df: 0.85, ngram_range: (1, 4)
vocab size: 5297
yuan to 	 5296
yuan the 	 5295
yuan it 	 5294
yuan is 	 5293
yuan for 	 5292
yuan and 	 5291
yuan 	 5290
yourself 	 5289
your own 	 5288
your 	 5287


## The Very Basic TF-IDF + LDA classifier

In [16]:
from fgclassifier.baseline import Baseline
from fgclassifier.classifiers import LDA

# Linear Discriminant Analysis
model = Baseline(classifier=LDA)
model.fit(fm['lsa_1k']['train'], y_train)
model.score(fm['lsa_1k']['test'], y_test)

2018-11-19 23:51:59,870 [INFO] [Validate]: F1 Scores
2018-11-19 23:51:59,874 [INFO]   location_traffic_convenience            	0.4409
  'precision', 'predicted', average, warn_for)
2018-11-19 23:51:59,880 [INFO]   location_distance_from_business_district	0.3275
2018-11-19 23:51:59,883 [INFO]   location_easy_to_find                   	0.4765
2018-11-19 23:51:59,886 [INFO]   service_wait_time                       	0.4311
2018-11-19 23:51:59,889 [INFO]   service_waiters_attitude                	0.5667
2018-11-19 23:51:59,897 [INFO]   service_parking_convenience             	0.4789
2018-11-19 23:51:59,901 [INFO]   service_serving_speed                   	0.5168
2018-11-19 23:51:59,908 [INFO]   price_level                             	0.5529
2018-11-19 23:51:59,912 [INFO]   price_cost_effective                    	0.4438
2018-11-19 23:51:59,924 [INFO]   price_discount                          	0.4932
2018-11-19 23:51:59,928 [INFO]   environment_decoration                  	0.4533
2018-11-1

0.4672456343211369

## Search for the Best Feature + Classifier Combination

In [17]:
# Run for all classifiers and feature builders
all_avg_scores, all_scores = defaultdict(dict), defaultdict(dict)

In [None]:
from IPython.display import clear_output

from sklearn.preprocessing import Normalizer

from collections import defaultdict
from fgclassifier.baseline import logger
from fgclassifier.features import FeaturePipeline, SparseToSense
from fgclassifier.features import LatentDirichletAllocation, SVD
from fgclassifier import classifiers

def fm_cross_check(fmns, clss):
    """Feature Model Cross Check"""
    # Test for all Feature models
    for fmn in fmns:
        logger.info(f'======== Feature Model: {fmn} =========')
        cache = fm[fmn]
        Xtrain, Xtest = cache['train'], cache['test']
        # Test on all major classifiers
        for cls in clss:
            logger.info(f'Train for {fmn} -> {cls}...')
            Classifier = getattr(classifiers, cls)
            model = Baseline(name=cls, classifier=Classifier)
            model.fit(Xtrain, y_train)
            all_scores[fmn][cls] = model.scores(Xtest, y_test)
            f1 = all_avg_scores[fmn][cls] = np.mean(all_scores[fmn][cls])
            logger.info('---------------------------------------------------')
            logger.info(f'【{fmn} -> {cls}】: {f1:.4f}')
            logger.info('---------------------------------------------------')
            

# We'd only need to run the dummy models on one feature model,
# as they do not care about the features
fm_cross_check(
    ['tfidf'],
    ['DummyStratified', 'DummyMostFrequent']
)
# Naive Bayes models cannot handle negative values, so we pass
# in only tfidf features
fm_cross_check(
    ['tfidf', 'tfidf_sv'],
    ['MultinomialNB', 'ComplementNB']
)
# All other models can run on many classifiers
fm_cross_check(
    ['lsa_100', 'lsa_1k', 'lda_100', 'lda_200', 'lsa_100_sv',
     'lsa_1k_sv', 'lda_100_sv', 'lsa_100_sv'],
    
    ['LDA', 'LinearSVC', 'Logistic', 'Ridge', 'ExtraTree']
)

2018-11-20 00:07:42,788 [INFO] Train for tfidf -> DummyStratified...
2018-11-20 00:07:42,814 [INFO] [Validate]: F1 Scores
2018-11-20 00:07:42,816 [INFO]   location_traffic_convenience            	0.2402
2018-11-20 00:07:42,819 [INFO]   location_distance_from_business_district	0.2485
2018-11-20 00:07:42,822 [INFO]   location_easy_to_find                   	0.2472
2018-11-20 00:07:42,824 [INFO]   service_wait_time                       	0.2460
2018-11-20 00:07:42,829 [INFO]   service_waiters_attitude                	0.2335
2018-11-20 00:07:42,831 [INFO]   service_parking_convenience             	0.2509
2018-11-20 00:07:42,834 [INFO]   service_serving_speed                   	0.2334
2018-11-20 00:07:42,836 [INFO]   price_level                             	0.2420
2018-11-20 00:07:42,839 [INFO]   price_cost_effective                    	0.2515
2018-11-20 00:07:42,844 [INFO]   price_discount                          	0.2576
2018-11-20 00:07:42,848 [INFO]   environment_decoration             

2018-11-20 00:07:43,792 [INFO]   dish_look                               	0.3417
2018-11-20 00:07:43,794 [INFO]   dish_recommendation                     	0.3483
2018-11-20 00:07:43,800 [INFO]   others_overall_experience               	0.4336
2018-11-20 00:07:43,807 [INFO]   others_willing_to_consume_again         	0.4220
2018-11-20 00:07:43,808 [INFO] ---------------------------------------------------
2018-11-20 00:07:43,810 [INFO] 【tfidf -> ComplementNB】: 0.4070
2018-11-20 00:07:43,813 [INFO] ---------------------------------------------------
2018-11-20 00:07:43,818 [INFO] Train for tfidf_sv -> MultinomialNB...
2018-11-20 00:07:44,141 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-20 00:07:44,143 [INFO]   location_traffic_convenience            	0.3315
2018-11-20 00:07:44,146 [INFO]   location_distance_from_business_district	0.2434
2018-11-20 00:07:44,148 [INFO]   location_easy_to_find                   	0.3108
2018-11-20 00:07:44,150 [INFO]   s

2018-11-20 00:07:55,312 [INFO]   price_discount                          	0.4247
2018-11-20 00:07:55,314 [INFO]   environment_decoration                  	0.3560
2018-11-20 00:07:55,316 [INFO]   environment_noise                       	0.3384
2018-11-20 00:07:55,318 [INFO]   environment_space                       	0.3319
2018-11-20 00:07:55,321 [INFO]   environment_cleaness                    	0.3420
2018-11-20 00:07:55,323 [INFO]   dish_portion                            	0.3156
2018-11-20 00:07:55,325 [INFO]   dish_taste                              	0.4011
2018-11-20 00:07:55,327 [INFO]   dish_look                               	0.2485
2018-11-20 00:07:55,330 [INFO]   dish_recommendation                     	0.2385
2018-11-20 00:07:55,332 [INFO]   others_overall_experience               	0.4173
2018-11-20 00:07:55,334 [INFO]   others_willing_to_consume_again         	0.3168
2018-11-20 00:07:55,335 [INFO] ---------------------------------------------------
2018-11-20 00:07:55,336 [I

In [None]:
all_avg_scores