In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [2]:
import config
from sklearn.model_selection import train_test_split

from fgclassifier.features import Tfidf
from fgclassifier.utils import read_data

X, y = read_data('data/english.csv', seg_words=False, sample_n=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Builder TF-IDF features
feature = Tfidf(analyzer='word', ngram_range=(1, 5),
                min_df=0.001, max_df=0.99, norm='l2')
X_train = feature.fit_transform(X_train)
X_test = feature.transform(X_test)

2018-11-19 16:13:41,201 [INFO] Reading data/english.csv..
2018-11-19 16:13:41,397 [INFO] Fit & Transform TF-IDF...
2018-11-19 16:14:15,852 [INFO] Transforming TF-IDF...


Exam the quality of the top terms:

In [3]:
from collections import Counter

print('Vocab size: %s' % len(feature.vocabulary_))
print('Most common words: \n')
print('\n'.join(['%s \t %s' % (k, v) for k, v in
                 Counter(feature.vocabulary_).most_common()[:20]]))

Vocab size: 59933
Most common words: 

鸳鸯 	 59932
馄饨 	 59931
糍粑 	 59930
环境 the 	 59929
环境 	 59928
服务 	 59927
嘿嘿 	 59926
嘻嘻 	 59925
位置 	 59924
zucchini 	 59923
zoo 	 59922
zone 	 59921
zhujiang 	 59920
zhuang 	 59919
zhu 	 59918
zhou 	 59917
zhongshan road 	 59916
zhongshan park 	 59915
zhongshan 	 59914
zhongjie 	 59913


## The Very Basic TF-IDF + LDA classifier

In [4]:
from fgclassifier.baseline import Baseline
from fgclassifier.features import SVD
from fgclassifier.classifiers import LDA

model = Baseline(
    # Linear Discriminant Analysis
    classifier=LDA,
    # steps before the classifier.
    # make sure to include dimension reduction here,
    # otherwise future steps will be really really slow...
    steps=[
        ('dim_reduce', SVD(n_components=1000))
    ]
)
model.fit(X_train, y_train)
model.score(X_test, y_test)

2018-11-19 16:14:19,188 [INFO] Fit & Transform TruncatedSVD...
2018-11-19 16:16:15,630 [INFO] Transforming TruncatedSVD...
2018-11-19 16:16:16,721 [INFO] [Validate]: F1 Scores
2018-11-19 16:16:16,724 [INFO]   location_traffic_convenience            	0.4195
  'precision', 'predicted', average, warn_for)
2018-11-19 16:16:16,728 [INFO]   location_distance_from_business_district	0.3190
2018-11-19 16:16:16,731 [INFO]   location_easy_to_find                   	0.4741
2018-11-19 16:16:16,734 [INFO]   service_wait_time                       	0.4263
2018-11-19 16:16:16,737 [INFO]   service_waiters_attitude                	0.5702
2018-11-19 16:16:16,740 [INFO]   service_parking_convenience             	0.4119
2018-11-19 16:16:16,746 [INFO]   service_serving_speed                   	0.4958
2018-11-19 16:16:16,751 [INFO]   price_level                             	0.5493
2018-11-19 16:16:16,754 [INFO]   price_cost_effective                    	0.4538
2018-11-19 16:16:16,758 [INFO]   price_discount 

0.4549529159313958

In [None]:
from IPython.display import clear_output

from collections import defaultdict
from fgclassifier.baseline import logger
from fgclassifier.features import FeaturePipeline, SparseToSense
from fgclassifier.features import LatentDirichletAllocation, SVD
from fgclassifier import classifiers

# Run for all classifiers and feature builders
models = defaultdict(dict)
all_avg_scores, all_scores = defaultdict(dict), defaultdict(dict)

# Naive Bayes models cannot handle negative values, so we pass
# in raw tf-idf to them
for cls in ['DummyStratified', 'MultinomialNB', 'ComplementNB']:
    logger.info('-----------------------------------')
    logger.info(f'Train for {cls}...')
    Classifier = getattr(classifiers, cls)
    model = Baseline(name=cls, classifier=Classifier)
    model.fit(X_train, y_train)
    models['raw'][cls] = model
    all_scores['raw'][cls] = model.scores(X_test, y_test)
    all_avg_scores['raw'][cls] = np.mean(all_scores['raw'][cls])
    

# decomposition methods
decomps = {
    'svd_100': SVD(n_components=100),
    'svd_500': SVD(n_components=500),
    'svd_1k': SVD(n_components=1000),
    'lda_100': LatentDirichletAllocation(n_components=100),
    'lda_500': LatentDirichletAllocation(n_components=500),
    'lda_1k': LatentDirichletAllocation(n_components=1000),
}

for decomp, Decomposer in decomps.items():
    logger.info('-----------------------------------')
    logger.info(f'Build {decomp} features...')
    pipe = FeaturePipeline(steps=Decomposer)
    X_train_ = pipe.fit_transform(X_train)
    X_test_ = pipe.transform(X_test)
    models[decomp]['feature'] = pipe
    
    for cls in ['LDA', 'LinearSVC',
                'Logistic', 'Ridge',
                'ExtraTree', 'RandomForest']:
        logger.info('-----------------------------------')
        logger.info(f'Train for {decomp} -> {cls}...')
        Classifier = getattr(classifiers, cls)
        model = Baseline(name=cls, classifier=Classifier)
        model.fit(X_train_, y_train)
        models[decomp][cls] = model
        all_scores[decomp][cls] = model.scores(X_test_, y_test)
        all_avg_scores[decomp][cls] = np.mean(all_scores[decomp][cls])

2018-11-19 16:40:54,897 [INFO] -----------------------------------
2018-11-19 16:40:54,898 [INFO] Train for DummyStratified...
2018-11-19 16:40:54,938 [INFO] [Validate]: F1 Scores
2018-11-19 16:40:54,942 [INFO]   location_traffic_convenience            	0.2560
2018-11-19 16:40:54,945 [INFO]   location_distance_from_business_district	0.2474
2018-11-19 16:40:54,950 [INFO]   location_easy_to_find                   	0.2684
2018-11-19 16:40:54,954 [INFO]   service_wait_time                       	0.2513
2018-11-19 16:40:54,959 [INFO]   service_waiters_attitude                	0.2561
2018-11-19 16:40:54,963 [INFO]   service_parking_convenience             	0.2413
2018-11-19 16:40:54,966 [INFO]   service_serving_speed                   	0.2492
2018-11-19 16:40:54,970 [INFO]   price_level                             	0.2567
2018-11-19 16:40:54,975 [INFO]   price_cost_effective                    	0.2443
2018-11-19 16:40:54,979 [INFO]   price_discount                          	0.2573
2018-11-19

2018-11-19 16:41:14,772 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-19 16:41:14,774 [INFO]   location_traffic_convenience            	0.3669
2018-11-19 16:41:14,777 [INFO]   location_distance_from_business_district	0.2463
2018-11-19 16:41:14,780 [INFO]   location_easy_to_find                   	0.3687
2018-11-19 16:41:14,783 [INFO]   service_wait_time                       	0.2745
2018-11-19 16:41:14,785 [INFO]   service_waiters_attitude                	0.4979
2018-11-19 16:41:14,788 [INFO]   service_parking_convenience             	0.2421
2018-11-19 16:41:14,791 [INFO]   service_serving_speed                   	0.3253
2018-11-19 16:41:14,795 [INFO]   price_level                             	0.3854
2018-11-19 16:41:14,799 [INFO]   price_cost_effective                    	0.2861
2018-11-19 16:41:14,804 [INFO]   price_discount                          	0.4283
2018-11-19 16:41:14,808 [INFO]   environment_decoration                  	0.3540
2018-11-1

2018-11-19 16:42:17,971 [INFO] [Validate]: F1 Scores
  'precision', 'predicted', average, warn_for)
2018-11-19 16:42:17,973 [INFO]   location_traffic_convenience            	0.2359
2018-11-19 16:42:17,976 [INFO]   location_distance_from_business_district	0.2217
2018-11-19 16:42:17,978 [INFO]   location_easy_to_find                   	0.2588
2018-11-19 16:42:17,980 [INFO]   service_wait_time                       	0.2353
2018-11-19 16:42:17,983 [INFO]   service_waiters_attitude                	0.3913
2018-11-19 16:42:17,985 [INFO]   service_parking_convenience             	0.2421
2018-11-19 16:42:17,988 [INFO]   service_serving_speed                   	0.2615
2018-11-19 16:42:17,991 [INFO]   price_level                             	0.2209
2018-11-19 16:42:17,994 [INFO]   price_cost_effective                    	0.2214
2018-11-19 16:42:17,996 [INFO]   price_discount                          	0.2751
2018-11-19 16:42:18,000 [INFO]   environment_decoration                  	0.3068
2018-11-1

In [37]:
all_avg_scores

{'LDA': 0.4626354712591074,
 'QDA': 0.23685164163320965,
 'Logistic': 0.3673625874835228,
 'Ridge': 0.3661980926797871}