In [29]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import config
from sklearn.model_selection import train_test_split

from fgclassifier.baseline import BaselineFeature, Tfidf, SVD
from fgclassifier.utils import read_data

X, y = read_data('data/english.csv', seg_words=False, sample_n=None)
display(X.head(2))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Builder TF-IDF features with reduced dimension
feature = BaselineFeature([
    ('tfidf', Tfidf(analyzer='word', ngram_range=(1, 4),
                    min_df=0.01, max_df=1.0, norm='l2')),
    ('svd', SVD(n_components=1000))
])
X_train = feature.fit_transform(X_train)
X_test = feature.transform(X_test)

2018-11-19 14:35:03,853 [INFO] Reading data/english.csv..


0    Hey, the lollipop of the dead man, the overlor...
1    The third time to participate in the public co...
Name: content, dtype: object

2018-11-19 14:35:04,263 [INFO] Fit & Transform TF-IDF...
2018-11-19 14:35:24,675 [INFO] Fit & Transform TruncatedSVD...
2018-11-19 14:35:42,922 [INFO] Transforming TF-IDF...
2018-11-19 14:35:45,461 [INFO] Transforming TruncatedSVD...


In [30]:
from fgclassifier.baseline import Baseline
from fgclassifier.classifiers import LinearDiscriminantAnalysis

model = Baseline(classifier=LinearDiscriminantAnalysis())
model.fit(X_train, y_train)
model.score(X_test, y_test)

2018-11-19 14:41:40,774 [INFO] [Validate]: F1 Scores
2018-11-19 14:41:40,776 [INFO]   location_traffic_convenience            	0.4385
2018-11-19 14:41:40,778 [INFO]   location_distance_from_business_district	0.3371
2018-11-19 14:41:40,781 [INFO]   location_easy_to_find                   	0.4637
2018-11-19 14:41:40,783 [INFO]   service_wait_time                       	0.4034
2018-11-19 14:41:40,786 [INFO]   service_waiters_attitude                	0.5584
2018-11-19 14:41:40,789 [INFO]   service_parking_convenience             	0.4865
2018-11-19 14:41:40,793 [INFO]   service_serving_speed                   	0.5119
2018-11-19 14:41:40,796 [INFO]   price_level                             	0.5383
2018-11-19 14:41:40,801 [INFO]   price_cost_effective                    	0.4258
2018-11-19 14:41:40,806 [INFO]   price_discount                          	0.4814
2018-11-19 14:41:40,809 [INFO]   environment_decoration                  	0.4415
2018-11-19 14:41:40,820 [INFO]   environment_noise      

0.4626354712591074

Exam the quality of the top terms:

In [26]:
from collections import Counter

print('\n'.join(['%s \t %s' % (k, v) for k, v in
                 Counter(feature.named_steps.tfidf.vocabulary_).most_common()[:20]]))

yuan to 	 5256
yuan the 	 5255
yuan it 	 5254
yuan is 	 5253
yuan for 	 5252
yuan and 	 5251
yuan 	 5250
yourself 	 5249
your 	 5248
young 	 5247
you will 	 5246
you want to eat 	 5245
you want to 	 5244
you want 	 5243
you to 	 5242
you should 	 5241
you need to 	 5240
you need 	 5239
you must 	 5238
you like 	 5237


In [31]:
from fgclassifier import classifiers

# Run for all classifiers
# can add another layer of for loops to 
# change vectorizer/reducer, or use sklearn's
# GridSearchCV
all_avg_scores, all_scores = {}, {}
for cls in ['LinearDiscriminantAnalysis',
            'LogisticRegression',
            'RidgeClassifierCV']:
    Classifier = getattr(classifiers, cls)
    model = Baseline(classifier=Classifier)
    model.fit(X_train, y_train)
    all_scores[cls] = model.scores(X_test, y_test)
    all_avg_scores[cls] = np.mean(all_scores[cls])

NameError: name 'Y_test' is not defined