In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import config

from fgclassifier.baseline import Baseline, Tfidf, SVD
from fgclassifier.utils import read_data, read_csv
from fgclassifier import classifiers


vectorizer = Tfidf(analyzer='word', ngram_range=(1, 4), min_df=0.005, max_df=0.8, norm='l2')
reducer = SVD(n_components=100)

model = Baseline(steps=[
    ('tfidf', vectorizer),
    ('reduce_dim', reducer)
])
df = read_csv('data/english.csv', seg_words=False, sample_n=None)
X_train, Y_train = read_data(df[:8000])
X_validate, Y_validate = read_data(df[8000:])

2018-11-16 02:00:08,983 [INFO] Reading data/english.csv..


In [20]:
ff = vectorizer.fit_transform(X_train)

2018-11-16 02:04:12,923 [INFO] Fit & Transform TF-IDF...


In [22]:
vectorizer

Exam the quality of the top terms:

In [48]:
from collections import Counter

print('\n'.join(['%s \t %s' % (k, v) for k, v in
                 Counter(model.vectorizer.vocabulary_).most_common()[:100]]))

zhongshan 	 10962
zhejiang 	 10961
zhang 	 10960
yunnan 	 10959
yum kung soup 	 10958
yum kung 	 10957
yum 	 10956
yuba 	 10955
yuan vouchers 	 10954
yuan to 	 10953
yuan this 	 10952
yuan there 	 10951
yuan the price is 	 10950
yuan the price 	 10949
yuan the 	 10948
yuan per person 	 10947
yuan per 	 10946
yuan it is 	 10945
yuan it 	 10944
yuan is 	 10943
yuan in 	 10942
yuan for 	 10941
yuan and the 	 10940
yuan and 	 10939
yuan 	 10938
yourself 	 10937
your own 	 10936
your home 	 10935
your 	 10934
younger 	 10933
young people 	 10932
young 	 10931
you will be 	 10930
you will 	 10929
you want to eat 	 10928
you want to 	 10927
you want 	 10926
you to 	 10925
you think 	 10924
you the 	 10923
you that 	 10922
you should 	 10921
you say 	 10920
you order 	 10919
you need to 	 10918
you need 	 10917
you must 	 10916
you look at 	 10915
you look 	 10914
you like 	 10913
you know 	 10912
you have to 	 10911
you have 	 10910
you go to 	 10909
you go 	 10908
you for your 	 10907
you fo

In [49]:
all_avg_scores, all_scores = {}, {}
for cls in ['LinearDiscriminantAnalysis']:
    if cls.startswith('_'):
        continue
    Classifier = getattr(classifiers, cls) 
    model = Indie(classifier=Classifier,
                  vectorizer=model.vectorizer,
                  reducer=model.reducer)
    model.train(X_train, Y_train)
    avg_score, scores = model.validate(X_validate, Y_validate)
    all_avg_scores[cls] = avg_score
    all_scores[cls] = scores

2018-11-11 22:21:20,258 [INFO] [train] location_traffic_convenience 
2018-11-11 22:21:22,628 [INFO] [train] location_distance_from_business_district 
2018-11-11 22:21:24,881 [INFO] [train] location_easy_to_find 
2018-11-11 22:21:27,474 [INFO] [train] service_wait_time 
2018-11-11 22:21:29,750 [INFO] [train] service_waiters_attitude 
2018-11-11 22:21:31,833 [INFO] [train] service_parking_convenience 
2018-11-11 22:21:33,869 [INFO] [train] service_serving_speed 
2018-11-11 22:21:35,799 [INFO] [train] price_level 
2018-11-11 22:21:37,756 [INFO] [train] price_cost_effective 
2018-11-11 22:21:39,686 [INFO] [train] price_discount 
2018-11-11 22:21:41,682 [INFO] [train] environment_decoration 
2018-11-11 22:21:43,801 [INFO] [train] environment_noise 
2018-11-11 22:21:45,922 [INFO] [train] environment_space 
2018-11-11 22:21:47,951 [INFO] [train] environment_cleaness 
2018-11-11 22:21:49,997 [INFO] [train] dish_portion 
2018-11-11 22:21:52,461 [INFO] [train] dish_taste 
2018-11-11 22:21:54,538