In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

# Per-library log verbosity: jieba only reports warnings and above,
# while fgclassifier keeps its informational progress messages.
for _logger_name, _level in (('jieba', logging.WARNING),
                             ('fgclassifier', logging.INFO)):
    logging.getLogger(_logger_name).setLevel(_level)

In [None]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation

from fgclassifier.features import FeaturePipeline, logger
from fgclassifier.utils import read_data

# Read the English dataset and hold out a test split; the split seed is
# fixed so the partition is the same on every run.
X, y = read_data('data/english.csv', seg_words=False, sample_n=None)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Feature models with dependency specs.
# An entry is either a bare transformer or a list whose leading string
# names another entry of this dict that it builds on
# (presumably resolved by FeaturePipeline -- confirm against its source).
fm_spec = dict(
    count=CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.99),
    tfidf=['count', TfidfTransformer],
    lsa_100=['tfidf', TruncatedSVD(n_components=100)],
    lsa_1k=['tfidf', TruncatedSVD(n_components=1000)],
    lda_count_100=['count', LatentDirichletAllocation(n_components=100)],
    lda_count_200=['count', LatentDirichletAllocation(n_components=200)],
    lda_tfidf_100=['tfidf', LatentDirichletAllocation(n_components=100)],
    lda_tfidf_200=['tfidf', LatentDirichletAllocation(n_components=200)],

    # Small vocabulary (removed more stop words): same chain as above but
    # with a lower max_df on the count step.
    # NOTE(review): unlike 'count', 'count_sv' is wrapped in a one-element
    # list -- confirm FeaturePipeline treats both forms the same way.
    count_sv=[CountVectorizer(ngram_range=(1, 4), min_df=0.01, max_df=0.85)],
    tfidf_sv=['count_sv', TfidfTransformer],
    lsa_100_sv=['tfidf_sv', TruncatedSVD(n_components=100)],
    lsa_1k_sv=['tfidf_sv', TruncatedSVD(n_components=1000)],
    lda_count_100_sv=['count_sv', LatentDirichletAllocation(n_components=100)],
    lda_count_200_sv=['count_sv', LatentDirichletAllocation(n_components=200)],
    lda_tfidf_100_sv=['tfidf_sv', LatentDirichletAllocation(n_components=100)],
    lda_tfidf_200_sv=['tfidf_sv', LatentDirichletAllocation(n_components=200)],
)

# Cache of trained features, shared by every pipeline built below so
# different steps can reuse previously trained features.
fm = defaultdict(dict)

# Transformed train/test features keyed by feature-model name. They must be
# created here: the loop below only assigns into them, so without these
# definitions the cell raises NameError on a fresh kernel.
fmx_train, fmx_test = {}, {}

for name in fm_spec:
    logger.info(f'Building features for {name}...')
    feature_pipe = FeaturePipeline(name, spec=fm_spec, cache=fm)
    fmx_train[name] = feature_pipe.fit_transform(X_train)
    fmx_test[name] = feature_pipe.transform(X_test)

2018-11-19 23:06:52,956 [INFO] Reading data/english.csv..
2018-11-19 23:06:53,150 [INFO] Building features for count...
2018-11-19 23:06:58,923 [INFO] Building features for tfidf...
2018-11-19 23:06:58,924 [INFO]   count: fit_transform use cache.
2018-11-19 23:06:59,037 [INFO]   count: transform use cache.
2018-11-19 23:06:59,051 [INFO] Building features for lsa_100...
2018-11-19 23:06:59,053 [INFO]   tfidf: fit_transform use cache.
2018-11-19 23:07:00,520 [INFO]   tfidf: transform use cache.
2018-11-19 23:07:00,570 [INFO] Building features for lsa_1k...
2018-11-19 23:07:00,571 [INFO]   tfidf: fit_transform use cache.
2018-11-19 23:07:16,477 [INFO]   tfidf: transform use cache.
2018-11-19 23:07:16,820 [INFO] Building features for lda_count_100...
2018-11-19 23:07:16,821 [INFO]   count: fit_transform use cache.


Examine the quality of the top terms:

In [None]:
from collections import Counter

# CountVectorizer.vocabulary_ maps each term to its column index, not to a
# frequency, so ranking its values would list the terms with the highest
# indices. Rank by the per-column totals of the training count matrix instead.
# Assumes fmx_train['count'] is the (documents x terms) output of the
# 'count' pipeline -- TODO confirm.
vocab = fm['count'].vocabulary_
term_totals = np.asarray(fmx_train['count'].sum(axis=0)).ravel()
term_counts = Counter({term: term_totals[col] for term, col in vocab.items()})

print('Vocab size: %s' % len(vocab))
print('Most common words: \n')
print('\n'.join('%s \t %s' % (term, total)
                for term, total in term_counts.most_common(20)))

## The Very Basic TF-IDF + LDA classifier

In [None]:
from fgclassifier.baseline import Baseline
from fgclassifier.classifiers import LDA

# Linear Discriminant Analysis on the TF-IDF features.
# The feature caches are keyed by feature-model name; the empty key used
# before does not exist in them and raises KeyError.
model = Baseline(classifier=LDA)
model.fit(fmx_train['tfidf'], y_train)
model.score(fmx_test['tfidf'], y_test)

## Search for the Best Feature + Classifier Combination

In [None]:
# Result stores for the run over all classifiers and feature builders.
# Each is indexed as store[feature_set][classifier]; defaultdict(dict)
# creates the inner dict on first access.
models = defaultdict(dict)
all_scores = defaultdict(dict)
all_avg_scores = defaultdict(dict)

In [None]:
from IPython.display import clear_output

from sklearn.preprocessing import Normalizer

from collections import defaultdict
from fgclassifier.baseline import logger
from fgclassifier.features import FeaturePipeline, SparseToSense
from fgclassifier.features import LatentDirichletAllocation, SVD
from fgclassifier import classifiers

# Naive Bayes models cannot handle negative values, so we pass
# in raw tf-idf to them. X_train is the unvectorized output of read_data,
# so the TF-IDF matrices built in the feature cell are used here, the same
# way features are handed to Baseline elsewhere in this notebook.
raw_train, raw_test = fmx_train['tfidf'], fmx_test['tfidf']

for cls in ['DummyStratified', 'MultinomialNB', 'ComplementNB']:
    logger.info('-----------------------------------')
    logger.info(f'Train for {cls}...')
    Classifier = getattr(classifiers, cls)
    model = Baseline(name=cls, classifier=Classifier)
    model.fit(raw_train, y_train)
    models['raw'][cls] = model
    # Keep the full score list and its mean side by side.
    cls_scores = model.scores(raw_test, y_test)
    all_scores['raw'][cls] = cls_scores
    all_avg_scores['raw'][cls] = np.mean(cls_scores)
    

# Decomposition methods, keyed by the feature-set name used in the result
# stores. A value is either a single transformer or a list of steps; both
# forms are handed to FeaturePipeline(steps=...).
decomps = dict(
    lsa_100=SVD(n_components=100),
    lsa_500=SVD(n_components=500),
    lsa_1k=SVD(n_components=1000),
    lda_100=[
        SVD(n_components=1000),
        Normalizer(),
        LatentDirichletAllocation(n_components=100),
    ],
    lda_500=[
        SVD(n_components=1000),
        Normalizer(),
        LatentDirichletAllocation(n_components=500),
    ],
)

# Classifiers trained on top of every decomposed feature set.
decomp_classifiers = ['LDA', 'LinearSVC', 'Logistic', 'Ridge', 'ExtraTree']

for decomp, Decomposer in decomps.items():
    logger.info('-----------------------------------')
    logger.info(f'Build {decomp} features...')
    pipe = FeaturePipeline(steps=Decomposer)

    # The decomposed features. The decomposers are fitted on the TF-IDF
    # matrices, not on X_train (the unvectorized read_data output), which
    # matches the 'tfidf' -> TruncatedSVD dependency declared in fm_spec.
    X_train_ = pipe.fit_transform(fmx_train['tfidf'])
    X_test_ = pipe.transform(fmx_test['tfidf'])

    # Save how we decomposed the features.
    models[decomp]['feature'] = pipe

    for cls in decomp_classifiers:
        logger.info('-----------------------------------')
        logger.info(f'Train for {decomp} -> {cls}...')
        Classifier = getattr(classifiers, cls)
        model = Baseline(name=cls, classifier=Classifier)
        model.fit(X_train_, y_train)
        models[decomp][cls] = model
        # Keep the full score list and its mean side by side.
        cls_scores = model.scores(X_test_, y_test)
        all_scores[decomp][cls] = cls_scores
        all_avg_scores[decomp][cls] = np.mean(cls_scores)

In [None]:
# Inspect the decomposed training features. X_train_ is whatever the last
# iteration of the decomposition loop left behind, so this cell only works
# after that loop has run.
X_train_

In [None]:
# Show the average scores as a table instead of a nested defaultdict repr:
# outer keys (feature sets) become columns, inner keys (classifiers) become
# rows, and combinations that were never trained show up as NaN.
pd.DataFrame(all_avg_scores)