In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [2]:
import os

# Remember where the kernel started the first time this cell runs, and always
# change directory relative to that anchor. A bare `os.chdir('..')` climbs one
# more level every time the cell is re-executed, which silently breaks all the
# relative paths used later in the notebook.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()

os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

# Model Selection

## Baseline Models

This notebook shows how to use our baseline model.
It also demonstrates how to test different feature models (i.e.,
different ways of building the features) at the same time.

We will mostly use the Google-translated English dataset for this
demonstration.

In [3]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from fgclassifier.utils import read_data, get_dataset

# Load the complete 'train' split for fitting and the 'valid' split, which is
# used as the held-out test set throughout this notebook.
X_train, y_train = read_data(get_dataset('train'))
X_test, y_test = read_data(get_dataset('valid'))
# For quick experiments, load a 100-row sample of each split instead:
# X_train, y_train = read_data(get_dataset('train'), sample_n=100)
# X_test, y_test = read_data(get_dataset('valid'), sample_n=100)

2018-12-04 01:37:00,870 [INFO] Reading /opt/storage/train/sentiment_analysis_trainingset.csv.tokenized.csv..
2018-12-04 01:37:02,781 [INFO] Reading /opt/storage/train/sentiment_analysis_trainingset.csv..
2018-12-04 01:37:04,806 [INFO] Reading /opt/storage/valid/sentiment_analysis_validationset.csv.tokenized.csv..
2018-12-04 01:37:05,064 [INFO] Reading /opt/storage/valid/sentiment_analysis_validationset.csv..


In [4]:
# del fm['tfidf_sv']
# del fm['tfidf_sv_dense']
# del fm['lsa_200_sv']
# del fm['lsa_500_sv']
# del fm['count_tiny']

In [5]:
# Cache feature models and transformed features. We create this cache object
# so different steps can reuse previously transformed features.
# It is keyed by feature-model name; each entry is a dict that later cells
# read the fitted pipeline ('model') and the transformed matrices from.
fm = defaultdict(dict)

In [None]:
from fgclassifier.features import FeaturePipeline, logger

def build_features(fm_names, fm):
    """Fit each named feature pipeline and transform both data splits.

    Args:
        fm_names: iterable of feature-model spec names understood by
            ``FeaturePipeline.from_spec``.
        fm: cache object handed to every pipeline as ``cache``.

    The transformed matrices are not returned; callers read them back from
    ``fm`` afterwards. Uses the notebook-level ``X_train`` and ``X_test``.
    """
    for name in fm_names:
        logger.info(f'Building features for {name}...')
        pipeline = FeaturePipeline.from_spec(name, cache=fm)
        # Return values are intentionally discarded; only the side effect on
        # the shared cache matters here.
        pipeline.fit_transform(X_train)
        pipeline.transform(X_test)
    
build_features(['count', 'count_sv', 'count_tiny'], fm)

2018-12-04 01:37:05,741 [INFO] 'pattern' package not found; tag filters are not available for English
2018-12-04 01:37:05,749 [INFO] Building features for count...
2018-12-04 01:37:05,750 [INFO] Fit & Transform CountVectorizer...


Examine the quality of the top terms:

In [None]:
from collections import Counter

print('Data Shape:', X_train.shape, X_test.shape)

for mn in ['count', 'count_sv', 'count_tiny']:
    # The step inside each cached pipeline is registered under the
    # feature model's own name.
    model = fm[mn]['model'].named_steps[mn]
    # Locate the cache entry that holds the transformed training matrix.
    key = next(filter(lambda x: 'fit_transform' in x, fm[mn].keys()))
    x_train = fm[mn][key]
    # Column sums give the total number of occurrences of every term.
    column_totals = np.sum(x_train, axis=0).flat
    term_counts = {
        term: column_totals[col]
        for term, col in model.vocabulary_.items()
    }
    print('\nmin_df: %.3f, max_df: %.3f, ngram_range: %s' % (
        model.min_df, model.max_df, model.ngram_range
    ))
    print('\nvocab size: %s\n' % len(model.vocabulary_))
    # most_common(5) selects the top five directly instead of sorting the
    # whole vocabulary and slicing afterwards.
    print('\n'.join([
        '%s \t %s' % (term, total)
        for term, total in Counter(term_counts).most_common(5)]))

Save the word count features, as they are pretty slow to compute for the whole dataset...

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and fall back only on old environments
# where it is not installed separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib


def partial_get(d, keyword):
    """Return the value stored under the first key of ``d`` that starts with ``keyword``.

    Args:
        d: mapping with string keys.
        keyword: prefix to look for.

    Returns:
        The value of the first matching key, in the mapping's iteration order.

    Raises:
        KeyError: if no key starts with ``keyword``. (Previously a bare
            ``StopIteration`` escaped, which carries no message and is easy
            to misread or swallow.)
    """
    for key, value in d.items():
        if key.startswith(keyword):
            return value
    raise KeyError(
        f'no key starting with {keyword!r}; available keys: {list(d)}')

def save_count_cache(mn, path=None):
    """Dump the cached train and test matrices of feature model ``mn`` to disk.

    Args:
        mn: feature-model name, used as a key into the notebook-level ``fm``.
        path: file-name prefix; defaults to ``data/fm_cache-<mn>``.

    Two files are written: ``<prefix>-train.pkl`` and ``<prefix>-test.pkl``.
    """
    prefix = path or f'data/fm_cache-{mn}'
    cached = fm[mn]
    # Look both matrices up before writing anything, then dump train first.
    by_suffix = {
        '-train.pkl': partial_get(cached, 'fit_transform'),
        '-test.pkl': partial_get(cached, 'transform'),
    }
    for suffix, matrix in by_suffix.items():
        joblib.dump(matrix, prefix + suffix)
    
# Persist every word-count feature model, then load one file back to confirm
# that the dump is readable.
for mn in ('count', 'count_sv', 'count_tiny'):
    save_count_cache(mn)
joblib.load('data/fm_cache-count-test.pkl')

Build more features

In [None]:
# Build the remaining feature models, one related group per call.
# Every call shares the same `fm` cache.
build_features(['tfidf', 'lsa_500', 'lsa_1k'], fm)
build_features(['tfidf_sv', 'tfidf_sv_dense', 'lsa_500_sv'], fm)
build_features(['tfidf_tiny', 'tfidf_tiny_dense', 'lsa_500_tiny'], fm)
build_features(['word2vec', 'word2vec_minmax'], fm)

In [None]:
fm['tfidf']['model'].named_steps

## The Very Basic TF-IDF + LDA classifier

In [None]:
# Import all models at once, so as to avoid
# classes being reloaded and causing save_model to fail
from fgclassifier.baseline import Baseline, Dummy
from fgclassifier.classifiers import LDA
from fgclassifier.train import fm_cross_check

In [None]:
# Linear Discriminant Analysis, specify the FeaturePipeline
# as steps
model = Baseline(('LDA', LDA), fm=fm['lsa_500']['model'])

# Always pass in the original features
# the pipeline will take care of the cache
model.fit(X_train, y_train)
print(model.name)
print('Final score:', model.score(X_test, y_test))

In [None]:
model.scores(X_test, y_test)

## Search for the Best Feature + Classifier Combination

In [None]:
# Run for all classifiers and feature builders
all_avg_scores, all_scores = defaultdict(dict), defaultdict(dict)

In [None]:
from fgclassifier import classifiers
from fgclassifier.baseline import Dummy

Dummy(classifiers.DummyStratified)

In [None]:
conf = {
    'fm_cache': fm,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'results': {
        'models': {},
        'avg': all_avg_scores,
        'all': all_scores
    }
}

In [None]:
from IPython.display import clear_output

# We only need to run the dummy models on one feature model,
# as they do not care about the features
fm_cross_check(
    ['tfidf_sv'],
    ['DummyStratified', 'DummyMostFrequent'],
    model_cls=Dummy, **conf)

# Naive Bayes models cannot handle negative values, so we pass in only
# the tfidf features plus word2vec_minmax (presumably min-max scaled so it
# is non-negative -- TODO confirm)
fm_cross_check(
    ['tfidf', 'tfidf_sv', 'tfidf_tiny', 'word2vec_minmax'],
    ['MultinomialNB', 'ComplementNB'], **conf)

# The remaining feature models are cross-checked against the other classifiers.
# Note that only this call's return value is kept in `results`.
results = fm_cross_check(
    ['lsa_500',
     'lsa_1k',
     'tfidf_sv_dense',
     'tfidf_tiny_dense',
     'lsa_500_sv',
     'lsa_500_tiny',
     'word2vec'
    ],
    ['LDA', 'QDA', 'LinearSVC', 'Logistic', 'Ridge'], **conf)

# Discard the output printed by the runs above
clear_output()

In [None]:
# One entry per "<feature model>.<classifier>" combination: the average score
# followed by the individual per-label scores.
rows = {
    f'{fm_name}.{clf_name}': [all_avg_scores[fm_name][clf_name], *label_scores]
    for fm_name, scores_by_clf in all_scores.items()
    for clf_name, label_scores in scores_by_clf.items()
}

score_table = pd.DataFrame(rows)
score_table.index = ['average', *y_train.columns]

# Transpose so each combination is a row, best average score first.
df = score_table.T.sort_values('average', ascending=False)
df

In [None]:
import matplotlib.pyplot as plt

# One box per feature/classifier combination, showing how its scores are
# spread across the individual labels (the 'average' row is dropped first).
df.T.drop(['average']).boxplot(
    figsize=(18, 6), rot=90)

plt.show()

Let's save the models for future use.

In [None]:
from fgclassifier.utils import save_model

def clear_cache(model):
    """Reset the ``cache`` attribute of ``model`` and of every nested step.

    Objects exposing ``steps`` (an iterable of ``(name, step)`` pairs) are
    walked recursively. Any visited object that has a ``cache`` attribute gets
    it set to ``None``.

    Returns:
        The same ``model`` object, modified in place.
    """
    # Depth first: clear the children before the container itself.
    for _, step in getattr(model, 'steps', ()):
        clear_cache(step)
    if hasattr(model, 'cache'):
        model.cache = None
    return model

# Reset the `cache` attribute of every trained model (and its nested steps)
# before handing it to save_model.
for name, model in results['models'].items():
    clear_cache(model)
    save_model(model)

## Conclusion

- `ComplementNB` performs much better than a simple MultinomialNB, because our class labels are mostly unbalanced.
- `LatentDirichletAllocation` topics as features are not suitable for our classification problem, as features are often collinear. They often fare no better than the dummy classifier where we simply return the most frequent labels.
- LSA (Latent Semantic Analysis, Tfidf + SVD) shows a much more promising outlook, especially when combined with Linear Discriminant Analysis or SVC.
- Finding the right vocabulary (min_df and ngram range) is crucial. Throwing away noise early often outperforms running dimension reduction later.
- Basically, SVD makes the features (components) more independent of each other, making it easier for LDA and SVC to come up with good fits.
- Tree based models are not particularly useful. But the results may be different had we tuned the tree structure more.

## Next Steps

Required:

- Tune hyperparameters for `ComplementNB`, `TruncatedSVD`, `LinearDiscriminantAnalysis` and `SVC`/`LinearSVC`. Try different kernel functions.
- Try over-/under-sampling since most of our classes are imbalanced. [Possible solution](https://imbalanced-learn.org/)
- Test some boosting methods, especially [xgboost](https://xgboost.readthedocs.io/en/latest/).
- Test word embedding as features.

Optional:

- Possibly use different classifiers for different labels.
- Test two step predictions: first run binary prediction for "mentioned" vs "not mentioned", i.e., -2 vs (-1, 0, 1), then predict (-1, 0, 1).
    - This could happen as either [ClassifierChain](https://scikit-learn.org/stable/modules/multiclass.html#classifierchain) or separate steps.

In [None]:
model = results['models']['lsa_500_en_LDA']
print(X_test[0:1].shape)
probas = model.predict_proba(X_test[0:1])
probas[0].shape

In [None]:
model.predict(X_test[0:1])