In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import logging

logging.getLogger('jieba').setLevel(logging.WARN)
logging.getLogger('fgclassifier').setLevel(logging.INFO)

In [2]:
import os

os.chdir('..')

In [3]:
import sklearn
sklearn.__version__

'0.20.1'

# Model Selection

## Baseline Models

This notebook shows how to use our baseline model.
It also demonstrates how to test different feature models (i.e.,
different ways of building the features) at the same time.

We will mostly use the Google-translated English dataset for this
demonstration.

In [4]:
import config
from collections import defaultdict
from sklearn.model_selection import train_test_split

from fgclassifier.utils import read_data, get_dataset

X_train, y_train = read_data(get_dataset('train_en'))
X_test, y_test = read_data(get_dataset('valid_en'))

2018-12-05 20:49:37,309 [INFO] Reading /opt/storage/english_train.csv..
2018-12-05 20:49:37,616 [INFO] Reading /opt/storage/english_valid.csv..


In [5]:
# del fm['tfidf_sv']
# del fm['tfidf_sv_dense']
# del fm['lsa_200_sv']
# del fm['lsa_500_sv']
# del fm['count_tiny']

In [6]:
# Cache feature models and trained features; we make this cache object
# so different steps can reuse previously transformed features
fm = defaultdict(dict)

In [7]:
from fgclassifier.features import FeaturePipeline, logger

def build_features(fm_names, fm):
    """Build each named feature pipeline and run it on the train/test data.

    For every spec name in ``fm_names`` a ``FeaturePipeline`` is created with
    ``fm`` passed as its ``cache``; it is then applied to the notebook-level
    ``X_train`` (``fit_transform``) and ``X_test`` (``transform``).

    The transformed outputs are not kept here. Presumably the pipeline stores
    them in ``fm`` (later cells read them back from it) -- confirm against
    ``FeaturePipeline``.
    """
    for spec_name in fm_names:
        logger.info(f'Building features for {spec_name}...')
        pipeline = FeaturePipeline.from_spec(spec_name, cache=fm)
        # Return values are deliberately ignored.
        pipeline.fit_transform(X_train)
        pipeline.transform(X_test)
    
build_features(['count_en', 'count_en_sv'], fm)

2018-12-05 20:49:38,572 [INFO] 'pattern' package not found; tag filters are not available for English
2018-12-05 20:49:38,585 [INFO] Building features for count_en...
2018-12-05 20:50:05,052 [INFO] Vocab Size: 4000
2018-12-05 20:50:06,219 [INFO] Building features for count_en_sv...
2018-12-05 20:50:29,361 [INFO] Vocab Size: 2000


Examine the quality of the top terms:

In [8]:
from collections import Counter

print('Data Shape:', X_train.shape, X_test.shape)

# For each count-based feature model, report the vectorizer settings,
# the vocabulary size and the five most frequent terms in the training data.
for mn in ['count_en', 'count_en_sv']:
    # The step named after the feature model inside the cached pipeline;
    # it exposes vocabulary_, stop_words_, min_df, max_df and ngram_range.
    model = fm[mn]['model'].named_steps[mn]
    # First cache key containing 'fit_transform' -> cached training matrix
    key = next(filter(lambda x: 'fit_transform' in x, fm[mn].keys()))
    x_train = fm[mn][key]
    # Column sums = total count of every vocabulary term over all documents
    counts = np.sum(x_train, axis=0).flat
    # Map term -> total count (vocabulary_ maps term -> column index)
    counts = {k: counts[v] for k, v in model.vocabulary_.items()}
    print('\nmin_df: %.3f, max_df: %.3f, ngram_range: %s' % (
        model.min_df, model.max_df, model.ngram_range
    ))
    
    print('\nvocab size: %s\n' % len(model.vocabulary_))
    # stop_words_ is falsy once it has been cleared below (e.g. on a re-run)
    if model.stop_words_:
        print('\nStop words size: %s\n' % len(model.stop_words_))
    
    # Remove stop words as they take a lot of memory
    # NOTE: this mutates the model object stored in the `fm` cache.
    model.stop_words_ = None
    
    # Five most frequent terms, one "term <tab> count" pair per line
    print('\n'.join([
        '%s \t %s' % (k, v)
        for k, v in Counter(counts).most_common()[:5]]))

Data Shape: (8000,) (2000,)

min_df: 3.000, max_df: 1.000, ngram_range: (1, 6)

vocab size: 4000


Stop words size: 3712557

good 	 18975
taste 	 13095
eat 	 11877
time 	 7277
delicious 	 7088

min_df: 3.000, max_df: 1.000, ngram_range: (1, 6)

vocab size: 2000


Stop words size: 3714557

good 	 18975
taste 	 13095
eat 	 11877
time 	 7277
delicious 	 7088


Save the word count features, as it is pretty slow to run for the whole dataset...

In [9]:
from sklearn.externals import joblib


def partial_get(d, keyword):
    """Return the value of the first key in ``d`` that starts with ``keyword``.

    Keys are scanned in the dict's iteration order and the first match wins.

    Args:
        d: mapping whose keys are strings.
        keyword: prefix the key has to start with.

    Returns:
        The value stored under the first matching key.

    Raises:
        KeyError: if no key starts with ``keyword``. The previous
            ``next(filter(...))`` form leaked a bare ``StopIteration``,
            which carries no message and is silently swallowed when it
            escapes inside a ``for`` loop or comprehension.
    """
    for key, value in d.items():
        if key.startswith(keyword):
            return value
    raise KeyError(
        f'no key starting with {keyword!r}; available keys: {list(d)}')

def save_transform_cache(mn, path=None):
    """Dump the cached train/test features of feature model ``mn`` to disk.

    Args:
        mn: name of the feature model; used as the key into the
            notebook-level ``fm`` cache.
        path: file path prefix. When falsy, ``data/fm_cache-<mn>`` is used.

    Two files are written with ``joblib.dump``: ``<path>-train.pkl`` (the
    entry whose cache key starts with ``fit_transform``) and
    ``<path>-test.pkl`` (the entry whose key starts with ``transform``).
    """
    if not path:
        path = f'data/fm_cache-{mn}'
    cached = fm[mn]
    # Look up both entries before writing anything, so a missing entry
    # fails before any file is created.
    train_features = partial_get(cached, 'fit_transform')
    test_features = partial_get(cached, 'transform')
    outputs = (
        (train_features, '-train.pkl'),
        (test_features, '-test.pkl'),
    )
    for features, suffix in outputs:
        joblib.dump(features, path + suffix)
    print(f'Saved {path}')
    
save_transform_cache('count_en')
save_transform_cache('count_en_sv')
# Sanity check: reload one of the files written above. The path must match
# the `data/fm_cache-<name>-test.pkl` pattern used by save_transform_cache;
# the former `fm_cache-count-test.pkl` is not produced by this notebook.
joblib.load('data/fm_cache-count_en-test.pkl')

Saved data/fm_cache-count_en
Saved data/fm_cache-count_en_sv


<2000x4000 sparse matrix of type '<class 'numpy.int64'>'
	with 280136 stored elements in Compressed Sparse Row format>

Build more features

In [20]:
build_features(['tfidf_en', 'tfidf_en_dense', 'lsa_500_en', 'lsa_1k_en'], fm)
build_features(['tfidf_en_sv', 'tfidf_en_sv_dense', 'lsa_500_en_sv', 'lsa_1k_en_sv'], fm)
# build_features(['tfidf_tiny', 'tfidf_tiny_dense', 'lsa_500_tiny'], fm)
build_features(['word2vec_en'], fm)

2018-12-05 21:11:08,921 [INFO] Building features for tfidf_en...
2018-12-05 21:11:08,922 [INFO]   tfidf_en: fit_transform use cache.
2018-12-05 21:11:08,923 [INFO]   tfidf_en: transform use cache.
2018-12-05 21:11:08,923 [INFO] Building features for tfidf_en_dense...
2018-12-05 21:11:08,924 [INFO]   tfidf_en_dense: fit_transform use cache.
2018-12-05 21:11:08,925 [INFO]   tfidf_en_dense: transform use cache.
2018-12-05 21:11:08,925 [INFO] Building features for lsa_500_en...
2018-12-05 21:11:08,926 [INFO]   lsa_500_en: fit_transform use cache.
2018-12-05 21:11:08,927 [INFO]   lsa_500_en: transform use cache.
2018-12-05 21:11:08,928 [INFO] Building features for lsa_1k_en...
2018-12-05 21:11:08,928 [INFO]   lsa_1k_en: fit_transform use cache.
2018-12-05 21:11:08,929 [INFO]   lsa_1k_en: transform use cache.
2018-12-05 21:11:08,931 [INFO] Building features for tfidf_en_sv...
2018-12-05 21:11:08,935 [INFO]   tfidf_en_sv: fit_transform use cache.
2018-12-05 21:11:08,942 [INFO]   tfidf_en_sv: 

In [21]:
save_transform_cache('tfidf_en')
save_transform_cache('lsa_500_en')
save_transform_cache('lsa_1k_en')

save_transform_cache('tfidf_en_sv')
save_transform_cache('lsa_500_en_sv')
save_transform_cache('lsa_1k_en_sv')

# save_transform_cache('tfidf_tiny')
# save_transform_cache('lsa_500_tiny')
# save_transform_cache('lsa_1k_tiny')

save_transform_cache('word2vec_en')

Saved data/fm_cache-tfidf_en
Saved data/fm_cache-lsa_500_en
Saved data/fm_cache-lsa_1k_en
Saved data/fm_cache-tfidf_en_sv
Saved data/fm_cache-lsa_500_en_sv
Saved data/fm_cache-lsa_1k_en_sv
Saved data/fm_cache-word2vec_en


## The Very Basic TF-IDF + LDA classifier

In [23]:
# Import all feature models at once, so as to avoid
# classes being reloaded and causing save_model to fail
from fgclassifier.baseline import Baseline, Dummy
from fgclassifier.train import fm_cross_check

Check a basic model

In [24]:
model = Baseline('SGD_Logistic', fm=fm['lsa_1k_en']['model'])
# Always pass in the original features
# the pipeline will take care of the cache
model.fit(X_train, y_train)
print(f'{model.name} final score:', model.score(X_test, y_test))

2018-12-05 21:11:37,288 [INFO]   lsa_1k_en: fit_transform use cache.
2018-12-05 21:11:47,336 [INFO]   lsa_1k_en: transform use cache.
  'precision', 'predicted', average, warn_for)
2018-12-05 21:11:47,449 [INFO]  F1 Score: 0.4400537993260894


lsa_1k_en_SGD_Logistic final score: 0.4400537993260894


Logistic regression is much slower, but does not perform much better than stochastic (SGD) logistic regression.

In [25]:
model = Baseline('SGD_Huber', fm=fm['lsa_1k_en']['model'])
model.fit(X_train, y_train)
print(f'{model.name} final score:', model.score(X_test, y_test))

2018-12-05 21:11:47,504 [INFO]   lsa_1k_en: fit_transform use cache.
2018-12-05 21:11:56,691 [INFO]   lsa_1k_en: transform use cache.
2018-12-05 21:11:56,836 [INFO]  F1 Score: 0.41832082945849275


lsa_1k_en_SGD_Huber final score: 0.41832082945849275


In [26]:
model = Baseline('SGD_SVC', fm=fm['lsa_1k_en']['model'])
model.fit(X_train, y_train)
print(f'{model.name} final score:', model.score(X_test, y_test))

2018-12-05 21:11:56,905 [INFO]   lsa_1k_en: fit_transform use cache.
2018-12-05 21:12:05,392 [INFO]   lsa_1k_en: transform use cache.
  'precision', 'predicted', average, warn_for)
2018-12-05 21:12:05,499 [INFO]  F1 Score: 0.44261309492249856


lsa_1k_en_SGD_SVC final score: 0.44261309492249856


In [27]:
# model = Baseline('Ridge', fm=fm['lsa_1k']['model'])
# model.fit(X_train, y_train)
# print(f'{model.name} final score:', model.score(X_test, y_test))

## Search for the Best Feature + Classifier Combination

In [28]:
# Run for all classifiers and feature builders
train_avg_scores, train_scores = defaultdict(dict), defaultdict(dict)
test_avg_scores, test_scores = defaultdict(dict), defaultdict(dict)
test_time, train_time = defaultdict(dict), defaultdict(dict)

In [29]:
from fgclassifier import classifiers
from fgclassifier.baseline import Dummy

Dummy(classifiers.DummyStratified)

Dummy(classifier=None)

In [30]:
conf = {
    'fm_cache': fm,
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'results': {
        'models': {},
        'test': test_scores,
        'test_time': test_time,
        'test_avg': test_avg_scores,
        'train': train_scores,
        'train_time': train_time,
        'train_avg': train_avg_scores
    }
}

In [32]:
from IPython.display import clear_output

# # We'd only need to run the dummy models on one feature model,
# # as they do not care about the features
fm_cross_check(
    ['count_en', 'count_en_sv',
     'tfidf_en', 'tfidf_en_sv'],
    ['DummyStratified', 'DummyMostFrequent'],
    model_cls=Dummy, **conf)

# # Naive Bayes models cannot handle negative values, so we pass
# # in only the count and tfidf features
fm_cross_check(
    ['count_en', 'count_en_sv',
     'tfidf_en', 'tfidf_en_sv'],
    ['ComplementNB'], **conf)
clear_output()

In [33]:
# Only SGD methods can handle sparse matrix
fm_cross_check(
    [
     'tfidf_en', 'lsa_500_en', 'lsa_1k_en',
     'tfidf_en_sv', 'lsa_500_en_sv', 'lsa_1k_en_sv',
     'count_en', 'count_en_sv',
    ],
    ['SGD_Logistic', 'SGD_SVC'], **conf)

clear_output()

In [34]:
# All other models can run on many classifiers
results = fm_cross_check(
    ['lsa_500_en', 'lsa_1k_en',
     'lsa_500_en_sv', 'lsa_1k_en_sv'],
    ['LDA', 'Ridge'], **conf)

clear_output()

In [35]:
results = fm_cross_check(
    ['word2vec_en'],
    ['SGD_Logistic', 'SGD_SVC', 'LDA', 'Ridge'], **conf)

2018-12-05 21:20:02,487 [INFO] 
2018-12-05 21:20:02,489 [INFO] 
2018-12-05 21:20:02,489 [INFO] Train for word2vec_en -> SGD_Logistic...
2018-12-05 21:20:02,490 [INFO]   word2vec_en: fit_transform use cache.
2018-12-05 21:20:40,366 [INFO]   word2vec_en: transform use cache.
2018-12-05 21:20:40,414 [INFO] -------------------------------------------------------
2018-12-05 21:20:40,414 [INFO] 【word2vec_en -> SGD_Logistic】 Train: 0.3425, Test: 0.3176
2018-12-05 21:20:40,415 [INFO] -------------------------------------------------------
2018-12-05 21:20:40,415 [INFO] Train for word2vec_en -> SGD_SVC...
2018-12-05 21:20:40,416 [INFO]   word2vec_en: fit_transform use cache.
2018-12-05 21:20:44,228 [INFO]   word2vec_en: transform use cache.
2018-12-05 21:20:44,458 [INFO]   word2vec_en: transform use cache.
2018-12-05 21:20:44,500 [INFO] -------------------------------------------------------
2018-12-05 21:20:44,501 [INFO] 【word2vec_en -> SGD_SVC】 Train: 0.3682, Test: 0.3409
2018-12-05 21:20:44,

In [36]:
results = fm_cross_check(
    ['tfidf_en_sv_dense'],
    ['LDA', 'Ridge'], **conf)

2018-12-05 21:21:03,622 [INFO] 
2018-12-05 21:21:03,624 [INFO] 
2018-12-05 21:21:03,624 [INFO] Train for tfidf_en_sv_dense -> LDA...
2018-12-05 21:21:03,625 [INFO]   tfidf_en_sv_dense: fit_transform use cache.
2018-12-05 21:24:16,191 [INFO]   tfidf_en_sv: transform use cache.
2018-12-05 21:24:16,757 [INFO]   tfidf_en_sv_dense: transform use cache.
2018-12-05 21:24:16,897 [INFO] -------------------------------------------------------
2018-12-05 21:24:16,898 [INFO] 【tfidf_en_sv_dense -> LDA】 Train: 0.8396, Test: 0.4483
2018-12-05 21:24:16,899 [INFO] -------------------------------------------------------
2018-12-05 21:24:16,900 [INFO] Train for tfidf_en_sv_dense -> Ridge...
2018-12-05 21:24:16,900 [INFO]   tfidf_en_sv_dense: fit_transform use cache.
2018-12-05 21:27:35,654 [INFO]   tfidf_en_sv_dense: transform use cache.
  'precision', 'predicted', average, warn_for)
2018-12-05 21:27:36,442 [INFO]   tfidf_en_sv_dense: transform use cache.
  'precision', 'predicted', average, warn_for)
20

In [None]:
# This is very slow ...
results = fm_cross_check(
    ['tfidf_en_dense'],
    ['LDA', 'Ridge'], **conf)

2018-12-05 21:27:36,680 [INFO] 
2018-12-05 21:27:36,682 [INFO] 
2018-12-05 21:27:36,683 [INFO] Train for tfidf_en_dense -> LDA...
2018-12-05 21:27:36,684 [INFO]   tfidf_en_dense: fit_transform use cache.


In [None]:
# Boosting is too damn slow...
# results = fm_cross_check(
#     ['lsa_500',
#      'lsa_500_sv',
#      'lsa_500_tiny',
#      'lsa_1k',
#      'word2vec',
#     ],
#     ['XGB', 'AdaBoost', 'GradientBoost'], **conf)
# clear_output()

In [None]:
def merge_dense(obj):
    """Fold the ``*_dense`` tfidf entries into their non-dense counterparts.

    Works on a shallow copy of ``obj``, so the mapping passed in keeps its
    keys. For each (plain, dense) pair the two inner dicts are merged, with
    values from the plain entry winning on key clashes, and the dense entry
    is then removed from the copy.

    Args:
        obj: mapping that holds (or, like a ``defaultdict``, can create) the
            keys ``tfidf_en``, ``tfidf_en_dense``, ``tfidf_en_sv`` and
            ``tfidf_en_sv_dense``, each mapping to a dict.

    Returns:
        The merged copy, without the two ``*_dense`` keys.
    """
    merged = obj.copy()
    pairs = (
        ('tfidf_en', 'tfidf_en_dense'),
        ('tfidf_en_sv', 'tfidf_en_sv_dense'),
    )
    for plain_name, dense_name in pairs:
        # Dense entries first, so the plain entries override them
        merged[plain_name] = {**merged[dense_name], **merged[plain_name]}
    for _, dense_name in pairs:
        del merged[dense_name]
    return merged

def extract_scores(scores, avg_scores):
    """Build a table of average and per-label scores, one row per model.

    Both inputs first go through ``merge_dense``. Every
    ``<feature model>.<classifier>`` pair found in ``avg_scores`` becomes one
    row holding the average score followed by the individual scores taken
    from ``scores``.

    Args:
        scores: nested mapping ``scores[fm_name][clf_name]`` -> iterable of
            scores (assumed to be one per column of ``y_train`` -- the index
            assignment below fails otherwise).
        avg_scores: nested mapping ``avg_scores[fm_name][clf_name]`` ->
            average score.

    Returns:
        DataFrame indexed by ``<fm_name>.<clf_name>`` with the columns
        ``average`` plus the ``y_train`` column names, sorted by ``average``
        in descending order.
    """
    per_label = merge_dense(scores)
    averages = merge_dense(avg_scores)
    columns = {
        f'{fm_name}.{clf_name}': [avg, *per_label[fm_name][clf_name]]
        for fm_name in per_label
        for clf_name, avg in averages[fm_name].items()
    }
    table = pd.DataFrame(columns)
    table.index = ['average', *y_train.columns]
    return table.T.sort_values('average', ascending=False)

df_train = extract_scores(train_scores, train_avg_scores)
df_test = extract_scores(test_scores, test_avg_scores)
df_test

In [None]:
# Save the results
results = conf['results'].copy()
del results['models']  # don't save models (which are huuuuge)
joblib.dump(results, 'data/model-selection-en.pkl')

In [None]:
# important columns
imp_cols = [
    'count_en', 'count_en_sv',
    'tfidf_en', 'tfidf_en_sv',
    'lsa_500_en', 'lsa_500_en_sv',
    'lsa_1k_en', 'lsa_1k_en_sv',
    'word2vec_en']

def extract_avg_scores(scores):
    """Arrange average scores as a table ranked by mean score.

    ``scores`` goes through ``merge_dense`` and is turned into a DataFrame
    (outer keys become columns, inner keys become rows). A temporary ``avg``
    column and ``avg`` row hold the row-wise and column-wise means; they are
    used to sort both axes in descending order and are then dropped.

    Args:
        scores: nested mapping, assumed to be
            ``{feature model: {classifier: average score}}``.

    Returns:
        DataFrame restricted to the columns listed in the notebook-level
        ``imp_cols``. Rows are ordered best mean first; the column order is
        the one given by ``imp_cols``, because the final selection overrides
        the column sort.
    """
    grid = pd.DataFrame(merge_dense(scores))
    # Mean per row, stored as an extra column ...
    grid['avg'] = grid.mean(axis=1, skipna=True)
    # ... and mean per original column, added on the transposed frame so it
    # ends up as an extra row after flipping back.
    flipped = grid.T
    flipped['avg'] = flipped.mean(axis=1, skipna=True)
    grid = flipped.T
    ranked = (
        grid
        .sort_values(by='avg', axis=1, ascending=False)
        .sort_values(by='avg', ascending=False)
        .drop(['avg'], axis=1)
        .drop(['avg'], axis=0)
    )
    return ranked[imp_cols]

df_train_avg = extract_avg_scores(train_avg_scores)
df_test_avg = extract_avg_scores(test_avg_scores)
df_test_avg

In [None]:
def extract_running_time(times):
    """Arrange running times as a table ranked from fastest to slowest.

    ``times`` is turned into a DataFrame (outer keys become columns, inner
    keys become rows). A temporary ``avg`` column and ``avg`` row hold the
    row-wise and column-wise means; they are used to sort both axes in
    ascending order and are then dropped.

    Args:
        times: nested mapping, assumed to be
            ``{feature model: {classifier: running time}}``.

    Returns:
        DataFrame restricted to the columns listed in the notebook-level
        ``imp_cols``. Rows are ordered lowest mean first; the column order is
        the one given by ``imp_cols``, because the final selection overrides
        the column sort.
    """
    grid = pd.DataFrame(times)
    # Mean per row, stored as an extra column ...
    grid['avg'] = grid.mean(axis=1, skipna=True)
    # ... and mean per original column, added on the transposed frame so it
    # ends up as an extra row after flipping back.
    flipped = grid.T
    flipped['avg'] = flipped.mean(axis=1, skipna=True)
    grid = flipped.T
    ranked = (
        grid
        .sort_values(by='avg', axis=1, ascending=True)
        .sort_values(by='avg', ascending=True)
        .drop(['avg'], axis=1)
        .drop(['avg'], axis=0)
    )
    return ranked[imp_cols]
    
df_train_time = extract_running_time(train_time)
df_test_time = extract_running_time(test_time)
df_train_time

In [None]:
df_test_time

In [None]:
import matplotlib.pyplot as plt

# Box plot of the per-label test scores of every non-dummy model.
# Rows of df_test are labelled '<feature model>.<classifier>'. The dummy
# baselines are selected by name rather than through a hard-coded list: the
# old list used labels without the `_en` suffix ('tfidf.DummyStratified', ...)
# that do not exist in this notebook, which makes DataFrame.drop raise KeyError.
dummy_rows = [label for label in df_test.index if 'Dummy' in label]

df_test.drop(dummy_rows).T.drop(['average']).boxplot(
    figsize=(18, 6), rot=90)

plt.show()

Let's save the models for future use.

In [None]:
results = conf['results']

In [None]:
from fgclassifier.utils import save_model

def clear_cache(model):
    """Recursively reset the ``cache`` attribute of a model and its steps.

    If ``model`` has a ``steps`` attribute (an iterable of ``(name, step)``
    pairs), every step is processed first. Afterwards, if ``model`` itself
    has a ``cache`` attribute, it is set to ``None``.

    Returns:
        The same ``model`` object, modified in place.
    """
    children = model.steps if hasattr(model, 'steps') else ()
    for _, child in children:
        clear_cache(child)
    if hasattr(model, 'cache'):
        model.cache = None
    return model

# Persist every trained model except the unimportant QDA ones. Caches are
# cleared first, before each model is handed to save_model.
for name, model in results['models'].items():
    if 'QDA' not in name:
        clear_cache(model)
        save_model(model)

clear_output()

In [40]:
# Inspect the fitted step behind the `lsa_500_en_sv` features of one saved model.
# This notebook only builds `*_en` feature models, so the previous
# 'lsa_500_sv_SGD_SVC' / `lsa_500_sv` names cannot be present here.
# NOTE(review): assumes results['models'] is keyed by '<feature model>_<classifier>'
# (cf. the model name 'lsa_1k_en_SGD_Logistic' printed above) -- confirm against fm_cross_check.
lsa_step = results['models']['lsa_500_en_sv_SGD_SVC'].named_steps.lsa_500_en_sv.named_steps.lsa_500_en_sv
lsa_step.components_.shape

(500, 2000)

## Conclusion

- `ComplementNB` performs much better than a simple MultinomialNB, because our class labels are mostly unbalanced.
- `LatentDirichletAllocation` topics as features are not suitable for our classification problem, as features are often collinear. They often fare no better than the dummy classifier where we simply return the most frequent labels.
- LSA (Latent Semantic Analysis, Tfidf + SVD) shows a much more promising outlook, especially when combined with Linear Discriminant Analysis or SVC.
- Finding the right vocabulary (min_df, ngram range, max_features) is crucial. Throwing away noise early often outperforms running dimension reduction later.
- Basically, SVD makes the features (components) more independent of each other, making it easier for LDA and SVC to come up with good fits.
- Tree based models are not particularly useful. But the results may be different had we tuned the tree structure more.

## Next Steps

Required:

- Tune hyperparameters for `ComplementNB`, `TruncatedSVD`, `LinearDiscriminantAnalysis` and `SVC`/`LinearSVC`. Try different kernel functions.
- Try over-/under-sampling since most of our classes are imbalanced. [Possible solution](https://imbalanced-learn.org/)
- Test some boosting methods, especially [xgboost](https://xgboost.readthedocs.io/en/latest/).
- Test word embedding as features.

Optional:

- Possibly use a different classifier for different labels.
- Test two step predictions: first run binary prediction for "mentioned" vs "not mentioned", i.e., -2 vs (-1, 0, 1), then predict (-1, 0, 1).
    - This could happen as either [ClassifierChain](https://scikit-learn.org/stable/modules/multiclass.html#classifierchain) or separate steps.

In [None]:
# model = results['models']['lsa_500_en_LDA']
# print(X_test[0:1].shape)
# probas = model.predict_proba(X_test[0:1])
# probas[0].shape

In [None]:
# model.predict(X_test[0:1])