In [5]:
import gc
import numpy as np
import tensorflow as tf
import yaml
import joblib
import random
import xgboost
from pprint import pprint
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

In [9]:
# Read the experiment configuration (seed, artifact paths, ...) from YAML.
config_path = '../config/20news.yaml'
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)
# end with
pprint('=' * 20 + 'Configs' + '=' * 20)

# Start the module-level holders at None; every one except
# TEXT_EMBEDDING_MATRIX is replaced by the joblib loads further down.
LB = L = T = TEXT_EMBEDDING_MATRIX = None

# Seed Python's `random`, NumPy and TensorFlow with the same configured value.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(config['seed'])
# end for

# Load the binarizer, the encoder, the labeled training items and the test
# items from the paths named in the config (same order as the target names).
LB, LE, L, T = (
    joblib.load(config[key])
    for key in ('binarizer_out', 'encoder_out', 'labeled_train_out', 'test_out')
)

# Shuffle the labeled items in place, using the `random` module seeded above.
random.shuffle(L)



In [14]:
def train_svm(rep, random_state=0, tol=1e-5):
    """Fit a linear SVM on one document representation and print test metrics.

    Trains ``LinearSVC`` on the ``rep`` entry of every item in the
    module-level labeled set ``L`` (targets are read from ``item['cat_en']``),
    predicts on the module-level test set ``T``, passes the predictions
    through ``LE.inverse_transform`` and the ``item['cat_bin']`` ground truth
    through ``LB.inverse_transform``, then prints the accuracy score followed
    by the classification report.

    Args:
        rep: Key of the feature vector to read from each item
            (e.g. ``'doc2vec'``). Also embedded in the printed banner.
        random_state: Seed forwarded to ``LinearSVC``. Defaults to ``0``,
            the value this function always used before it was a parameter.
        tol: Stopping tolerance forwarded to ``LinearSVC``. Defaults to
            ``1e-5``, the value this function always used before.

    Returns:
        None. All results are written to stdout.
    """
    print('='*20 + rep + 'svm' + '='*20, flush=True)

    # L, T, LE and LB are module-level objects that are only read here, so no
    # ``global`` declaration is required.
    X_train = np.array([np.array(item[rep]) for item in L])
    y_train = np.array([np.array(item['cat_en']) for item in L])

    # Single classifier trained on the chosen representation only.
    h1 = LinearSVC(random_state=random_state, tol=tol)
    h1.fit(X_train, y_train)

    print('=' * 50 + 'Predicting on Test Set....' + '=' * 50, flush=True)

    T_test = np.array([np.array(item[rep]) for item in T])
    h1_y_pred = h1.predict(T_test)
    # The test matrix is no longer needed once predictions exist; free it
    # before building the reports.
    del T_test
    gc.collect()

    # Bring predictions and ground truth into the same label space before
    # scoring: predictions go back through the encoder, the ground truth
    # through the binarizer.
    h1_y_pred = LE.inverse_transform(h1_y_pred)

    y_true = np.array([item['cat_bin'] for item in T])
    y_true = LB.inverse_transform(y_true)

    print(accuracy_score(y_true, h1_y_pred), flush=True)

    print(classification_report(y_true, h1_y_pred), flush=True)
# end def

In [15]:
# Train and evaluate the linear SVM on the 'doc2vec' representation.
train_svm('doc2vec')

0.4318906001062135
                          precision    recall  f1-score   support

             alt.atheism       0.26      0.32      0.29       319
           comp.graphics       0.33      0.40      0.36       389
 comp.os.ms-windows.misc       0.30      0.31      0.30       394
comp.sys.ibm.pc.hardware       0.34      0.36      0.35       392
   comp.sys.mac.hardware       0.29      0.25      0.26       385
          comp.windows.x       0.44      0.42      0.43       395
            misc.forsale       0.55      0.57      0.56       390
               rec.autos       0.53      0.47      0.50       396
         rec.motorcycles       0.53      0.54      0.53       398
      rec.sport.baseball       0.39      0.62      0.48       397
        rec.sport.hockey       0.62      0.64      0.63       399
               sci.crypt       0.71      0.51      0.59       396
         sci.electronics       0.34      0.30      0.32       393
                 sci.med       0.66      0.59      0.62 