In [1]:
import logging
logging.basicConfig()
logger = logging.getLogger("Sklearn SVM")
logger.setLevel(logging.INFO)

from pyvi import ViTokenizer
from sklearn.preprocessing import MaxAbsScaler, MultiLabelBinarizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import wandb

from utils.data_loader import load_phoatis


class SVMClassifier:
    """ 
    * Support Vector Machine with linear kernel and 
        TF-IDF feturizer by default.
    * This class uses the Sklearn's Pipeline.
    * It also handles for multilabel problem by using
        MultiOutputClassifier from Sklearn.
    """
    def __init__(self,
                 kernel="linear",
                 vectorizer=TfidfVectorizer,
                 tokenizer=True,
                 scaler="maxabsscaler",
                 probability=True,
                 random_state=42,
                 num_cpu=-1):
        self.kernel = kernel
        self.random_state = random_state
        self.probability = probability
        self.num_cpu = num_cpu
        self.tokenizer = self.__tokenize if tokenizer else None

        # pipeline configuration
        self.vectorizer = vectorizer(token_pattern=r"(?u)\b\w+\b",
                                          tokenizer=self.tokenizer)
        if scaler == "maxabsscaler":
            self.scaler = MaxAbsScaler()

        self.classifier = SVC(kernel=self.kernel,
                              random_state=self.random_state,
                              verbose=1,
                              probability=self.probability)

        self.pipeline = Pipeline([
            ("vectorizer", self.vectorizer),
            ("scaler", self.scaler),
            ("clf", MultiOutputClassifier(self.classifier, n_jobs=self.num_cpu))
        ])
        
        # This gird has to depend on training set size
        # due to training time cost.
        self.grid_params = {
            "vectorizer__ngram_range": [(1, 1), (1, 2)],
            "vectorizer__max_df": [0.25, 0.5, 1],
            "clf__estimator__C": [1e-3, 1e-2, 0.1, 1]
        }

    def train(self, X, y, **kwargs):
        try:
            if kwargs["label_binarizer"]:
                self.label_binarizer = kwargs["label_binarizer"]
            else:
                self.label_binarizer = None

            self.tuner = GridSearchCV(estimator=self.pipeline,
                                      param_grid=self.grid_params,
                                      scoring="f1_macro",
                                      n_jobs=self.num_cpu,
                                      cv=5,
                                      verbose=3,
                                      error_score=0.,
                                      refit=True,
                                      return_train_score=True)
            # Search hyperparameter
            self.tuner.fit(X, y)

            # return best model
            logger.info(f"GridSearchCV Reports:\n{self.tuner.cv_results_}\n\n")
            logger.info(f"Best estimator: {self.tuner.best_estimator_}\n\n")
            logger.info(f"Best score: {self.tuner.best_score_}\n\n")
        except Exception as e:
            logger.error(f"{e}")
            return None
        
        self.model = self.tuner.best_estimator_
        return self
    
    def predict(self, X):
        predicted_result = self.model.predict(X)
        return predicted_result

    def evaluate(self, X, y, wandb=False):
        y_pred = self.model.predict(X)
        y_prob = self.model.predict_proba(X)
        y_true = y

        # Sklearn's classification report
        clf_report = classification_report(y_true, y_pred,
                                           target_names=self.label_binarizer.classes_)
        logger.info(f"Classification Report:\n{clf_report}\n\n")
        
        if wandb:
            wandb.sklearn.plot_roc(y_true, y_prob, labels=self.label_binarizer.classes_)

    def __tokenize(self, text):
        return ViTokenizer.tokenize(text).split()

In [16]:
import numpy as np
help(np.argsort)

Help on function argsort in module numpy:

argsort(a, axis=-1, kind=None, order=None)
    Returns the indices that would sort an array.
    
    Perform an indirect sort along the given axis using the algorithm specified
    by the `kind` keyword. It returns an array of indices of the same shape as
    `a` that index data along the given axis in sorted order.
    
    Parameters
    ----------
    a : array_like
        Array to sort.
    axis : int or None, optional
        Axis along which to sort.  The default is -1 (the last axis). If None,
        the flattened array is used.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm. The default is 'quicksort'. Note that both 'stable'
        and 'mergesort' use timsort under the covers and, in general, the
        actual implementation will vary with data type. The 'mergesort' option
        is retained for backwards compatibility.
    
        .. versionchanged:: 1.15.0.
           The 'stab

In [2]:
""" How to use? """

# model = SVMClassifier()
# model.train(X, y)
# model.predict(X)

' How to use? '

In [26]:
wandb.init(project="va-intent-detection", entity="emandai")




VBox(children=(Label(value='1.289 MB of 1.289 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

## PhoATIS

In [3]:
""" Load & Preprocess Data """

train, val, test = load_phoatis("../data/phoatis/")
train_X, train_y = train[0], train[1]
val_X, val_y = val[0], val[1]
test_X, test_y = test[0], test[1]


def _preprocess(data):
    data = [x.split("\n")[0] for x in data]
    return data


# this remove \n
iters = list(map(_preprocess,
                [train_X, train_y,
                 val_X, val_y,
                 test_X, test_y]))
train_X, train_y = iters[0], iters[1]
val_X, val_y = iters[2], iters[3]
test_X, test_y = iters[4], iters[5]

In [4]:
def _transform_label(labels):
    labels = [label.split("#") for label in labels]
    return labels

train_y = _transform_label(train_y)
val_y = _transform_label(val_y)
test_y = _transform_label(test_y)

In [5]:
intent_label = []
with open("../data/phoatis/intent_label.txt", "r") as f:
    intent_label = f.readlines()
    intent_label = [label.split("\n")[0] for label in intent_label] # remove \n
    intent_label = intent_label[1:] # remove UNK label
    intent_label = _transform_label(intent_label)

print("Intents:\n\n", intent_label)
print(f"\nNumber of intents: {len(intent_label)}")

Intents:

 [['abbreviation'], ['aircraft'], ['aircraft', 'flight', 'flight_no'], ['airfare'], ['airfare', 'flight'], ['airline'], ['airline', 'flight_no'], ['airport'], ['capacity'], ['city'], ['city', 'flight_time'], ['distance'], ['flight'], ['flight', 'flight_no'], ['flight', 'flight_time'], ['flight_no'], ['flight_no', 'flight_time'], ['flight_time'], ['ground_fare'], ['ground_fare', 'ground_service'], ['ground_service'], ['meal'], ['quantity'], ['restriction']]

Number of intents: 24


In [6]:
lb = MultiLabelBinarizer()
train_y = lb.fit_transform(train_y)
val_y = lb.transform(val_y)
test_y = lb.transform(test_y)

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


In [None]:
model = SVMClassifier(tokenizer=False)
model = model.train(train_X, train_y, label_binarizer=lb)

In [29]:
model.evaluate(test_X, test_y)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
INFO:Sklearn SVM:Classification Report:
                precision    recall  f1-score   support

  abbreviation       1.00      0.98      0.99        52
      aircraft       0.80      0.80      0.80        10
       airfare       0.86      0.90      0.88        62
       airline       0.94      0.84      0.89        19
       airport       1.00      0.89      0.94        18
      capacity       1.00      0.95      0.98        21
          city       1.00      0.50      0.67         6
      distance       1.00      0.90      0.95        10
        flight       0.98      0.99      0.98       645
     flight_no       1.00      1.00      1.00         9
   flight_time       1.00      1.00      1.00         1
   ground_fare       1.00      0.86      0.92         7
ground_service       1.00      1.00      1.00        36
          meal       1.00      0.67      0.80         6
      qua

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 3/5] END clf__estimator__C=0.001, vectorizer__max_df=0.25, vectorizer__ngram_range=(1, 1);, score=(train=0.053, test=0.053) total time=   9.1s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 1/5] END clf__estimator__C=0.001, vectorizer__max_df=1, vectorizer__ngram_range=(1, 1);, score=(train=0.053, test=0.053) total time=   1.9s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 2/5] END clf__estimator__C=0.001, vectorizer__max_df=1, vectorizer__ngram_range=(1, 2);, score=(train=0.053, test=0.053) total time=   1.8s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 4/5] END clf__estimator__C=0.01, vectorizer__max_

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 2/5] END clf__estimator__C=0.001, vectorizer__max_df=0.25, vectorizer__ngram_range=(1, 2);, score=(train=0.053, test=0.053) total time=  13.6s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 3/5] END clf__estimator__C=0.01, vectorizer__max_df=0.25, vectorizer__ngram_range=(1, 2);, score=(train=0.245, test=0.237) total time=  12.7s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 4/5] END clf__estimator__C=0.01, vectorizer__max_df=1, vectorizer__ngram_range=(1, 2);, score=(train=0.053, test=0.054) total time=   1.9s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 5/5] END clf__estimator__C=0.1, vectorizer__max_

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 1/5] END clf__estimator__C=0.001, vectorizer__max_df=0.25, vectorizer__ngram_range=(1, 1);, score=(train=0.053, test=0.053) total time=   9.0s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 3/5] END clf__estimator__C=0.001, vectorizer__max_df=0.5, vectorizer__ngram_range=(1, 2);, score=(train=0.053, test=0.053) total time=  14.6s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 4/5] END clf__estimator__C=0.01, vectorizer__max_df=0.5, vectorizer__ngram_range=(1, 2);, score=(train=0.249, test=0.263) total time=  16.0s
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][CV 5/5] END clf__estimator__C=0.1, vector

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(tru

### quick test

In [14]:
# text = "chị chị điện cho anh tuấn nhưng mà anh tuấn ho không không dùng số này đóa chị nha"
text = ""
pred = model.predict([text])
print(pred)
print(lb.inverse_transform(pred))

[[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]]
[('flight',)]


## EMANDAI