In [12]:
import os
import warnings

import joblib
import numpy as np
import pandas as pd
from pyvi import ViPosTagger, ViTokenizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import SVC
warnings.filterwarnings("ignore")

### Preprocessing text

In [2]:
def import_dataset(filename, separator="--- # ---", label_delimiter="|"):
    """Load utterances and their intent labels from a text file.

    Every line that contains ``separator`` is read as
    ``<text><separator><labels>``, where ``<labels>`` holds one or more label
    names joined by ``label_delimiter``. Lines without ``separator`` are
    skipped.

    The text part is split on whitespace. In each word that contains neither
    ``"sys."`` nor ``"dev."`` every underscore is replaced by a space; words
    containing one of those markers are kept verbatim. The words are then
    re-joined with single spaces.

    Args:
        filename: Path of the UTF-8 encoded dataset file.
        separator: Marker between the text and the labels on a line.
        label_delimiter: Marker between individual labels.

    Returns:
        A tuple ``(X, y, categories)`` where ``X`` is a list of cleaned text
        strings, ``y`` is a list (parallel to ``X``) of sets of label names,
        and ``categories`` is the set of every label name seen in the file.

    Raises:
        ValueError: If a line contains ``separator`` more than once (the
            two-name unpacking of the split result fails).
    """
    X = []
    y = []
    categories = set()
    with open(filename, "r", encoding='utf-8') as file:
        for line in file:
            if separator not in line:
                continue
            data, label = line.split(separator)
            words = [
                word if ("sys." in word or "dev." in word) else word.replace('_', ' ')
                for word in data.split()
            ]
            labels = set(label.strip().split(label_delimiter))
            # strip() after the join: a word such as "_foo" turns into " foo",
            # which may leave whitespace at either end of the joined text.
            X.append(' '.join(words).strip())
            y.append(labels)
            categories.update(labels)

    return X, y, categories

In [3]:
# Parse the training and test splits. `categories` (the set of known label
# names) is taken from the training file only; the label set collected from
# the test file is discarded.
X_train, y_train, categories = import_dataset('data/intentCompactTrainVi.txt')
X_test, y_test, _ = import_dataset('data/intentCompactTestVi.txt')

### Init pipeline

In [4]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless text cleaner used as the first pipeline step.

    Each input text is lower-cased, passed through ``ViTokenizer.tokenize``,
    split on whitespace, stripped of every token found in the stop-word set,
    and re-joined with single spaces.

    Args:
        stopwords_path: Path of a UTF-8 text file with one stop word per
            line. Defaults to the file this notebook ships with.

    Attributes:
        stopwords_path: The path the stop words were read from.
        stopwords: Set of stop-word strings (surrounding whitespace removed).
    """

    # Fallback for instances unpickled from files written before the
    # `stopwords_path` parameter existed; they carry no such instance
    # attribute, and BaseEstimator.get_params() looks parameters up by name.
    stopwords_path = "data/vietnamese-stopwords.txt"

    def __init__(self, stopwords_path="data/vietnamese-stopwords.txt"):
        self.stopwords_path = stopwords_path
        self.stopwords = set()
        with open(stopwords_path, 'r', encoding='utf-8') as file:
            for line in file:
                word = line.strip()
                # Blank lines would only add "" to the set, which can never
                # equal a token produced by str.split().
                if word:
                    self.stopwords.add(word)

    def fit(self, *_):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return one cleaned string per input text.

        Args:
            X: Iterable of text strings.
            y: Ignored.
            **fit_params: Ignored.

        Returns:
            A list of strings, in the same order as ``X``.
        """
        # NOTE(review): stop words are compared against whole tokens as
        # emitted by ViTokenizer. If the tokenizer joins the syllables of a
        # multi-syllable word with "_" while the stop-word file spells such
        # entries with spaces, those entries never match - confirm the file
        # format matches the tokenizer output.
        segmented = (ViTokenizer.tokenize(text.lower()) for text in X)
        return [
            " ".join(token for token in text.split() if token not in self.stopwords)
            for text in segmented
        ]

In [5]:
# Classification pipeline: FeatureTransformer text cleaning -> CountVectorizer
# token counts -> one SVC per label via OneVsRestClassifier.
# NOTE: despite the variable name, the active classifier is SVC; the
# TfidfVectorizer and LogisticRegression lines are alternative configurations
# that are currently commented out.
LogReg_pipeline = Pipeline([
    ('transformer', FeatureTransformer()),
#     ("vect", TfidfVectorizer(ngram_range=(1,2), use_idf=0)),
    ("vect", CountVectorizer()),
#     ('clf', OneVsRestClassifier(LogisticRegression(C=10000))),
    ('clf', OneVsRestClassifier(SVC(C=100))),
])

### Multilabel encoding

In [6]:
# Sort the label names so the indicator-matrix column order is the same on
# every run; iterating a set of strings gives no such guarantee across
# interpreter sessions.
lb = MultiLabelBinarizer(classes=sorted(categories))
# NOTE: y_train / y_test are overwritten in place, so re-running this cell
# without re-running the dataset-loading cell would binarize arrays that are
# already binarized.
y_train = lb.fit_transform(y_train)
# The binarizer is already fitted on the training labels; only transform the
# test labels instead of fitting a second time.
y_test = lb.transform(y_test)

In [7]:
# Train model
# Pipeline.fit returns the pipeline itself, so `clf` and `LogReg_pipeline`
# are two names for the same fitted object.
clf = LogReg_pipeline.fit(X_train, y_train)

### Predict and summarize

In [8]:
def predict_from_text(text):
    """Predict the intent labels of one or more utterances.

    Args:
        text: A list (or other iterable) of utterance strings, or a single
            utterance string.

    Returns:
        The result of ``lb.inverse_transform`` on the pipeline prediction:
        one tuple of label names per utterance.
    """
    # A bare string would be iterated character by character by the pipeline,
    # so treat it as a batch of one utterance.
    if isinstance(text, str):
        text = [text]
    return lb.inverse_transform(LogReg_pipeline.predict(text))

print(predict_from_text(["sysfootballplayer mang áo số mấy tại sysfootballclub"]))

[('schema:isMemberOf[schema:sportNumber]@football_player',)]


In [9]:
prediction = LogReg_pipeline.predict(X_test)
# accuracy_score takes (y_true, y_pred). On multilabel indicator input it
# reports subset accuracy: a sample counts as correct only when its whole
# label set matches.
print("Accuracy score:", accuracy_score(y_test, prediction))

Accuracy score: 1.0


In [10]:
def compare2list(pre, org):
    """Return True when both sequences hold equal items in the same order.

    Args:
        pre: Iterable of predicted values.
        org: Iterable of reference values.

    Returns:
        ``True`` if ``pre`` and ``org`` have the same length and every pair
        of items at the same position compares equal, otherwise ``False``.
    """
    pre_items, org_items = list(pre), list(org)
    # zip() stops at the shorter input, so without this check a strict prefix
    # would be reported as equal.
    if len(pre_items) != len(org_items):
        return False
    for x, y in zip(pre_items, org_items):
        if x != y:
            return False
    return True
# Per-sample report: the text, whether the predicted label vector equals the
# reference one, the predicted labels, and - on a mismatch - the real labels.
for i, (text, pre, org) in enumerate(zip(X_test, prediction, y_test)):
    is_match = compare2list(pre, org)
    print("---> Case %dth: " % (i))
    print("Text:", text)
    print(is_match)
    # inverse_transform expects a 2-D indicator array, hence the one-row wrap.
    print("Predicted: ", lb.inverse_transform(np.array([pre])))
    if not is_match:
        print("Real: ", lb.inverse_transform(np.array([org])))
    print("--------------")

---> Case 0th: 
Text: sysfootballplayer giữ vị trí gì
True
Predicted:  [('schema:hasPlayerProfile[schema:hasPosition]@football_player',)]
--------------
---> Case 1th: 
Text: sysfootballplayer đang đá vị trí nào
True
Predicted:  [('schema:hasPlayerProfile[schema:hasPosition]@football_player',)]
--------------
---> Case 2th: 
Text: sysfootballplayer đang thi đấu cho vị trí gì
True
Predicted:  [('schema:hasPlayerProfile[schema:hasPosition]@football_player',)]
--------------
---> Case 3th: 
Text: sysfootballplayer chơi ở đội bóng đá nào
True
Predicted:  [('schema:isMemberOf[schema:isInClub]@football_player',)]
--------------
---> Case 4th: 
Text: sysfootballplayer thi đấu cho câu lạc bộ bóng đá nào
True
Predicted:  [('schema:isMemberOf[schema:isInClub]@football_player',)]
--------------
---> Case 5th: 
Text: sysfootballplayer có bao nhiêu devfootballplayerhonour
True
Predicted:  [('schema:hasHonour[schema:honourTitle]@football_player',)]
--------------
---> Case 6th: 
Text: sysfootballpla

### Save model

In [24]:
# Make sure the output directory exists; writing into a missing directory
# would fail on a fresh checkout.
os.makedirs("models", exist_ok=True)
# NOTE: the pipeline contains a FeatureTransformer instance, a class defined
# in this notebook. Pickle stores classes by module and name, so the class
# must be defined under the same name before the file is loaded elsewhere.
joblib.dump(LogReg_pipeline, "models/football_multilabel.pkl", compress=1)
joblib.dump(lb, "models/football_labelencoding.pkl")

['models/football_labelencoding.pkl']