In [1]:
import os
import time
import traceback

from joblib import dump
from numpy import uint
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
#import pandas as pd

from nlp_frame import nlp_frame



In [2]:
import os

# Remember the directory the kernel started in. A bare ``os.chdir('..')`` is
# not idempotent: every re-run of this cell would climb one more level and
# silently break the relative paths used by the rest of the notebook.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()

# Work from the parent of the start directory; later cells open paths such as
# './info/...' and './corpus_words/...' relative to the current directory.
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [3]:
class nlp_model_training(nlp_frame):
    """Train and score a bag-of-words text classifier.

    The corpus is loaded through ``self.loadCorpus`` (not defined in this
    class; expected to come from ``nlp_frame``), vectorised with a
    ``CountVectorizer``, split into train / validation / test parts and fed to
    one of the estimators listed in ``_MODEL_CLASSES``.
    """

    # Maps the ``modelSelect`` code to the estimator class it instantiates.
    _MODEL_CLASSES = {
        "NB": MultinomialNB,
        "RF": RandomForestClassifier,
        "XG": XGBClassifier,
    }

    def __init__(self, vectParams: dict, segParams: dict, modelSelect: str, modelParams: dict) -> None:
        '''Store the configuration and build the vectorizer.

        "vectParams":dict, forwarded as keyword arguments to CountVectorizer
        {
            "analyzer":str "word" | "char" | "char_wb",
            "max_df":float [0.0, 1.0],
            "min_df":float [0.0, 1.0],
            "binary":bool
        }

        "segParams":dict, forwarded as keyword arguments to the corpus loader
        {
            "corpus":str,
            "HMM":bool,
            "use_paddle":bool
        }

        "modelSelect":str "NB" | "RF" | "XG",
        "modelParams":dict, forwarded as keyword arguments to the selected
        estimator. The keys are not validated here, so make sure they match
        the estimator chosen with modelSelect.
        {
            NB (MultinomialNB)
            "alpha":float, default=1.0. Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
            "fit_prior":bool, default=True. Whether to learn class prior probabilities or not. If false, a uniform prior will be used.

            RF (RandomForestClassifier)
            "n_estimators":int, default=100
            "criterion":str "gini" | "entropy", default="gini"
            "min_samples_split":int or float, default=2
            "min_samples_leaf":int or float, default=1
            "max_features":str "auto", "sqrt", "log2"
            "bootstrap":bool, default=True
            "oob_score":bool, default=False. Only available if bootstrap=True.
            "class_weight":default=None, {"balanced", "balanced_subsample"} [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]

            XG (XGBClassifier), https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn
            n_estimators: int, total number of boosting iterations, i.e. the number of trees. Default 100.
            max_depth: int, maximum depth of a tree. Default 6.
            booster: str, which booster to use: gbtree, gblinear or dart.
            learning_rate: float, learning rate. Default 0.3.
            gamma: float, penalty coefficient: minimum loss reduction required to split a node.
        }
        '''
        super().__init__()

        self.vect = CountVectorizer(**vectParams)

        self.segParams = segParams
        self.modelSelect = modelSelect
        self.modelParams = modelParams

    def __loadCorpusAndSplit(self, corpus: str, HMM: bool, use_paddle: bool):
        """Load the corpus, vectorise it and split it 60 / 20 / 20.

        Returns:
            ``(X_train, X_test, y_train, y_test, X_valid, y_valid)``; note
            that the test split comes before the validation split.
        """
        df = self.loadCorpus(corpus, HMM, use_paddle)

        # Bag-of-words transform, densified to a plain array.
        # NOTE(review): the vectorizer is fitted on the whole corpus before
        # the split, so its vocabulary and max_df/min_df statistics also see
        # the validation and test documents. Fitting on the training split
        # only would avoid that, at the cost of a different feature space.
        X = self.vect.fit_transform(df["X"]).toarray()
        y = df['y'].astype('category')

        print(f"\n{X.shape}\n{y.shape}")

        # No random_state on purpose: a fixed seed should only be set when
        # testing. Both splits are stratified on the label.
        X_rest, X_test, y_rest, y_test = \
            train_test_split(X, y, train_size=0.8, stratify=y)

        # 0.75 of the remaining 80 % -> 60 % train, 20 % validation.
        X_train, X_valid, y_train, y_valid = \
            train_test_split(X_rest, y_rest,
                             train_size=0.75, stratify=y_rest)

        return X_train, X_test, y_train, y_test, X_valid, y_valid

    @staticmethod
    def _score(model, X, y):
        """Return ``(accuracy, confusion_matrix)`` of ``model`` on ``(X, y)``."""
        y_pred = model.predict(X)
        return (metrics.accuracy_score(y, y_pred),
                metrics.confusion_matrix(y, y_pred))

    def training(self):
        """Fit the selected model and score it on all three splits.

        Returns:
            ``(model, accuracy_train, confusion_train, accuracy_valid,
            confusion_valid, accuracy_test, confusion_test)``.

        Raises:
            ValueError: if ``modelSelect`` is not a key of ``_MODEL_CLASSES``.
        """
        # Validate the model code before the corpus is loaded, so a typo
        # fails immediately with a clear message instead of surfacing later
        # as "'NoneType' object has no attribute 'fit'".
        model_cls = self._MODEL_CLASSES.get(self.modelSelect)
        if model_cls is None:
            raise ValueError(
                f"Unknown modelSelect {self.modelSelect!r}; "
                f"expected one of {sorted(self._MODEL_CLASSES)}")

        X_train, X_test, y_train, y_test, X_valid, y_valid = \
            self.__loadCorpusAndSplit(**self.segParams)

        print(f'train dataset shape:{X_train.shape} {y_train.shape}')
        print(f'vaild dataset shape:{X_valid.shape} {y_valid.shape}')
        print(f'test  dataset shape:{X_test.shape} {y_test.shape}')

        model = model_cls(**self.modelParams)
        model.fit(X_train, y_train)

        accuracy_score_train, confusion_matrix_train = \
            self._score(model, X_train, y_train)
        accuracy_score_valid, confusion_matrix_valid = \
            self._score(model, X_valid, y_valid)
        accuracy_score_test, confusion_matrix_test = \
            self._score(model, X_test, y_test)

        return (model, accuracy_score_train, confusion_matrix_train,
                accuracy_score_valid, confusion_matrix_valid,
                accuracy_score_test, confusion_matrix_test)


def ml_call(vectParams, segParams, modelSelect, modelParams, resultTimestamp, dumpModel):
    """Train one model, optionally persist it, and log parameters and scores.

    Args:
        vectParams, segParams, modelSelect, modelParams: forwarded unchanged
            to ``nlp_model_training``.
        resultTimestamp: string identifying this run; used in the dumped file
            names and as the header of both log entries.
        dumpModel: when truthy, the fitted model and the fitted vectorizer are
            written with joblib under ``nlpModel_<modelSelect>/``.

    Side effects:
        Appends to ``./info/<modelSelect>_parameters.txt`` and
        ``./info/<modelSelect>_modelScore.txt`` and prints both entries.
        The output directories are created when missing.
    """
    nmt = nlp_model_training(vectParams, segParams, modelSelect, modelParams)
    (model,
     accuracy_score_train, confusion_matrix_train,
     accuracy_score_vaild, confusion_matrix_vaild,
     accuracy_score_test, confusion_matrix_test) = nmt.training()

    if dumpModel:
        # Create the target directory first: a missing folder would otherwise
        # raise only after the (expensive) training has finished.
        model_dir = f'nlpModel_{modelSelect}'
        os.makedirs(model_dir, exist_ok=True)
        dump(
            model, f'{model_dir}/{resultTimestamp}.joblib')
        dump(
            nmt.vect, f'{model_dir}/vect_{resultTimestamp}.vect')

    # Same reasoning for the log directory shared by both report files.
    os.makedirs("./info", exist_ok=True)

    temp = f"\n{resultTimestamp}\n{vectParams}\n{segParams}\n{modelSelect}\n{modelParams}\n" +\
        "-------------------------------------------------------------\n"
    with open(f"./info/{modelSelect}_parameters.txt", mode="a", encoding="utf-8") as f:
        f.write(temp)
    print(temp)

    temp = f"\n{resultTimestamp}\n" +\
        f"accuracy score train:{accuracy_score_train:.3f}\nconfusion matrix train\n{confusion_matrix_train}\n" +\
        f"accuracy score vaild:{accuracy_score_vaild:.3f}\nconfusion matrix vaild\n{confusion_matrix_vaild}\n" +\
        f"accuracy score test:{accuracy_score_test:.3f}\nconfusion matrix test\n{confusion_matrix_test}\n" +\
        "-------------------------------------------------------------\n"

    with open(f"./info/{modelSelect}_modelScore.txt", 'a', encoding="utf-8") as f:
        f.write(temp)
    print(temp)


In [10]:
# Run configuration. It is defined outside the ``try`` so the error handler
# below can always reference it (inside the ``try`` an early failure would
# leave these names unbound, or stale from a previous kernel run).
resultTimestamp = f"{time.time()}"

vectParams = {
    "analyzer": "char_wb",
    "max_df": 0.8,
    "min_df": 0.0,
    "binary": False
}

segParams = {
    "corpus": "./corpus_words/corpus_new.xlsx",
    "HMM": True,
    "use_paddle": False
}

modelSelect = "XG"

# Hyper-parameters for the estimator chosen above (here: XGBClassifier).
modelParams = {
    "n_estimators": 100,
    "max_depth": 10,
    "booster": "gbtree",  # Specify which booster to use: gbtree, gblinear or dart.
    "learning_rate": 0.3,  # learning rate, default 0.3
    # NOTE(review): depending on the XGBoost version, gpu_id alone may not
    # enable GPU training (a GPU tree_method / device may also be required)
    # -- confirm against the installed version.
    "gpu_id": 0
}

try:
    ml_call(vectParams, segParams,
            modelSelect, modelParams, resultTimestamp, dumpModel=False)
except Exception as e:
    # Best-effort run: the failure is logged together with the configuration
    # and the traceback instead of stopping the notebook.
    temp = f'\n{e}\n{resultTimestamp}\n{vectParams}\n{segParams}\n{modelSelect}\n{modelParams}' \
        f'\n{traceback.format_exc()}'
    os.makedirs('./info', exist_ok=True)
    with open('./info/err.txt', 'a', encoding='utf-8') as f:
        f.write(
            temp)
    print(temp)


(2156, 1134)
(2156,)
train dataset shape:(1293, 1134) (1293,)
vaild dataset shape:(431, 1134) (431,)
test  dataset shape:(432, 1134) (432,)





1644656977.435214
{'analyzer': 'char_wb', 'max_df': 0.8, 'min_df': 0.0, 'binary': False}
{'corpus': './corpus_words/corpus_new.xlsx', 'HMM': True, 'use_paddle': False}
XG
{'n_estimators': 100, 'max_depth': 10, 'booster': 'gbtree', 'learning_rate': 0.3, 'gpu_id': 0}
-------------------------------------------------------------


1644656977.435214
accuracy score train:0.961
confusion matrix train
[[104   0   2   1   1]
 [  1  77   0   1   0]
 [  0   0 130   6   3]
 [  0   1   1 335  11]
 [  0   0   0  23 596]]
accuracy score vaild:0.626
confusion matrix vaild
[[ 15   6   6   2   7]
 [  5   5   8   3   5]
 [  6   4  12  11  13]
 [  3   3   4  69  37]
 [  4   2   6  26 169]]
accuracy score test:0.671
confusion matrix test
[[ 15   4   7   6   4]
 [  3   8   8   3   4]
 [  2   4  13  21   7]
 [  3   1   4  78  30]
 [  8   0   6  17 176]]
-------------------------------------------------------------

