In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import os
import ast
import nltk
import pickle
import math

In [2]:
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, chi2, SelectKBest, f_classif, mutual_info_classif

In [3]:
TEST_SIZE = 0.3
RANDOM_STATE = 42
SAMPLE_SIZE = 3000

TEXT_COL_NAME = 'text_split'
TEXT_TOKEN_COL_NAME = 'text_token_stop'
CLASS_COL_NAME = 'reduced_tags'

# DF_FILE_PATH = './data/no_stop.pkl'
DF_FILE_PATH = './data/no_stop_1000_30.pkl'

In [4]:
DT_SAVE_FILE_NAME = './Model/no_stop_1000_30_DT.bin'
DT_SELECT_SAVE_FILE_NAME = './Model/no_stop_1000_30_DT_selection.bin'
RF_SAVE_FILE_NAME = './Model/no_stop_1000_30_RF.bin'
RF_SELECT_SAVE_FILE_NAME = './Model/no_stop_1000_30_RF_selection.bin'
NN_SAVE_FILE_NAME = './Model/NN_model'

# Load Data

In [5]:
%%time
# df = pd.read_csv('./data/medium_articles.csv')
df = pd.read_pickle(DF_FILE_PATH)
df.head()

CPU times: user 495 ms, sys: 78.7 ms, total: 574 ms
Wall time: 573 ms


Unnamed: 0,title,text,url,authors,timestamp,tags,text_length,reduced_tags,text_token,text_token_stop,text_after
0,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",2326,[Health],"[You’ve, heard, of, him,, haven’t, you?, Phine...","[You’ve, heard, him,, haven’t, you?, Phineas, ...",234
1,Quora Overview,Making the most of Quora for content marketing...,https://medium.com/digital-marketing-lab/quora...,['Casey Botticello'],2020-09-04 18:30:41.246000+00:00,"['Productivity', 'Entrepreneurship', 'Writing'...",1887,"[Entrepreneurship, Writing, Startup]","[Making, the, most, of, Quora, for, content, m...","[Making, Quora, content, marketing, There, sev...",193
2,The Simple Formula For Becoming A Better Writer,You Need To Show Up Every Single Day. “The mos...,https://medium.com/swlh/the-simple-formula-for...,['Matt Lillywhite'],2020-06-26 13:31:34.246000+00:00,"['Creativity', 'Entrepreneurship', 'Blogging',...",1708,"[Entrepreneurship, Writing]","[You, Need, To, Show, Up, Every, Single, Day.,...","[You, Need, To, Show, Up, Every, Single, Day.,...",195
3,The Power of Sleep in Learning: Mind-Blowing S...,The Power of Sleep in Learning: Mind-Blowing S...,https://medium.com/superintelligence/the-power...,['John Von Neumann Ii'],2020-02-12 09:19:16.941000+00:00,"['Self Improvement', 'Productivity', 'Science'...",2201,"[Self Improvement, Health, Entrepreneurship]","[The, Power, of, Sleep, in, Learning:, Mind-Bl...","[The, Power, Sleep, Learning:, Mind-Blowing, S...",231
4,All the Love you do not see,All the Love you do not see On writing with he...,https://asingularstory.medium.com/all-the-love...,['A Singular Story'],2020-06-13 12:29:45.505000+00:00,"['Social Media', 'Future', 'Society', 'Writing...",1396,"[Writing, Mental Health]","[All, the, Love, you, do, not, see, On, writin...","[All, Love, see, On, writing, heart, hope, Pho...",150


In [6]:
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags,text_length,reduced_tags,text_token,text_token_stop,text_after
0,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",2326,[Health],"[You’ve, heard, of, him,, haven’t, you?, Phine...","[You’ve, heard, him,, haven’t, you?, Phineas, ...",234
1,Quora Overview,Making the most of Quora for content marketing...,https://medium.com/digital-marketing-lab/quora...,['Casey Botticello'],2020-09-04 18:30:41.246000+00:00,"['Productivity', 'Entrepreneurship', 'Writing'...",1887,"[Entrepreneurship, Writing, Startup]","[Making, the, most, of, Quora, for, content, m...","[Making, Quora, content, marketing, There, sev...",193
2,The Simple Formula For Becoming A Better Writer,You Need To Show Up Every Single Day. “The mos...,https://medium.com/swlh/the-simple-formula-for...,['Matt Lillywhite'],2020-06-26 13:31:34.246000+00:00,"['Creativity', 'Entrepreneurship', 'Blogging',...",1708,"[Entrepreneurship, Writing]","[You, Need, To, Show, Up, Every, Single, Day.,...","[You, Need, To, Show, Up, Every, Single, Day.,...",195
3,The Power of Sleep in Learning: Mind-Blowing S...,The Power of Sleep in Learning: Mind-Blowing S...,https://medium.com/superintelligence/the-power...,['John Von Neumann Ii'],2020-02-12 09:19:16.941000+00:00,"['Self Improvement', 'Productivity', 'Science'...",2201,"[Self Improvement, Health, Entrepreneurship]","[The, Power, of, Sleep, in, Learning:, Mind-Bl...","[The, Power, Sleep, Learning:, Mind-Blowing, S...",231
4,All the Love you do not see,All the Love you do not see On writing with he...,https://asingularstory.medium.com/all-the-love...,['A Singular Story'],2020-06-13 12:29:45.505000+00:00,"['Social Media', 'Future', 'Society', 'Writing...",1396,"[Writing, Mental Health]","[All, the, Love, you, do, not, see, On, writin...","[All, Love, see, On, writing, heart, hope, Pho...",150


In [7]:
print(df.shape)

(18118, 11)


In [8]:
%%time
df['text'] = df['text'].apply(lambda x: x.replace('\n\n', ' '))
df['text_token'] = df['text'].apply(lambda x: x.split(' '))
df['text_token']

CPU times: user 277 ms, sys: 48.5 ms, total: 325 ms
Wall time: 323 ms


0        [You’ve, heard, of, him,, haven’t, you?, Phine...
1        [Making, the, most, of, Quora, for, content, m...
2        [You, Need, To, Show, Up, Every, Single, Day.,...
3        [The, Power, of, Sleep, in, Learning:, Mind-Bl...
4        [All, the, Love, you, do, not, see, On, writin...
                               ...                        
18113    [My, theory, on, the, future, of, customer, se...
18114    [I, recently, downloaded, an, app, of, a, comp...
18115    [In, case, you, don’t, know,, now, you, can, u...
18116    [25, cent, chicken, wings., That’s, how, it, a...
18117    [Photo, by, Comeup, 2021, on, Comeup, 2021, Th...
Name: text_token, Length: 18118, dtype: object

In [9]:
tags = [tag for tag_list in df['reduced_tags'] for tag in tag_list ]
print(len(list(set(tags))))

30


# Decision Tree

## 沒有 selection

In [10]:
# from models.decision_tree import TokenDecisionTreeModel
class TokenDecisionTreeModel():
    def __init__(self, dt_args = dict(), tfidf_args = dict()):
        self.sklearn_model = tree.DecisionTreeClassifier(**dt_args)
        if "analyzer" not in tfidf_args.keys():
            tfidf_args["analyzer"] = lambda x : x
        if "decode_error" not in tfidf_args.keys():
            tfidf_args["decode_error"] = "ignore"
        self.enc = TfidfVectorizer(**tfidf_args)
        self.lenc = CountVectorizer(analyzer = lambda x : x, decode_error="ignore")
        
    def list2onehot(self, l, mode="train"):
        if mode=="train":
            self.enc.fit(l)
        return self.enc.transform(l).toarray()
    
    def class2onehot(self, lc, mode="train"):
        if mode=="train":
            self.lenc.fit(lc)
        return self.lenc.transform(lc).toarray()
        
    def train(self, X, Y):
        ## X : list of tokens (n,t)
        ## Y : list of class (n,c)
        ## return token
        X_array = self.list2onehot(X)
        print('finish list2onehot')
        Y_array = self.class2onehot(Y)
        print('finish class2onehot')
        self.sklearn_model.fit(X_array, Y_array)
        print('finish train')
        feature_importance_list = list(zip([tup[0] for tup in sorted(list(self.enc.vocabulary_.items()), key=lambda x : x[1])], self.sklearn_model.feature_importances_.tolist()))
        feature_importance_list = sorted(feature_importance_list, key = lambda x : -x[1])

        return feature_importance_list

    def _PRscore(self, y_true, pred):
        hit_matrix = np.zeros_like(pred)
        hit_matrix[np.where((pred == y_true) & (y_true > 0))] = 1
        
        tp = hit_matrix.sum(axis=1)
        pred_sum = pred.sum(axis=1)
        true_sum = y_true.sum(axis=1)
        pred_l = []
        recall_l = []
        for ix in range(tp.shape[0]):
            pred_score = (1.0 if true_sum[ix] == 0 else 0.0) if pred_sum[ix] == 0 else tp[ix]/pred_sum[ix]
            recall_score = (1.0 if pred_sum[ix] == 0 else 0.0) if true_sum[ix] == 0 else tp[ix]/true_sum[ix]
            pred_l.append(pred_score)
            recall_l.append(recall_score)

        return np.array(pred_l).mean(), np.array(recall_l).mean() 
    
    
    def predict(self, X, type="str"):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict(X_array)
        return self.lenc.inverse_transform(pred) if type=="str" else pred
    
    def predict_proba(self, X):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict_proba(X_array)
        return np.array([r[:,1] for r in pred]).T
    
    def score(self, X, Y, k=5):
        X_array = self.list2onehot(X, mode="test")
        Y_array = self.class2onehot(Y, mode="test")
        pred = self.predict(X, type="num")
        
        acc = self.sklearn_model.score(X_array, Y_array)
#         precision = precision_score(Y_array, pred, average='micro')
#         recall = recall_score(Y_array, pred, average='micro')
#         pr_metrix = precision_recall_fscore_support(Y_array, pred, average=None)
        precision, recall = self._PRscore(Y_array, pred)
        
        pred_proba = self.predict_proba(X)
        topk_class = np.argsort(pred_proba, axis=1)[:,-k:]
        topk_pred = np.zeros_like(pred)
        for i in range(pred.shape[0]):
            topk_pred[i, topk_class[i]] = 1
        
        topk = recall_score(Y_array, topk_pred, average='micro')
        return acc, recall, precision, topk
    def get_depth(self):
        return self.sklearn_model.get_depth()
    def get_leaves(self):
        return self.sklearn_model.get_n_leaves()

In [11]:
# from models.decision_tree import TokenDecisionTreeModel
#     model
#     self.enc = TfidfVectorizer
#     self.lenc = CountVectorizer

def score_tokenize(df, token_col_name, class_col_name, print_mes = False, max_depth =None, min_samples_split = 2, test_size = 0.3, random_state = 42, threshold = 0.05):
    
    train, test = train_test_split(df, test_size = test_size, random_state= random_state)
    
    train_token_list = list(train[token_col_name])
    train_class_list = list(train[class_col_name])
    test_token_list = list(test[token_col_name])
    test_class_list = list(test[class_col_name])
    
    
    print(f'training dataset size: {len(train_token_list)}')
    print(f'testing dataset size: {len(test_token_list)}')
    model = TokenDecisionTreeModel(dt_args={"random_state":42,"max_depth": max_depth, "min_samples_split":min_samples_split}, tfidf_args={"stop_words":"english"})
    model.list2onehot(train_token_list)
    model.class2onehot(train_class_list)
    
    print(f'x_columns_size: {len(model.enc.vocabulary_)}')
    print(f'y_columns_size: {len(model.lenc.vocabulary_)}')
    
    # print(model.lenc.vocabulary_)
    
    feature_importance_list = model.train(train_token_list, train_class_list)
    print(f'feature_importance_list: {len(feature_importance_list)}')
    print(f'feature_importance_list: {feature_importance_list[:30]}')
    
    # predict
    predict_return_matrix_num = model.predict(test_token_list ,type='num')
    predict_return_matrix = model.predict(test_token_list)

    # score
    acc, recall, precision, topk = model.score(test_token_list, test_class_list)
    score_df = pd.DataFrame([{'acc': acc, 'recall': recall, 'precision': precision, 'topk':topk}])
    score_df.index = [class_col_name+'one_DT']

    ## 計算空的
    test_class_list_class2onehot = model.class2onehot(test_class_list)
    print(f'test_class_list_class2onehot {len(test_class_list_class2onehot)}')
    empty = 0
    for i in test_class_list_class2onehot:
        if sum(i) == 0:
            empty+=1
    print(f'empty {empty}')
    
    # 印出一些東東
    if print_mes:   
        acutual_average = model.class2onehot(df[class_col_name]).sum(axis=1).mean()
        acutual_distribution = model.class2onehot(df[class_col_name]).sum(axis=0)
        print(f'原始資料： {class_col_name} 數目：{acutual_average}')
        print(f'原始資料：每個 {class_col_name} 被使用之數目：{acutual_distribution}')
        print(f'實際{class_col_name}數 {len(acutual_distribution)}')
        
        predict_average = predict_return_matrix_num.sum(axis=1).mean()
        predict_distribution = predict_return_matrix_num.sum(axis=0)
        print(f'預測中：平均 {class_col_name} 數目：{predict_average}')
        print(f'預測中：每個 {class_col_name} 被預測之數目：{predict_distribution}')
        print(f'預測{class_col_name}數 {len(predict_distribution)}')
        
        print(f'{class_col_name} model 深度: {model.get_depth()}, 葉子數：{model.get_leaves()}')
        
        error_cnt = 0
        for i in range(len(test_class_list)):
            if len(test_class_list[i]) == len(predict_return_matrix[i]):
                if test_class_list[i].sort() == predict_return_matrix[i].sort():
                    continue
            error_cnt+=1
            print(f'predict: {predict_return_matrix[i]} , true: {test_class_list[i]}')
        print(error_cnt)
    
    return model, score_df.T, predict_return_matrix, predict_return_matrix_num

In [12]:
%%time
model, score_df, predict_return_matrix, predict_return_matrix_num \
            = score_tokenize(df, TEXT_TOKEN_COL_NAME, CLASS_COL_NAME,\
                             test_size = TEST_SIZE, random_state = RANDOM_STATE, \
                             print_mes= True) 
with open(DT_SAVE_FILE_NAME, 'wb') as files:
    pickle.dump(model.sklearn_model, files)
score_df

training dataset size: 12682
testing dataset size: 5436


  "The parameter 'stop_words' will not be used"


x_columns_size: 211682
y_columns_size: 30
finish list2onehot
finish class2onehot
finish train
feature_importance_list: 211682
feature_importance_list: [('blockchain', 0.029007836133424), ('data', 0.019014136556009965), ('crypto', 0.016972737974301034), ('I', 0.015473478629460913), ('code', 0.013344058688495615), ('Bitcoin', 0.010408757028165056), ('writing', 0.009627214666231248), ('design', 0.008812148046660914), ('Python', 0.007261992055850629), ('love', 0.007174021823628529), ('token', 0.007016348693556036), ('The', 0.006760981779529337), ('COVID-19', 0.005036468645405719), ('business', 0.004817980742460194), ('one', 0.004718196171537458), ('life', 0.004663008528631662), ('=', 0.004655323426347489), ('model', 0.00453845908428984), ('cryptocurrency', 0.004298598272092251), ('This', 0.004268948555483198), ('people', 0.004232424487762767), ('In', 0.004201025434771789), ('marketing', 0.0040454783064501005), ('We', 0.004006113545512389), ('It', 0.0038040783431744093), ('new', 0.003725861

predict: ['Life Lessons'] , true: ['Self', 'Poetry']
predict: ['Life Lessons'] , true: ['Life Lessons', 'Mental Health', 'Self']
predict: ['Health' 'Self' 'Self Improvement'] , true: ['Life']
predict: ['Design' 'JavaScript' 'Programming'] , true: ['Startup']
predict: ['Love' 'Relationships'] , true: ['Love']
predict: ['Data Science' 'Machine Learning'] , true: ['Software Development']
predict: ['Data Science' 'Machine Learning'] , true: ['Machine Learning']
predict: ['Data Science'] , true: ['Technology', 'Data Science']
predict: ['Bitcoin' 'Cryptocurrency'] , true: ['Blockchain', 'Cryptocurrency', 'Bitcoin']
predict: ['Poetry'] , true: ['Education', 'Life', 'Mental Health', 'Writing']
predict: ['Writing'] , true: ['Culture', 'Technology']
predict: ['Culture'] , true: ['Artificial Intelligence', 'Machine Learning']
predict: ['Love' 'Relationships'] , true: ['Poetry']
predict: ['Writing'] , true: ['Writing', 'Life', 'Life Lessons']
predict: ['Blockchain'] , true: ['Cryptocurrency', 'Bit

Unnamed: 0,reduced_tagsone_DT
acc,0.146799
recall,0.311197
precision,0.326646
topk,0.418373


## 對全部類別作 selection

In [10]:
from sklearn import tree
# from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import precision_score, recall_score, precision_recall_fscore_support

import numpy as np

# from models.decision_tree import TokenDecisionTreeModel
class TokenDecisionTreeModel_withChiSquared():
    def __init__(self, dt_args = dict(), tfidf_args = dict()):
        self.sklearn_model = tree.DecisionTreeClassifier(**dt_args)
        if "analyzer" not in tfidf_args.keys():
            tfidf_args["analyzer"] = lambda x : x
        if "decode_error" not in tfidf_args.keys():
            tfidf_args["decode_error"] = "ignore"
        self.enc = TfidfVectorizer(**tfidf_args)
        self.lenc = CountVectorizer(analyzer = lambda x : x, decode_error="ignore")
        self.drop_keys = []
        
        
    def list2onehot(self, l, mode="train"):
        if mode=="train":
            self.enc.fit(l)
        return self.enc.transform(l).toarray()
    
    def class2onehot(self, lc, mode="train"):
        if mode=="train":
            self.lenc.fit(lc)
        return self.lenc.transform(lc).toarray()
        
    def train(self, X, Y, k=10000, p=15, use='p'):
        ## X : list of tokens (n,t)
        ## Y : list of class (n,c)
        ## return token
        X_array = self.list2onehot(X)
        Y_array = self.class2onehot(Y)
    
   
        ######## selection ####################
        # ValueError: y should be a 1d array, got an array of shape (177, 8) instead.
        if use == 'k':
            self.selector = SelectKBest(chi2, k=k)           
        else:
            self.selector = SelectPercentile(chi2, percentile=p)
        print('Original number of features:', X_array.shape)
        X_array = self.selector.fit_transform(X_array, Y_array)
        print('Reduced number of features:', X_array.shape)
        #######################################
            
        self.sklearn_model.fit(X_array, Y_array)
        feature_importance_list = list(zip([tup[0] for tup in sorted(list(self.enc.vocabulary_.items()), key=lambda x : x[1])], self.sklearn_model.feature_importances_.tolist()))
        feature_importance_list = sorted(feature_importance_list, key = lambda x : -x[1])
        return feature_importance_list
    
    def predict(self, X, type="str"):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        ##########selection################
        X_array = self.selector.transform(X_array) 
        ###########selection#################
        pred = self.sklearn_model.predict(X_array)
        return self.lenc.inverse_transform(pred) if type=="str" else pred
    
    def predict_proba(self, X):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        ##########selection################
        X_array = self.selector.transform(X_array) 
        ###########selection#################
        pred = self.sklearn_model.predict_proba(X_array)
        return np.array([r[:,1] for r in pred]).T
    
    def _PRscore(self, y_true, pred):
        hit_matrix = np.zeros_like(pred)
        hit_matrix[np.where((pred == y_true) & (y_true > 0))] = 1
        
        tp = hit_matrix.sum(axis=1)
        pred_sum = pred.sum(axis=1)
        true_sum = y_true.sum(axis=1)
        pred_l = []
        recall_l = []
        for ix in range(tp.shape[0]):
            pred_score = (1.0 if true_sum[ix] == 0 else 0.0) if pred_sum[ix] == 0 else tp[ix]/pred_sum[ix]
            recall_score = (1.0 if pred_sum[ix] == 0 else 0.0) if true_sum[ix] == 0 else tp[ix]/true_sum[ix]
            pred_l.append(pred_score)
            recall_l.append(recall_score)

        return np.array(pred_l).mean(), np.array(recall_l).mean() 
    
    def score(self, X, Y, k=5):
        X_array = self.list2onehot(X, mode="test")
        Y_array = self.class2onehot(Y, mode="test")
        ##########selection################
        X_array = self.selector.transform(X_array) 
        ###########selection#################
        pred = self.predict(X, type="num")
        
        acc = self.sklearn_model.score(X_array, Y_array)
        # 'numpy.ndarray'
        print(f'X_array_len: {len(X_array)}')
        print(X_array[0])
        print(len(X_array[0]))
        print(f'Y_array_len: {len(Y_array)}')
        print(Y_array[0])
        print(len(Y_array[0]))
    
        precision, recall = self._PRscore(Y_array, pred)
        
        pred_proba = self.predict_proba(X)
        topk_class = np.argsort(pred_proba, axis=1)[:,-k:]
        topk_pred = np.zeros_like(pred)
        for i in range(pred.shape[0]):
            if pred[i, :].sum() > 0:
                topk_pred[i, topk_class[i]] = 1
        
        _, topk = self._PRscore(Y_array, topk_pred)
        return acc, recall, precision, topk
    def get_depth(self):
        return self.sklearn_model.get_depth()
    def get_leaves(self):
        return self.sklearn_model.get_n_leaves()

In [11]:
# from models.decision_tree import TokenDecisionTreeModel
#     model
#     self.enc = TfidfVectorizer
#     self.lenc = CountVectorizer

def score_tokenize_withChiSquared(df , token_col_name, class_col_name, print_mes = False, max_depth =None, min_samples_split = 2, test_size = 0.3, random_state = 42, threshold = 0.05):
    
    train, test = train_test_split(df, test_size = test_size, random_state= random_state)
    
    train_token_list = list(train[token_col_name])
    train_class_list = list(train[class_col_name])
    test_token_list = list(test[token_col_name])
    test_class_list = list(test[class_col_name])

    print(f'training dataset size: {len(train_token_list)}')
    print(f'testing dataset size: {len(test_token_list)}')
    model = TokenDecisionTreeModel_withChiSquared(dt_args={"random_state":42,"max_depth": max_depth, "min_samples_split":min_samples_split}, tfidf_args={"stop_words":"english"})
    

    model.list2onehot(train_token_list)
    model.class2onehot(train_class_list)
    
    print(f'x_columns_size: {len(model.enc.vocabulary_)}')
    print(f'y_columns_size: {len(model.lenc.vocabulary_)}')
    
    # print(model.lenc.vocabulary_)
    
    feature_importance_list = model.train(train_token_list, train_class_list)
    print(f'feature_importance_list: {len(feature_importance_list)}')
    print(f'feature_importance_list: {feature_importance_list[:30]}')
    
    # predict
    predict_return_matrix_num = model.predict(test_token_list ,type='num')
    predict_return_matrix = model.predict(test_token_list)

    # score
    acc, recall, precision, topk = model.score(test_token_list, test_class_list)
    score_df = pd.DataFrame([{'acc': acc, 'recall': recall, 'precision': precision, 'topk':topk}])

    # 印出一些東東
    if print_mes:   
        acutual_average = model.class2onehot(df[class_col_name]).sum(axis=1).mean()
        acutual_distribution = model.class2onehot(df[class_col_name]).sum(axis=0)
        print(f'原始資料： {class_col_name} 數目：{acutual_average}')
        print(f'原始資料：每個 {class_col_name} 被使用之數目：{acutual_distribution}')
        print(f'實際{class_col_name}數 {len(acutual_distribution)}')
        
        predict_average = predict_return_matrix_num.sum(axis=1).mean()
        predict_distribution = predict_return_matrix_num.sum(axis=0)
        print(f'預測中：平均 {class_col_name} 數目：{predict_average}')
        print(f'預測中：每個 {class_col_name} 被預測之數目：{predict_distribution}')
        print(f'預測{class_col_name}數 {len(predict_distribution)}')
        
        print(f'{class_col_name} model 深度: {model.get_depth()}, 葉子數：{model.get_leaves()}')
        
        error_cnt = 0
        for i in range(len(test_class_list)):
            if len(test_class_list[i]) == len(predict_return_matrix[i]):
                if test_class_list[i].sort() == predict_return_matrix[i].sort():
                    continue
            error_cnt+=1
            print(f'predict: {predict_return_matrix[i]} , true: {test_class_list[i]}')
        print(error_cnt)
    
    return model, score_df.T, predict_return_matrix, predict_return_matrix_num

In [12]:
%%time
DT_SELECT_SAVE_FILE_NAME='./Model/no_stop_1000_30_p_15_DT_selection.bin'
model_selection, score_df_selection, predict_return_matrix, predict_return_matrix_num \
            = score_tokenize_withChiSquared(df, TEXT_TOKEN_COL_NAME, CLASS_COL_NAME,\
                             test_size = TEST_SIZE, random_state = RANDOM_STATE, \
                             print_mes= False) 
with open(DT_SELECT_SAVE_FILE_NAME, 'wb') as files:
    pickle.dump(model_selection.sklearn_model, files)
score_df_selection

training dataset size: 14494
testing dataset size: 3624


  "The parameter 'stop_words' will not be used"


x_columns_size: 230487
y_columns_size: 30
Original number of features: (14494, 230487)
Reduced number of features: (14494, 34573)
feature_importance_list: 34573
feature_importance_list: [('2.12,', 0.03013062985920991), ('4,500mAh', 0.019147398665316846), ('37.', 0.018037071123108185), ('(PPE)', 0.014016596454025277), ('26th)', 0.012359464998430053), ('Cesar', 0.011141169822933486), ('$ARGON).', 0.01081229006419966), ('470,000', 0.008066725671395547), ('CHANCE', 0.0073178828411806646), ('Airdrop,', 0.006712557054889796), ('Alie', 0.006618222864364567), ('(“Look', 0.006349698774315565), ('2019,report', 0.005711989458745443), ('Anker', 0.005585690361118194), ('AspNetCoreAppInsightsSeries-local', 0.005576952540199506), ('(Side', 0.005558307751788679), ("'CAT-2':", 0.004724318252810196), ('(Reference:', 0.004712434717491588), ('<meta', 0.00467620058964388), ('15:1.', 0.004583343084707127), ('0.2', 0.004517897531872837), ('378.', 0.004516380685528193), ('Alto', 0.004486434510230299), ('#thew

Unnamed: 0,0
acc,0.149283
recall,0.315531
precision,0.332575
topk,0.426131


## 分別對各類別 select 再合併重要 feature

In [None]:
# 計算量過大，機器無法負荷，只能使用小的資料集觀察

In [14]:
# from models.decision_tree import TokenDecisionTreeModel

class TokenDecisionTreeModel_withSelection():
    def __init__(self, dt_args = dict(), tfidf_args = dict()):
        self.sklearn_model = tree.DecisionTreeClassifier(**dt_args)
        
        if "analyzer" not in tfidf_args.keys():
            tfidf_args["analyzer"] = lambda x : x
        if "decode_error" not in tfidf_args.keys():
            tfidf_args["decode_error"] = "ignore"
        
        self.enc = TfidfVectorizer(**tfidf_args)
        self.lenc = CountVectorizer(analyzer = lambda x : x, decode_error="ignore")
        self.drop_keys = []
        
    def list2onehot(self, l, mode="train"):
        if mode=="train":
            self.enc.fit(l)
        return self.enc.transform(l).toarray()
    
    def class2onehot(self, lc, mode="train"):
        if mode=="train":
            self.lenc.fit(lc)
        return self.lenc.transform(lc).toarray()
        
    def train(self, X, Y):
        ## X : list of tokens (n,t)
        ## Y : list of class (n,c)
        ## return token
        X_array = self.list2onehot(X)
        Y_array = self.class2onehot(Y)
        
        ######## selection ####################
        threshold = 0.05
        X_array_df = pd.DataFrame(X_array)
        Y_array_df = pd.DataFrame(Y_array)
        
        selected_features = [] 
        selector = SelectKBest(chi2, k='all')
 
        for i in Y_array_df.keys():
            selector = SelectKBest(chi2, k='all')
            selector.fit(X_array, Y_array_df[i])
            selected_features.append(list(selector.scores_))
    
        # MeanCS 
        selected_features = np.mean(selected_features, axis=0) > threshold
        # MaxCS
#        selected_features = np.max(selected_features, axis=0) > threshold
           
        for i in range(X_array_df.shape[1]):
            if selected_features[i] == False:
                self.drop_keys.append(i)
        print(X_array_df.shape)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        print(X_array_df.shape) 
        X_array = X_array_df.values.tolist()
        
        ######## selection ###################
        
        self.sklearn_model.fit(X_array, Y_array)
        feature_importance_list = list(zip([tup[0] for tup in sorted(list(self.enc.vocabulary_.items()), key=lambda x : x[1])], self.sklearn_model.feature_importances_.tolist()))
        feature_importance_list = sorted(feature_importance_list, key = lambda x : -x[1])
        return feature_importance_list
    
    def predict(self, X, type="str"):
        X_array = self.list2onehot(X, mode="test")
        ##########  selection  ################
        X_array_df = pd.DataFrame(X_array)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        X_array = X_array_df.values.tolist()
        ###########   selection  #################
        
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict(X_array)
        return self.lenc.inverse_transform(pred) if type=="str" else pred
    
    def predict_proba(self, X):
        X_array = self.list2onehot(X, mode="test")
        ##########   selection   ################
        X_array_df = pd.DataFrame(X_array)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        X_array = X_array_df.values.tolist()
        ###########   selection  #################
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict_proba(X_array)
        return np.array([r[:,1] for r in pred]).T
    
    def _PRscore(self, y_true, pred):
        hit_matrix = np.zeros_like(pred)
        hit_matrix[np.where((pred == y_true) & (y_true > 0))] = 1
        
        tp = hit_matrix.sum(axis=1)
        pred_sum = pred.sum(axis=1)
        true_sum = y_true.sum(axis=1)
        pred_l = []
        recall_l = []
        for ix in range(tp.shape[0]):
            pred_score = (1.0 if true_sum[ix] == 0 else 0.0) if pred_sum[ix] == 0 else tp[ix]/pred_sum[ix]
            recall_score = (1.0 if pred_sum[ix] == 0 else 0.0) if true_sum[ix] == 0 else tp[ix]/true_sum[ix]
            pred_l.append(pred_score)
            recall_l.append(recall_score)

        return np.array(pred_l).mean(), np.array(recall_l).mean()     
    def score(self, X, Y, k=5):
        X_array = self.list2onehot(X, mode="test")
        ##########  selection  ################
        X_array_df = pd.DataFrame(X_array)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        X_array = X_array_df.values.tolist()
        ###########  selection  #################
        
        Y_array = self.class2onehot(Y, mode="test")
        pred = self.predict(X, type="num")
        
        acc = self.sklearn_model.score(X_array, Y_array)
#         precision = precision_score(Y_array, pred, average='micro')
#         recall = recall_score(Y_array, pred, average='micro')
#         pr_metrix = precision_recall_fscore_support(Y_array, pred, average=None)

        precision, recall = self._PRscore(Y_array, pred)
        
        pred_proba = self.predict_proba(X)
        topk_class = np.argsort(pred_proba, axis=1)[:,-k:]
        topk_pred = np.zeros_like(pred)
        
        for i in range(pred.shape[0]):
            topk_pred[i, topk_class[i]] = 1
        
        topk = recall_score(Y_array, topk_pred, average='micro')
        return acc, recall, precision, pr_metrix, topk
    def get_depth(self):
        return self.sklearn_model.get_depth()
    def get_leaves(self):
        return self.sklearn_model.get_n_leaves()

In [15]:
# from models.decision_tree import TokenDecisionTreeModel
#     model
#     self.enc = TfidfVectorizer
#     self.lenc = CountVectorizer

def score_tokenize_withSelection(df , token_col_name, class_col_name, print_mes = False, max_depth =None, min_samples_split = 2, test_size = 0.3, random_state = 42, threshold = 0.05):
    
    train, test = train_test_split(df, test_size = test_size, random_state= random_state)
    
    train_token_list = list(train[token_col_name])
    train_class_list = list(train[class_col_name])
    test_token_list = list(test[token_col_name])
    test_class_list = list(test[class_col_name])

    print(f'training dataset size: {len(train_token_list)}')
    print(f'testing dataset size: {len(test_token_list)}')
    model = TokenDecisionTreeModel_withSelection(dt_args={"random_state":42,"max_depth": max_depth, "min_samples_split":min_samples_split}, tfidf_args={"stop_words":"english"})
    

    model.list2onehot(train_token_list)
    model.class2onehot(train_class_list)
    
    print(f'x_columns_size: {len(model.enc.vocabulary_)}')
    print(f'y_columns_size: {len(model.lenc.vocabulary_)}')
    
    # print(model.lenc.vocabulary_)
    
    feature_importance_list = model.train(train_token_list, train_class_list)
    print(f'feature_importance_list: {len(feature_importance_list)}')
    print(f'feature_importance_list: {feature_importance_list[:30]}')
    
    # predict
    predict_return_matrix_num = model.predict(test_token_list ,type='num')
    predict_return_matrix = model.predict(test_token_list)

    # score
    acc, recall, precision, pr_metrix, topk = model.score(test_token_list, test_class_list)
    score_df = pd.DataFrame([{'acc': acc, 'recall': recall, 'precision': precision, 'topk':topk}])

    # 印出一些東東
    if print_mes:   
        acutual_average = model.class2onehot(df[class_col_name]).sum(axis=1).mean()
        acutual_distribution = model.class2onehot(df[class_col_name]).sum(axis=0)
        print(f'原始資料： {class_col_name} 數目：{acutual_average}')
        print(f'原始資料：每個 {class_col_name} 被使用之數目：{acutual_distribution}')
        print(f'實際{class_col_name}數 {len(acutual_distribution)}')
        
        predict_average = predict_return_matrix_num.sum(axis=1).mean()
        predict_distribution = predict_return_matrix_num.sum(axis=0)
        print(f'預測中：平均 {class_col_name} 數目：{predict_average}')
        print(f'預測中：每個 {class_col_name} 被預測之數目：{predict_distribution}')
        print(f'預測{class_col_name}數 {len(predict_distribution)}')
        
        print(f'{class_col_name} model 深度: {model.get_depth()}, 葉子數：{model.get_leaves()}')
        
        error_cnt = 0
        for i in range(len(test_class_list)):
            if len(test_class_list[i]) == len(predict_return_matrix[i]):
                if test_class_list[i].sort() == predict_return_matrix[i].sort():
                    continue
            error_cnt+=1
            print(f'predict: {predict_return_matrix[i]} , true: {test_class_list[i]}')
        print(error_cnt)
    
    return model, score_df.T, pr_metrix, predict_return_matrix, predict_return_matrix_num

In [None]:
%%time
model_selection, score_df_selection, pr_metrix, predict_return_matrix, predict_return_matrix_num \
            = score_tokenize_withSelection(df, TEXT_TOKEN_COL_NAME, CLASS_COL_NAME,\
                             test_size = TEST_SIZE, random_state = RANDOM_STATE, \
                             print_mes= True) 
with open(DT_SELECT_SAVE_FILE_NAME, 'wb') as files:
    pickle.dump(model_selection.sklearn_model, files)
score_df_selection

training dataset size: 14494
testing dataset size: 3624


  "The parameter 'stop_words' will not be used"


x_columns_size: 230487
y_columns_size: 30


In [50]:
DT_SELECT_SAVE_FILE_NAME

'./Model/no_stop_1000_30_DT_selection.bin'

In [None]:
print(SAMPLE_SIZE)
score_df_selection


# Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

##  沒 selection

In [15]:
from sklearn.ensemble import RandomForestClassifier

class RandomForestModel():
    def __init__(self, dt_args = dict(), tfidf_args = dict()):
        self.sklearn_model = RandomForestClassifier(**dt_args)
        if "analyzer" not in tfidf_args.keys():
            tfidf_args["analyzer"] = lambda x : x
        if "decode_error" not in tfidf_args.keys():
            tfidf_args["decode_error"] = "ignore"
        self.enc = TfidfVectorizer(**tfidf_args)
        self.lenc = CountVectorizer(analyzer = lambda x : x, decode_error="ignore")
        
    def list2onehot(self, l, mode="train"):
        if mode=="train":
            self.enc.fit(l)
        return self.enc.transform(l).toarray()
    
    def class2onehot(self, lc, mode="train"):
        if mode=="train":
            self.lenc.fit(lc)
        return self.lenc.transform(lc).toarray()
        
    def train(self, X, Y):
        ## X : list of tokens (n,t)
        ## Y : list of class (n,c)
        ## return token
        X_array = self.list2onehot(X)
        Y_array = self.class2onehot(Y)
        self.sklearn_model.fit(X_array, Y_array)
        feature_importance_list = list(zip([tup[0] for tup in sorted(list(self.enc.vocabulary_.items()), key=lambda x : x[1])], self.sklearn_model.feature_importances_.tolist()))
        feature_importance_list = sorted(feature_importance_list, key = lambda x : -x[1])
        return feature_importance_list
    
    def predict(self, X, type="str"):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict(X_array)
        return self.lenc.inverse_transform(pred) if type=="str" else pred
    
    def predict_proba(self, X):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict_proba(X_array)
        return np.array([r[:,1] for r in pred]).T
    
    def _PRscore(self, y_true, pred):
        hit_matrix = np.zeros_like(pred)
        hit_matrix[np.where((pred == y_true) & (y_true > 0))] = 1
        
        tp = hit_matrix.sum(axis=1)
        pred_sum = pred.sum(axis=1)
        true_sum = y_true.sum(axis=1)
        pred_l = []
        recall_l = []
        for ix in range(tp.shape[0]):
            pred_score = (1.0 if true_sum[ix] == 0 else 0.0) if pred_sum[ix] == 0 else tp[ix]/pred_sum[ix]
            recall_score = (1.0 if pred_sum[ix] == 0 else 0.0) if true_sum[ix] == 0 else tp[ix]/true_sum[ix]
            pred_l.append(pred_score)
            recall_l.append(recall_score)

        return np.array(pred_l).mean(), np.array(recall_l).mean() 
    
    def score(self, X, Y, k=5):
        X_array = self.list2onehot(X, mode="test")
        Y_array = self.class2onehot(Y, mode="test")
        pred = self.predict(X, type="num")
        
        acc = self.sklearn_model.score(X_array, Y_array)
#         precision = precision_score(Y_array, pred, average='micro')
#         recall = recall_score(Y_array, pred, average='micro')
#         pr_metrix = precision_recall_fscore_support(Y_array, pred, average=None)
        
        pred_proba = self.predict_proba(X)
        topk_class = np.argsort(pred_proba, axis=1)[:,-k:]
        topk_pred = np.zeros_like(pred)
        for i in range(pred.shape[0]):
            topk_pred[i, topk_class[i]] = 1
        
        topk = recall_score(Y_array, topk_pred, average='micro')
        return acc, recall, precision, topk
    
    def get_depth(self):
        return self.sklearn_model.get_depth()
    def get_leaves(self):
        return self.sklearn_model.get_n_leaves()

In [16]:
# from models.decision_tree import TokenDecisionTreeModel
#     model
#     self.enc = TfidfVectorizer
#     self.lenc = CountVectorizer

def score_tokenize_random_forest(df , token_col_name, class_col_name, print_mes = False, criterion = 'log_loss',max_depth =None, min_samples_split = 2, test_size = 0.3, random_state = 42, threshold = 0.05):
    
    train, test = train_test_split(df, test_size = test_size, random_state= random_state)
    
    train_token_list = list(train[token_col_name])
    train_class_list = list(train[class_col_name])
    test_token_list = list(test[token_col_name])
    test_class_list = list(test[class_col_name])

    print(f'training dataset size: {len(train_token_list)}')
    print(f'testing dataset size: {len(test_token_list)}')
    model = RandomForestModel(dt_args={"random_state":42, "criterion": criterion})
    
    model.list2onehot(train_token_list)
    model.class2onehot(train_class_list)
    
    print(f'x_columns_size: {len(model.enc.vocabulary_)}')
    print(f'y_columns_size: {len(model.lenc.vocabulary_)}')
    
    # print(model.lenc.vocabulary_)
    
    feature_importance_list = model.train(train_token_list, train_class_list)
    print(f'feature_importance_list: {len(feature_importance_list)}')
    print(f'feature_importance_list: {feature_importance_list[:30]}')
    
    # predict
    predict_return_matrix_num = model.predict(test_token_list ,type='num')
    predict_return_matrix = model.predict(test_token_list)

    # score
    acc, recall, precision, topk = model.score(test_token_list, test_class_list)
    score_df = pd.DataFrame([{'acc': acc, 'recall': recall, 'precision': precision, 'topk':topk}])

    # 印出一些東東
    if print_mes:   
        acutual_average = model.class2onehot(df[class_col_name]).sum(axis=1).mean()
        acutual_distribution = model.class2onehot(df[class_col_name]).sum(axis=0)
        print(f'原始資料： {class_col_name} 數目：{acutual_average}')
        print(f'原始資料：每個 {class_col_name} 被使用之數目：{acutual_distribution}')
        print(f'實際{class_col_name}數 {len(acutual_distribution)}')
        
        predict_average = predict_return_matrix_num.sum(axis=1).mean()
        predict_distribution = predict_return_matrix_num.sum(axis=0)
        print(f'預測中：平均 {class_col_name} 數目：{predict_average}')
        print(f'預測中：每個 {class_col_name} 被預測之數目：{predict_distribution}')
        print(f'預測{class_col_name}數 {len(predict_distribution)}')
    
    return model, score_df.T, predict_return_matrix, predict_return_matrix_num

In [13]:
%%time
model, score_df, predict_return_matrix, predict_return_matrix_num \
            = score_tokenize_random_forest(df, TEXT_TOKEN_COL_NAME, CLASS_COL_NAME,\
                             test_size = TEST_SIZE, random_state = RANDOM_STATE, \
                             print_mes= True) 
with open(RF_SAVE_FILE_NAME, 'wb') as files:
    pickle.dump(model.sklearn_model, files)
score_df

## 對全部類別作 selection

In [14]:
from sklearn import tree
# from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import precision_score, recall_score, precision_recall_fscore_support

import numpy as np

# from models.decision_tree import TokenDecisionTreeModel
class RandomForestModel_withChiSquared():
    def __init__(self, dt_args = dict(), tfidf_args = dict()):
        self.sklearn_model = RandomForestClassifier(**dt_args)
        if "analyzer" not in tfidf_args.keys():
            tfidf_args["analyzer"] = lambda x : x
        if "decode_error" not in tfidf_args.keys():
            tfidf_args["decode_error"] = "ignore"
        self.enc = TfidfVectorizer(**tfidf_args)
        self.lenc = CountVectorizer(analyzer = lambda x : x, decode_error="ignore")
        self.drop_keys = []
        
        
    def list2onehot(self, l, mode="train"):
        if mode=="train":
            self.enc.fit(l)
        return self.enc.transform(l).toarray()
    
    def class2onehot(self, lc, mode="train"):
        if mode=="train":
            self.lenc.fit(lc)
        return self.lenc.transform(lc).toarray()
        
    def train(self, X, Y, k=10000, p=10, use='p'):
        ## X : list of tokens (n,t)
        ## Y : list of class (n,c)
        ## return token
        X_array = self.list2onehot(X)
        Y_array = self.class2onehot(Y)
    
   
        ######## selection ####################
        # ValueError: y should be a 1d array, got an array of shape (177, 8) instead.
        if use == 'k':
            self.selector = SelectKBest(chi2, k=k)           
        else:
            self.selector = SelectPercentile(chi2, percentile=p)
        print('Original number of features:', X_array.shape)
        X_array = self.selector.fit_transform(X_array, Y_array)
        print('Reduced number of features:', X_array.shape)
        #######################################
            
        self.sklearn_model.fit(X_array, Y_array)
        feature_importance_list = list(zip([tup[0] for tup in sorted(list(self.enc.vocabulary_.items()), key=lambda x : x[1])], self.sklearn_model.feature_importances_.tolist()))
        feature_importance_list = sorted(feature_importance_list, key = lambda x : -x[1])
        return feature_importance_list
    
    def predict(self, X, type="str"):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        ##########selection################
        X_array = self.selector.transform(X_array) 
        ###########selection#################
        pred = self.sklearn_model.predict(X_array)
        return self.lenc.inverse_transform(pred) if type=="str" else pred
    
    def predict_proba(self, X):
        X_array = self.list2onehot(X, mode="test")
        #Y_array = self.class2onehot(Y, mode="test")
        ##########selection################
        X_array = self.selector.transform(X_array) 
        ###########selection#################
        pred = self.sklearn_model.predict_proba(X_array)
        return np.array([r[:,1] for r in pred]).T
    
    def _PRscore(self, y_true, pred):
        hit_matrix = np.zeros_like(pred)
        hit_matrix[np.where((pred == y_true) & (y_true > 0))] = 1
        
        tp = hit_matrix.sum(axis=1)
        pred_sum = pred.sum(axis=1)
        true_sum = y_true.sum(axis=1)
        pred_l = []
        recall_l = []
        for ix in range(tp.shape[0]):
            pred_score = (1.0 if true_sum[ix] == 0 else 0.0) if pred_sum[ix] == 0 else tp[ix]/pred_sum[ix]
            recall_score = (1.0 if pred_sum[ix] == 0 else 0.0) if true_sum[ix] == 0 else tp[ix]/true_sum[ix]
            pred_l.append(pred_score)
            recall_l.append(recall_score)
        return np.array(pred_l).mean(), np.array(recall_l).mean() 
    
    def score(self, X, Y, k=5):
        X_array = self.list2onehot(X, mode="test")
        Y_array = self.class2onehot(Y, mode="test")
        ##########selection################
        X_array = self.selector.transform(X_array) 
        ###########selection#################
        pred = self.predict(X, type="num")
        
        acc = self.sklearn_model.score(X_array, Y_array)
        
#         precision = precision_score(Y_array, pred, average=None)
#         recall = recall_score(Y_array, pred, average=None)
#         pr_metrix = precision_recall_fscore_support(Y_array, pred, average=None)
        precision, recall = self._PRscore(Y_array, pred)
        
        pred_proba = self.predict_proba(X)
        topk_class = np.argsort(pred_proba, axis=1)[:,-k:]
        topk_pred = np.zeros_like(pred)
        for i in range(pred.shape[0]):
            if pred[i, :].sum() > 0:
                topk_pred[i, topk_class[i]] = 1
        
#         topk = recall_score(Y_array, topk_pred, average=None)
        _, topk = self._PRscore(Y_array, topk_pred)
        return acc, recall, precision, topk
  

In [68]:
# from models.decision_tree import TokenDecisionTreeModel
#     model
#     self.enc = TfidfVectorizer
#     self.lenc = CountVectorizer

def score_tokenize_random_forest_withChiSquared(df , token_col_name, class_col_name, print_mes = False, max_depth =None, min_samples_split = 2, test_size = 0.3, random_state = 42, threshold = 0.05):
    
    train, test = train_test_split(df, test_size = test_size, random_state= random_state)
    
    train_token_list = list(train[token_col_name])
    train_class_list = list(train[class_col_name])
    test_token_list = list(test[token_col_name])
    test_class_list = list(test[class_col_name])

    print(f'training dataset size: {len(train_token_list)}')
    print(f'testing dataset size: {len(test_token_list)}')
    model = RandomForestModel_withChiSquared(dt_args={"random_state":42})
    

    model.list2onehot(train_token_list)
    model.class2onehot(train_class_list)
    
    print(f'x_columns_size: {len(model.enc.vocabulary_)}')
    print(f'y_columns_size: {len(model.lenc.vocabulary_)}')
    
    # print(model.lenc.vocabulary_)
    
    feature_importance_list = model.train(train_token_list, train_class_list)
    print(f'feature_importance_list: {len(feature_importance_list)}')
    print(f'feature_importance_list: {feature_importance_list[:30]}')
    
    # predict
    predict_return_matrix_num = model.predict(test_token_list ,type='num')
    predict_return_matrix = model.predict(test_token_list)

    # score
    acc, recall, precision, topk = model.score(test_token_list, test_class_list)
    score_df = pd.DataFrame([{'acc': acc, 'recall': recall, 'precision': precision, 'topk':topk}])

    # 印出一些東東
    if print_mes:   
        acutual_average = model.class2onehot(df[class_col_name]).sum(axis=1).mean()
        acutual_distribution = model.class2onehot(df[class_col_name]).sum(axis=0)
        print(f'原始資料： {class_col_name} 數目：{acutual_average}')
        print(f'原始資料：每個 {class_col_name} 被使用之數目：{acutual_distribution}')
        print(f'實際{class_col_name}數 {len(acutual_distribution)}')
        
        predict_average = predict_return_matrix_num.sum(axis=1).mean()
        predict_distribution = predict_return_matrix_num.sum(axis=0)
        print(f'預測中：平均 {class_col_name} 數目：{predict_average}')
        print(f'預測中：每個 {class_col_name} 被預測之數目：{predict_distribution}')
        print(f'預測{class_col_name}數 {len(predict_distribution)}')
        
#         print(f'{class_col_name} model 深度: {model.get_depth()}, 葉子數：{model.get_leaves()}')
        
        error_cnt = 0
        for i in range(len(test_class_list)):
            if len(test_class_list[i]) == len(predict_return_matrix[i]):
                if test_class_list[i].sort() == predict_return_matrix[i].sort():
                    continue
            error_cnt+=1
            print(f'predict: {predict_return_matrix[i]} , true: {test_class_list[i]}')
        print(error_cnt)
    
    return model, score_df.T, predict_return_matrix, predict_return_matrix_num

In [69]:
%%time
model_selection, score_df_selection, predict_return_matrix, predict_return_matrix_num \
            = score_tokenize_random_forest_withChiSquared(df, TEXT_TOKEN_COL_NAME, CLASS_COL_NAME,\
                             test_size = TEST_SIZE, random_state = RANDOM_STATE, \
                             print_mes= True) 
with open(RF_SELECT_SAVE_FILE_NAME, 'wb') as files:
    pickle.dump(model_selection.sklearn_model, files)
score_df_selection

training dataset size: 14494
testing dataset size: 3624
x_columns_size: 230487
y_columns_size: 30
Original number of features: (14494, 230487)
Reduced number of features: (14494, 23049)
feature_importance_list: 23049
feature_importance_list: [('(“Pacman”),', 0.005364659646389867), ('(28)', 0.005096398749797424), (".text('\\uf2b9')", 0.004786869798847242), ('$(brew', 0.003909535495943108), ('-Attend', 0.0038503508427258247), ('.filter(),', 0.0037863691039497576), ('2019;', 0.003271299758249466), ('ANGRY', 0.0030788101466418826), ('.foregroundColor(.secondary)', 0.0027686961455537394), ('@click="$emit(\'swim\',', 0.0024318299466715237), ('$0.25', 0.002412986183065953), ('/usr/lib/python3/dist-packages', 0.0023464166348412616), ('3,700', 0.0022939837334630445), ('2005.', 0.00228395026416754), ('330', 0.002191752830896782), ('200k+', 0.0021514990656080306), ('#blockchainevent', 0.0021482162562003605), ('(Rare)', 0.002116253232986729), ('(effectiveed.tech)', 0.002089216526078022), ('(AI)', 

predict: [] , true: ['Covid 19']
predict: [] , true: ['Poetry']
predict: [] , true: ['Education']
predict: [] , true: ['Programming']
predict: [] , true: ['Machine Learning', 'Python', 'Programming', 'Software Development']
predict: [] , true: ['Data Science', 'Machine Learning']
predict: [] , true: ['Startup']
predict: [] , true: ['Startup']
predict: [] , true: ['Programming', 'JavaScript']
predict: [] , true: ['Design']
predict: [] , true: ['Self Improvement', 'Writing', 'Relationships', 'Life']
predict: [] , true: ['Software Development']
predict: [] , true: ['Relationships']
predict: [] , true: ['Self Improvement', 'Life Lessons']
predict: [] , true: ['Poetry']
predict: [] , true: ['Startup']
predict: [] , true: ['Cryptocurrency', 'Bitcoin']
predict: [] , true: ['Software Development', 'Programming']
predict: [] , true: ['Self Improvement']
predict: [] , true: ['Technology']
predict: [] , true: ['Startup', 'Entrepreneurship', 'Marketing', 'Business']
predict: ['Blockchain'] , true:

CPU times: user 4min 23s, sys: 34.3 s, total: 4min 57s
Wall time: 4min 13s


Unnamed: 0,0
acc,0.059879
recall,0.09123
precision,0.10619
topk,0.12328


In [25]:
RF_SELECT_SAVE_FILE_NAME

'./Model/no_stop_1000_30_RF_selection.bin'

## 分別對各類別 select 再合併重要 feature

In [None]:
# 計算量過大，機器無法負荷，只能使用小的資料集觀察

In [20]:
from sklearn.ensemble import RandomForestClassifier

class RandomForestModel_withSelection():
    def __init__(self, dt_args = dict(), tfidf_args = dict()):
        self.sklearn_model = RandomForestClassifier(**dt_args)
        
        if "analyzer" not in tfidf_args.keys():
            tfidf_args["analyzer"] = lambda x : x
        if "decode_error" not in tfidf_args.keys():
            tfidf_args["decode_error"] = "ignore"
        
        self.enc = TfidfVectorizer(**tfidf_args)
        self.lenc = CountVectorizer(analyzer = lambda x : x, decode_error="ignore")
        self.drop_keys = []
        
    def list2onehot(self, l, mode="train"):
        if mode=="train":
            self.enc.fit(l)
        return self.enc.transform(l).toarray()
    
    def class2onehot(self, lc, mode="train"):
        if mode=="train":
            self.lenc.fit(lc)
        return self.lenc.transform(lc).toarray()
        
    def train(self, X, Y):
        ## X : list of tokens (n,t)
        ## Y : list of class (n,c)
        ## return token
        X_array = self.list2onehot(X)
        Y_array = self.class2onehot(Y)
        
        ######## selection ####################
        threshold = 0.05
        X_array_df = pd.DataFrame(X_array)
        Y_array_df = pd.DataFrame(Y_array)
        
        selected_features = [] 
        for i in Y_array_df.keys():
            selector = SelectKBest(chi2, k='all')
            selector.fit(X_array, Y_array_df[i])
            selected_features.append(list(selector.scores_))
    
        # MeanCS 
        selected_features = np.mean(selected_features, axis=0) > threshold
        # MaxCS
#        selected_features = np.max(selected_features, axis=0) > threshold
           
        for i in range(X_array_df.shape[1]):
            if selected_features[i] == False:
                self.drop_keys.append(i)
        print(X_array_df.shape)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        print(X_array_df.shape) 
        X_array = X_array_df.values.tolist()
        
        ######## selection ###################
        
        self.sklearn_model.fit(X_array, Y_array)
        feature_importance_list = list(zip([tup[0] for tup in sorted(list(self.enc.vocabulary_.items()), key=lambda x : x[1])], self.sklearn_model.feature_importances_.tolist()))
        feature_importance_list = sorted(feature_importance_list, key = lambda x : -x[1])
        return feature_importance_list
    
    def predict(self, X, type="str"):
        X_array = self.list2onehot(X, mode="test")
        ##########  selection  ################
        X_array_df = pd.DataFrame(X_array)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        X_array = X_array_df.values.tolist()
        ###########   selection  #################
        
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict(X_array)
        return self.lenc.inverse_transform(pred) if type=="str" else pred
    
    def predict_proba(self, X):
        X_array = self.list2onehot(X, mode="test")
        ##########   selection   ################
        X_array_df = pd.DataFrame(X_array)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        X_array = X_array_df.values.tolist()
        ###########   selection  #################
        #Y_array = self.class2onehot(Y, mode="test")
        pred = self.sklearn_model.predict_proba(X_array)
        return np.array([r[:,1] for r in pred]).T
    
    def score(self, X, Y, k=5):
        X_array = self.list2onehot(X, mode="test")
        ##########  selection  ################
        X_array_df = pd.DataFrame(X_array)
        X_array_df = X_array_df.drop(columns = self.drop_keys)
        X_array = X_array_df.values.tolist()
        ###########  selection  #################
        
        Y_array = self.class2onehot(Y, mode="test")
        pred = self.predict(X, type="num")
        
        acc = self.sklearn_model.score(X_array, Y_array)
        precision = precision_score(Y_array, pred, average='micro')
        recall = recall_score(Y_array, pred, average='micro')
        pr_metrix = precision_recall_fscore_support(Y_array, pred, average=None)
        
        pred_proba = self.predict_proba(X)
        topk_class = np.argsort(pred_proba, axis=1)[:,-k:]
        topk_pred = np.zeros_like(pred)
        
        for i in range(pred.shape[0]):
            topk_pred[i, topk_class[i]] = 1
        
        topk = recall_score(Y_array, topk_pred, average='micro')
        return acc, recall, precision, pr_metrix, topk
        
    def get_depth(self):
        return self.sklearn_model.get_depth()
    def get_leaves(self):
        return self.sklearn_model.get_n_leaves()

In [24]:
# from models.decision_tree import TokenDecisionTreeModel
#     model
#     self.enc = TfidfVectorizer
#     self.lenc = CountVectorizer

def score_tokenize_random_forest_withSelection(df , token_col_name, class_col_name, print_mes = False ,max_depth =None, min_samples_split = 2, test_size = 0.3, random_state = 42, threshold = 0.05):
    
    train, test = train_test_split(df, test_size = test_size, random_state= random_state)
    
    train_token_list = list(train[token_col_name])
    train_class_list = list(train[class_col_name])
    test_token_list = list(test[token_col_name])
    test_class_list = list(test[class_col_name])

    print(f'training dataset size: {len(train_token_list)}')
    print(f'testing dataset size: {len(test_token_list)}')
    model = RandomForestModel_withSelection(dt_args={"random_state":42})
    
    model.list2onehot(train_token_list)
    model.class2onehot(train_class_list)
    
    print(f'x_columns_size: {len(model.enc.vocabulary_)}')
    print(f'y_columns_size: {len(model.lenc.vocabulary_)}')
    
    # print(model.lenc.vocabulary_)
    
    feature_importance_list = model.train(train_token_list, train_class_list)
    print(f'feature_importance_list: {len(feature_importance_list)}')
    print(f'feature_importance_list: {feature_importance_list[:30]}')
    
    # predict
    predict_return_matrix_num = model.predict(test_token_list ,type='num')
    predict_return_matrix = model.predict(test_token_list)

    # score
    acc, recall, precision, pr_metrix, topk = model.score(test_token_list, test_class_list)
    score_df = pd.DataFrame([{'acc': acc, 'recall': recall, 'precision': precision, 'topk':topk}])

    # 印出一些東東
    if print_mes:   
        acutual_average = model.class2onehot(df[class_col_name]).sum(axis=1).mean()
        acutual_distribution = model.class2onehot(df[class_col_name]).sum(axis=0)
        print(f'原始資料： {class_col_name} 數目：{acutual_average}')
        print(f'原始資料：每個 {class_col_name} 被使用之數目：{acutual_distribution}')
        print(f'實際{class_col_name}數 {len(acutual_distribution)}')
        
        predict_average = predict_return_matrix_num.sum(axis=1).mean()
        predict_distribution = predict_return_matrix_num.sum(axis=0)
        print(f'預測中：平均 {class_col_name} 數目：{predict_average}')
        print(f'預測中：每個 {class_col_name} 被預測之數目：{predict_distribution}')
        print(f'預測{class_col_name}數 {len(predict_distribution)}')
    
    return model, score_df.T, pr_metrix, predict_return_matrix, predict_return_matrix_num

In [25]:
from sklearn.metrics import log_loss

In [26]:
%%time
model_selection, score_df_selection, pr_metrix, predict_return_matrix, predict_return_matrix_num \
            = score_tokenize_random_forest_withSelection(df[:5000], TEXT_TOKEN_COL_NAME, CLASS_COL_NAME,\
                             test_size = TEST_SIZE, random_state = RANDOM_STATE, \
                             print_mes= True) 
with open(DT_SELECT_SAVE_FILE_NAME, 'wb') as files:
    pickle.dump(model_selection.sklearn_model, files)
score_df_selection

training dataset size: 4000
testing dataset size: 1000
x_columns_size: 98511
y_columns_size: 30
(4000, 98511)
(4000, 75915)
feature_importance_list: 75915
feature_importance_list: [('Shapeshift.', 0.00452791998090063), ('[Code]', 0.0036231907031217567), ('DIY', 0.003375886157137125), ('Yahoo’s', 0.003022921569107749), ('YANA', 0.0027146494299852995), ('90/90/1', 0.0025601652391427313), ('800', 0.0024232837564157543), ('panicking,', 0.0022369317706686423), ('Trevor', 0.0021869707863577644), ('Yagub', 0.0021313995397155075), ('minor', 0.0020877307592224023), ('nonperishable', 0.0018733020363131516), ('(brokers)', 0.0018419386851931253), ('non-invasive', 0.0017237594215333907), ('beach', 0.0017143530112129612), ('desire.\u2063\u2063', 0.0016771591669242159), ('divisible', 0.0015686445140752213), ('free-style', 0.001535131265220076), ('133', 0.0015194302002593254), ('moderator/admin', 0.001509929809796408), ('23:59:59.', 0.0014758518029848822), ('Hydrogen,', 0.0014646496460463975), ('effec

  _warn_prf(average, modifier, msg_start, len(result))


原始資料： reduced_tags 數目：1.8408
原始資料：每個 reduced_tags 被使用之數目：[178 633 660 192  92 596 144 467 181 175 123 130 295 442 366 213 308 154
 208 413 177 726 241 198 289 317 354 191 361 380]
實際reduced_tags數 30
預測中：平均 reduced_tags 數目：0.223
預測中：每個 reduced_tags 被預測之數目：[ 0 61 61  0  0 45  0  8  0  0  0  0  1  0  0  0  1  0  0  6  0 34  1  0
  0  0  2  0  0  3]
預測reduced_tags數 30
CPU times: user 2min 23s, sys: 50.8 s, total: 3min 13s
Wall time: 1min 51s


Unnamed: 0,0
acc,0.036
recall,0.095609
precision,0.7713
topk,0.705948
