In [6]:
import os
import yaml
import numpy as np
import pandas as pd
from utils import mysqlDatabase
from myBertTools import myBertModel, myTokenizer
from sklearn.model_selection import train_test_split

In [7]:
def get_data(MysqlDatabase, sql):
    df = MysqlDatabase.select_table(sql)
    df['text'] = df['title'] + ' ' + df['excerpt'] + ' ' + df['topics']
    df = df[['text', 'name']]
    df.columns = ['text', 'label']
    ##
    df_t, df_v,= train_test_split(df, test_size = 0.1, random_state = 42, stratify = df.label)
    df_t['type'] = 'train'
    df_v['type'] = 'valid'
    df = pd.concat([df_t, df_v], sort=True)
    df = df.reset_index(drop=True)
    
    # 標籤存檔
    label_df = pd.get_dummies(df.label)
    Y = label_df.values
    print('Shape of label tensor:', Y.shape)

    label_list = list(label_df.columns)
    label_dic = { i : label_list[i] for i in range(0, len(label_list) ) }
    label_site = './model_output/dcard_cate_label_dic.npy'
    np.save(label_site, label_dic)

    ##
    Y_df = pd.DataFrame(Y)
    df = pd.concat([df, Y_df],sort = True, axis=1)
    print(df.shape)
    df.head()
    return df, Y_df

In [8]:
def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])

In [9]:
class dataGenerator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = MyTokenizer.encode(first=text)
                y = d[1:]
                X1.append(x1)
                X2.append(x2)
                Y.append(y)
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []

In [10]:
if __name__ == '__main__':
    os.environ['TF_KERAS'] = '1'
    maxlen = 100
    pretrained_path = '/Users/jackyfu/Desktop/hwf87_git/bert_wwm/'
    config_path = os.path.join(pretrained_path, 'bert_config.json')
    checkpoint_path = os.path.join(pretrained_path, 'bert_model.ckpt')
    vocab_path = os.path.join(pretrained_path, 'vocab.txt')
    output_path = '/Users/jackyfu/Desktop/hwf87_git/Dcard_post_classification/model_output'
    with open('config.yml', 'r') as stream:
        myconfig = yaml.load(stream, Loader=yaml.CLoader)
    database_username = myconfig['mysql_database']['database_username']
    database_password = myconfig['mysql_database']['database_password']
    database_ip       = myconfig['mysql_database']['database_ip']
    database_name     = myconfig['mysql_database']['database_name']
    MysqlDatabase = mysqlDatabase(database_username, database_password, database_ip, database_name)
    sql = '''
    SELECT df.name, dp.*
    FROM Bigdata.dcard_posts dp
    left join Bigdata.dcard_forums df on dp.forumid = df.id
    WHERE 1=1
    and df.name in ('時事', '網路購物', '股票', '美妝', '工作', '考試', '穿搭', '3C', 'Apple', '感情', 
                    '美食', '理財', '居家生活', '臺灣大學', 'YouTuber');
    '''
    ##
    df, Y_df = get_data(MysqlDatabase, sql)
    MyBertModel = myBertModel(pretrained_path, config_path, checkpoint_path, vocab_path)
    token_dict = MyBertModel.get_token_dict()
    model = MyBertModel.build_model(Y_df)
    MyTokenizer = myTokenizer(token_dict)

    train_data = df[df['type'] == 'train'].drop(columns=['label', 'type']).values.tolist()
    valid_data = df[df['type'] == 'valid'].drop(columns=['label', 'type']).values.tolist()
    
    train_D = dataGenerator(train_data)
    valid_D = dataGenerator(valid_data)

    history = model.fit(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=2,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D)
    )
    model_path = './model_output/dcard_post_cls_bert.h5'
    model.save(model_path)

Successfully select from Bigdata table
Shape of label tensor: (9134, 15)
(9134, 18)
Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t['type'] = 'train'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_v['type'] = 'valid'
2021-09-04 17:20:55.718164: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-09-04 17:20:55.718239: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
model_1 (Functional)            (None, None, 768)    101677056   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 768)          0           model_1[0][0]              

2021-09-04 17:20:58.811399: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-09-04 17:20:58.811571: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-09-04 17:21:03.128197: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-09-04 17:32:06.573084: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/2


