In [1]:
# Locations of the train and test CSV files. Despite the `_dir` suffix, both
# names hold paths to files, not directories.
train_dir, test_dir = (
    'C:/Users/ROG/OneDrive/桌面/FYP/Dataset/Train_data/train_data_after_washing.csv',
    'C:/Users/ROG/OneDrive/桌面/FYP/Dataset/Test_data/test_data_after_washing.csv',
)

In [2]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.models import Model
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json
import time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score, cohen_kappa_score
# Raise the TensorFlow logger's threshold to ERROR so that lower-severity
# messages (e.g. INFO and WARNING) are no longer emitted.
tf.get_logger().setLevel('ERROR')

In [3]:
# Load the train/test CSVs, keeping only the review text and its rating.
# `pd.read_csv` already returns a DataFrame, so no extra `pd.DataFrame(...)`
# wrap is needed, and `usecols` avoids parsing columns that are discarded
# anyway. The trailing `.copy()` gives each frame its own data so that adding
# a column to it later does not trigger pandas' SettingWithCopyWarning.
_columns = ["review", "rating"]
data = pd.read_csv(train_dir, usecols=_columns)[_columns].copy()
test_data = pd.read_csv(test_dir, usecols=_columns)[_columns].copy()

In [4]:
# Collapse the 1-10 rating into three classes:
#   1-4 -> 0,   5-8 -> 1,   9-10 -> 2
# Ratings that are not keys of the mapping become NaN under `.map`.
data['labels'] = data['rating'].map({
    **dict.fromkeys(range(1, 5), 0),
    **dict.fromkeys(range(5, 9), 1),
    **dict.fromkeys(range(9, 11), 2),
})

In [5]:
# Apply the same three-class bucketing to the test split:
#   1-4 -> 0,   5-8 -> 1,   9-10 -> 2
# Ratings that are not keys of the mapping become NaN under `.map`.
test_data['labels'] = test_data['rating'].map({
    **dict.fromkeys(range(1, 5), 0),
    **dict.fromkeys(range(5, 9), 1),
    **dict.fromkeys(range(9, 11), 2),
})

In [6]:
# Keep only the review text and the derived class label in both splits.
label_columns = ["review", "labels"]
data, test_data = data[label_columns], test_data[label_columns]

In [7]:
# Pull the review text (x_*) and class labels (y_*) out as NumPy arrays.
x_train, y_train = (data[column].to_numpy() for column in ("review", "labels"))
x_test, y_test = (test_data[column].to_numpy() for column in ("review", "labels"))

In [8]:
# TF Hub handles: the `albert_en_base` encoder and the `albert_en_preprocess`
# text-preprocessing model that feeds it.
tfhub_handle_encoder, tfhub_handle_preprocess = (
    'https://tfhub.dev/tensorflow/albert_en_base/2',
    'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
)

In [9]:
# Instantiate standalone Keras layers for the preprocessing model and encoder.
# NOTE(review): neither object is referenced by the other cells visible here
# (build_classifier_model creates its own hub layers) - confirm they are needed.
bert_preprocess_model, bert_model = (
    hub.KerasLayer(handle)
    for handle in (tfhub_handle_preprocess, tfhub_handle_encoder)
)

In [10]:
def build_classifier_model():
    """Build the three-class text classifier on top of the TF Hub encoder.

    Pipeline: raw string -> TF Hub preprocessing -> trainable encoder ->
    pooled output -> dropout(0.1) -> four 128-unit ReLU layers -> 3-way softmax.

    Layer names are part of this function's contract because other cells look
    layers up by name (``get_layer('dropout')``). The dropout layer is therefore
    named explicitly instead of relying on Keras' auto-generated name, which
    gains a numeric suffix (``dropout_1``, ...) when the model is built more
    than once in the same session. The encoder layer is called ``BERT_encoder``
    even though the handle it loads is an ALBERT model.

    Returns:
        tf.keras.Model: maps a batch of strings (input named ``text``) to
        softmax scores over 3 classes.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    # Explicit name keeps `get_layer('dropout')` valid across repeated builds.
    net = tf.keras.layers.Dropout(0.1, name='dropout')(net)
    # Hidden head: classifier_dense_1 .. classifier_dense_4, created in order.
    for index in range(1, 5):
        net = tf.keras.layers.Dense(
            128, activation='relu', name=f'classifier_dense_{index}'
        )(net)
    net = tf.keras.layers.Dense(3, activation='softmax', name='classifier_dense_5')(net)
    return tf.keras.Model(text_input, net)

In [11]:
# Build the classifier (preprocessing + encoder + dense head).
classifier_model = build_classifier_model()

In [12]:
# Print the layer-by-layer architecture table (output shapes, parameter
# counts and connections).
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [13]:
# Restore weights from a checkpoint into the freshly built model.
# NOTE(review): the returned load-status object is displayed but never
# checked, so a checkpoint that only partially matches would go unnoticed.
checkpoint_path = 'C:/Users/ROG/OneDrive/桌面/FYP/Model/TWB_4_121_1.25/ckpt/cp.ckpt'
classifier_model.load_weights(checkpoint_path)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x278106b71f0>

In [14]:
# Feature extractor: same input as the classifier, output taken at the dropout
# layer that sits directly on the encoder's pooled output.
# The layer is located by type rather than by the name 'dropout': that name is
# auto-generated by Keras and becomes 'dropout_1', 'dropout_2', ... whenever
# the model-building cell is re-run in the same kernel, which would make a
# name-based lookup fail.
_dropout_layers = [
    layer for layer in classifier_model.layers
    if isinstance(layer, tf.keras.layers.Dropout)
]
if len(_dropout_layers) != 1:
    raise ValueError(
        'Expected exactly one Dropout layer in classifier_model, found '
        + str(len(_dropout_layers))
    )
bert_layer_model = Model(classifier_model.input, outputs=_dropout_layers[0].output)

In [15]:
# Run every training review through the truncated model to get its feature
# vector. Note that `X_train` (features) and `x_train` (raw review text) differ
# only in the case of the first letter.
X_train = bert_layer_model.predict(x_train)



In [16]:
# Same feature extraction for the test reviews. `X_test` (features) and
# `x_test` (raw review text) differ only in the case of the first letter.
X_test = bert_layer_model.predict(x_test)



In [19]:
def _row_normalize(con_mat):
    """Scale each row of `con_mat` to sum to 1 and round to 2 decimals.

    Rows whose total is zero (a class with no true samples) are returned as
    zeros instead of the NaN that a plain 0/0 division would produce.
    """
    row_totals = con_mat.sum(axis=1, keepdims=True)
    normalised = np.divide(
        con_mat,
        row_totals,
        out=np.zeros(con_mat.shape, dtype=float),
        where=row_totals != 0,
    )
    return np.around(normalised, decimals=2)


def report_results(model, X, y, t, d,
                   out_dir='C:/Users/ROG/OneDrive/桌面/FYP/Model/BERT-RF/'):
    """Score `model` on (X, y) and save a normalised confusion-matrix heatmap.

    Args:
        model: fitted estimator exposing ``predict(X)``.
        X: feature matrix passed to ``model.predict``.
        y: true labels for the rows of `X`.
        t: first filename component (the caller passes the number of trees).
            Any value is accepted; it is formatted with ``str()``.
        d: second filename component (the caller passes the max depth).
        out_dir: directory that receives ``<t>_<d>.png``. It is created if it
            does not exist. Defaults to the original hard-coded location.

    Returns:
        dict with keys ``'f1'``, ``'acc'``, ``'precision'``, ``'recall'``
        (f1/precision/recall are weighted averages) and ``'kappa'``, all plain
        Python floats so the dict is JSON-serialisable.
    """
    pred = model.predict(X)
    result = {
        'f1': float(f1_score(y, pred, average='weighted')),
        'acc': float(accuracy_score(y, pred)),
        'precision': float(precision_score(y, pred, average='weighted')),
        'recall': float(recall_score(y, pred, average='weighted')),
        'kappa': float(cohen_kappa_score(y, pred)),
    }

    # Normalise per true class so each row shows how that class was predicted.
    con_mat_norm = _row_normalize(confusion_matrix(y, pred))

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        sns.heatmap(con_mat_norm, annot=True, cmap='Blues', ax=ax)
        # Explicit y-limits sized from the matrix (previously hard-coded to 3),
        # running from 0 upwards as in the original plot.
        ax.set_ylim(0, con_mat_norm.shape[0])
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')

        os.makedirs(out_dir, exist_ok=True)
        fig.savefig(fname=os.path.join(out_dir, f'{t}_{d}.png'), dpi=300)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

    return result

In [24]:
# Grid over forest size (n_estimators) and depth cap (max_depth). For every
# combination: fit on the extracted training features, time the fit, evaluate
# on the test features, and write "<trees>_<depth>.txt" containing
# "time:<seconds> s;<json metrics>".
RF_SEED = 42  # fixed seed so each configuration gives the same forest on re-run
RF_OUTPUT_DIR = 'C:/Users/ROG/OneDrive/桌面/FYP/Model/BERT-RF/'

for t in [10, 20, 50, 100, 200, 1000]:
    for d in [10, 50, 100, 300]:
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments.
        fit_start = time.perf_counter()
        rf_classifier = RandomForestClassifier(
            n_estimators=t, max_depth=d, random_state=RF_SEED
        )
        rf_classifier.fit(X_train, y_train)
        time_train = str(time.perf_counter() - fit_start) + ' s'

        tree, depth = str(t), str(d)
        result = report_results(rf_classifier, X_test, y_test, tree, depth)

        # The context manager closes the file even if serialisation fails or
        # the run is interrupted part-way through.
        result_path = RF_OUTPUT_DIR + tree + '_' + depth + '.txt'
        with open(result_path, 'w', encoding='utf-8') as text_file:
            text_file.write('time:' + time_train + ';')
            text_file.write(json.dumps(result))

KeyboardInterrupt: 