In [1]:
train_dir = 'C:/Users/ROG/OneDrive/桌面/FYP/Dataset/Train_data/train_data_after_washing.csv'
test_dir = 'C:/Users/ROG/OneDrive/桌面/FYP/Dataset/Test_data/test_data_after_washing.csv'

In [2]:
import os
import shutil

import tensorflow as tf
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import multiprocessing
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score, cohen_kappa_score, roc_curve, auc, make_scorer, accuracy_score, f1_score 
tf.get_logger().setLevel('ERROR') # return ERROR messages, ignore others

In [3]:
#one hot
def encode_one_hot(ori_dataframe):
    dummies = pd.get_dummies(ori_dataframe)
    res = pd.concat([ori_dataframe, dummies], axis=1)
    return res

In [4]:
#load data
data = pd.read_csv(train_dir)
data = pd.DataFrame(data)
test_data = pd.read_csv(test_dir)
test_data = pd.DataFrame(test_data)
train_data = pd.DataFrame()
val_data = pd.DataFrame()
data = data[["review","rating"]]
test_data = test_data[["review","rating"]]

In [5]:
data['labels'] = data['rating'].map({1 : 0,
                                     2 : 0,
                                     3 : 0,
                                     4 : 0,
                                     5 : 1,
                                     6 : 1,
                                     7 : 1,
                                     8 : 1,
                                     9 : 2,
                                     10 : 2})

In [6]:
test_data['labels'] = test_data['rating'].map({1 : 0,
                                               2 : 0,
                                               3 : 0,
                                               4 : 0,
                                               5 : 1,
                                               6 : 1,
                                               7 : 1,
                                               8 : 1,
                                               9 : 2,
                                               10 : 2})

In [7]:
#extract data
data = data[["review","labels"]]
test_data = test_data[["review","labels"]]

In [8]:
def clean(text):
    wn = nltk.WordNetLemmatizer()
    stopword = nltk.corpus.stopwords.words('english') #remove useless words
    tokens = nltk.word_tokenize(text) #Tokenizers divide strings into lists of substrings
    lower = [word.lower() for word in tokens] #remove uppercase
    no_stopwords = [word for word in lower if word not in stopword]
    no_alpha = [word for word in no_stopwords if word.isalpha()]
    lemm_text = [wn.lemmatize(word) for word in no_alpha]
    clean_text = lemm_text
    return clean_text

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(data,tfidf_vect_fit):
    X_tfidf = tfidf_vect_fit.transform(data) #Transform doc to matrix 
    words = tfidf_vect_fit.get_feature_names_out() #Get features names
    X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
    X_tfidf_df.columns = words
    return(X_tfidf_df)

In [10]:
def build_classifier_model(s):
    inputs = tf.keras.Input(shape=(s,))
    net = tf.keras.layers.Dense(128, activation='gelu', name='classifier_dense_1')(inputs)
    net = tf.keras.layers.Dropout(0.3)(net)
    net = tf.keras.layers.Dense(128, activation='gelu', name='classifier_dense_2')(net)
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(128, activation='gelu', name='classifier_dense_3')(net)
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(128, activation='gelu', name='classifier_dense_4')(net)
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(3, activation='softmax', name='classifier_dense_5')(net)
    return tf.keras.Model(inputs, net)

In [11]:
def ff_loss(y_true,y_pred):
    alpha = tf.constant([[1],[2],[1]], dtype=tf.float32) #adjust weight for each label
    gamma = 1.25
    epsilon = 1.e-7
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.clip_by_value(y_pred, epsilon, 1. - epsilon)
    y_t = tf.multiply(y_true, y_pred) + tf.multiply(1-y_true, 1-y_pred)
    ce = -tf.math.log(y_t)
    weight = tf.pow(tf.subtract(1., y_t), gamma)
    fl = tf.matmul(tf.multiply(weight, ce), alpha)
    loss = tf.reduce_mean(fl)
    loss = tf.abs(loss-0.10) + 0.10
    loss = tf.convert_to_tensor(loss)
    return loss

In [12]:
def graded_precision(y_true, y_pred, weights):
    precision_0 = precision_score(y_true, y_pred, labels=[0], average='macro')
    precision_1 = precision_score(y_true, y_pred, labels=[1], average='macro')
    precision_2 = precision_score(y_true, y_pred, labels=[2], average='macro')
    gp = ( weights[0] * precision_0 + weights[1] * precision_1 + weights[2] * precision_2 ) / ( weights[0] + weights[1] + weights[2] )
    return gp
def graded_recall(y_true, y_pred, weights):
    recall_0 = recall_score(y_true, y_pred, labels=[0], average='macro')
    recall_1 = recall_score(y_true, y_pred, labels=[1], average='macro')
    recall_2 = recall_score(y_true, y_pred, labels=[2], average='macro')
    gr = ( weights[0] * recall_0 + weights[1] * recall_1 + weights[2] * recall_2 ) / ( weights[0] + weights[1] + weights[2] )
    return gr
def graded_f1(precision, recall):
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [13]:
import time
def model_training(x_train, y_train, x_val, y_val, times):
    times = str(times)
    t0 = time.time()
    tfidf_vect = TfidfVectorizer(analyzer=clean)
    tfidf_vect_fit=tfidf_vect.fit(x_train)
    x_train = vectorize(x_train,tfidf_vect_fit)
    x_val = vectorize(x_val,tfidf_vect_fit)
    x_train = tf.convert_to_tensor(x_train.to_numpy())
    y_train = tf.convert_to_tensor(y_train.to_numpy())
    x_val = tf.convert_to_tensor(x_val.to_numpy())
    y_val = tf.convert_to_tensor(y_val.to_numpy())
    s = int(tf.shape(x_train)[1])   
    classifier_model = build_classifier_model(s)
    epochs = 5
    steps_per_epoch = 0
    for _ in x_train:
        steps_per_epoch = steps_per_epoch + 1
    num_train_steps = steps_per_epoch/24 * epochs
    num_warmup_steps = int(0.1*num_train_steps)

    init_lr = 3e-6
    optimizer = optimization.create_optimizer(init_lr=init_lr,
                                              num_train_steps=num_train_steps,
                                              num_warmup_steps=num_warmup_steps,
                                              optimizer_type='adamw')
    classifier_model.compile(optimizer=optimizer,
                             loss=ff_loss,
                             metrics=["accuracy", tf.keras.metrics.Recall(name='recall')])
    file_name='WF_' + times
    checkpoint_path = 'C:/Users/ROG/OneDrive/桌面/FYP/Model/'+ file_name + '/ckpt/cp.ckpt'
    cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)
    csv_callback = tf.keras.callbacks.CSVLogger(
    'C:/Users/ROG/OneDrive/桌面/FYP/Model/'+ file_name + '/record.csv', separator=',', append=False
    )
    print(times)
    history = classifier_model.fit(x=x_train,
                                   y=y_train,
                                   validation_data=(x_val,y_val),
                                   epochs=5,
                                   batch_size=24,
                                   callbacks=[cp_callback,csv_callback])
    t1 = time.time()
    time_train = t1-t0
    del x_train
    del x_val
    del y_train
    del y_val
    return (tfidf_vect_fit,classifier_model,time_train)

In [14]:
def model_testing(encoder, model, X, y, times,val):
    times = str(times)
    #generate y_true and prediction results
    y = tf.convert_to_tensor(y.to_numpy())
    y_true = y
    if val == True:
        y_true = np.argmax(y, axis=1)    
    X = vectorize(X, encoder)
    X = tf.convert_to_tensor(X.to_numpy())
    pred = np.argmax(model.predict(X),axis=1)
    print(pred)
    print(y_true)
    #different metrics
    acc = accuracy_score(y_true, pred)
    weights = [2, 1, 1]
    prec = graded_precision(y_true, pred, weights)
    rec = graded_recall(y_true, pred, weights)
    f1 = graded_f1(prec, rec)
    kappa = cohen_kappa_score(y_true, pred)
    print(acc, prec, rec, f1, kappa)
    #CM
    if val == False:
        con_mat = confusion_matrix(y, pred)
        con_mat_norm = con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis]     # 归一化
        con_mat_norm = np.around(con_mat_norm, decimals=2)
        plt.figure(figsize=(8, 8))
        sns.heatmap(con_mat_norm, annot=True, cmap='Blues')
        plt.ylim(0, 3)
        plt.xlabel('Predicted labels')
        plt.ylabel('True labels')
        #save CM
        file_name='WF_' + times
        plt.savefig(fname='C:/Users/ROG/OneDrive/桌面/FYP/Model/'+ file_name + '/CM.png', dpi=300)
        plt.close()
    del y
    del y_true
    del X
    del pred
    return (acc, prec, rec, f1, kappa)

In [15]:
data = data[["review","labels"]]
y = data["labels"].to_numpy()
Y = data["labels"]
test_data = test_data[["review","labels"]]
X = data["review"].to_numpy()
x = data["review"]
test_X = test_data["review"]
test_y = test_data["labels"]

In [16]:
#10-fold

skf = StratifiedKFold(n_splits=10)
val_acc = []
val_gp = []
val_gr = []
val_f1 = []
val_kp = []
tes_acc = []
tes_gp = []
tes_gr = []
tes_f1 = []
tes_kp = []
train_time = []
times = 0
for train_index, val_index in skf.split(X, y):
    #physical_device = tf.config.list_physical_devices('GPU')[0] # get the first GPU device
    #tf.config.experimental.set_memory_growth(physical_device, True)
    X_train, X_val = x[train_index], x[val_index]
    y_train, y_val = Y[train_index], Y[val_index]
    y_train = encode_one_hot(y_train)
    y_val = encode_one_hot(y_val)
    y_train = y_train[[0,1,2]]
    y_val = y_val[[0,1,2]]
    encoder, model,time_train = model_training(X_train,y_train, X_val, y_val, times)
    train_time.append(time_train)
    
    val = True
    acc, prec, rec, f1, kappa = model_testing(encoder,model, X_val, y_val, times, val)
    val_acc.append(acc)
    val_gp.append(prec)
    val_gr.append(rec)
    val_f1.append(f1)
    val_kp.append(kappa)
    val = False
    acc, prec, rec, f1, kappa = model_testing(encoder,model, test_X, test_y, times, val)
    tes_acc.append(acc)
    tes_gp.append(prec)
    tes_gr.append(rec)
    tes_f1.append(f1)
    tes_kp.append(kappa)
    tf.keras.backend.clear_session
    del encoder
    del model
    times = times + 1

0
Epoch 1/5
Epoch 1: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_0/ckpt\cp.ckpt
Epoch 2/5
Epoch 2: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_0/ckpt\cp.ckpt
Epoch 3/5
Epoch 3: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_0/ckpt\cp.ckpt
Epoch 4/5
Epoch 4: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_0/ckpt\cp.ckpt
Epoch 5/5
Epoch 5: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_0/ckpt\cp.ckpt
[2 2 2 ... 2 2 2]
[2 2 2 ... 1 1 1]
0.5124124761298536 0.1281031190324634 0.25 0.16940235690235692 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[2 2 2 ... 2 2 2]
tf.Tensor([2 2 0 ... 2 2 2], shape=(10434,), dtype=int64)
0.5115967030860648 0.1278991757715162 0.25 0.1692239411615521 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


1
Epoch 1/5
Epoch 1: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_1/ckpt\cp.ckpt
Epoch 2/5
Epoch 2: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_1/ckpt\cp.ckpt
Epoch 3/5
Epoch 3: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_1/ckpt\cp.ckpt
Epoch 4/5
Epoch 4: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_1/ckpt\cp.ckpt
Epoch 5/5
Epoch 5: saving model to C:/Users/ROG/OneDrive/桌面/FYP/Model/WF_1/ckpt\cp.ckpt


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

In [17]:
    val = True
    acc, prec, rec, f1, kappa = model_testing(encoder,model, X_val, y_val, times, val)
    val_acc.append(acc)
    val_gp.append(prec)
    val_gr.append(rec)
    val_f1.append(f1)
    val_kp.append(kappa)
    val = False
    acc, prec, rec, f1, kappa = model_testing(encoder,model, test_X, test_y, times, val)
    tes_acc.append(acc)
    tes_gp.append(prec)
    tes_gr.append(rec)
    tes_f1.append(f1)
    tes_kp.append(kappa)
    tf.keras.backend.clear_session
    del encoder
    del model
    times = times + 1

[2 2 2 ... 2 2 2]
[0 0 0 ... 1 1 1]
0.5124124761298536 0.1281031190324634 0.25 0.16940235690235692 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[2 2 2 ... 2 2 2]
tf.Tensor([2 2 0 ... 2 2 2], shape=(10434,), dtype=int64)
0.5115967030860648 0.1278991757715162 0.25 0.1692239411615521 0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
train_time

[79.52303552627563, 78.848872423172]

In [19]:
    t0 = time.time()
    tfidf_vect = TfidfVectorizer(analyzer=clean)
    tfidf_vect_fit=tfidf_vect.fit(x_train)
    t1 = time.time()
    time_train = t1-t0

NameError: name 'x_train' is not defined

In [None]:
time_train

In [None]:
matrics_list = [val_acc, val_gp, val_gr, val_f1, val_kp, tes_acc, tes_gp, tes_gr, tes_f1, tes_kp, train_time]
avg_results = []
for matric in matrics_list:
    total = 0
    for item in matric:
        total = total + item
    avg_results.append(total/len(matric))

In [None]:
with open("C:/Users/ROG/OneDrive/桌面/FYP/Model/W2V_FCNN.txt", "w") as f:
    f.write("val_acc: ")
    for item in val_acc:
        f.write(str(item))
        if val_acc.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")
    
    f.write("val_gp: ")
    for item in val_gp:
        f.write(str(item))
        if val_gp.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")
    
    f.write("val_gr: ")
    for item in val_gr:
        f.write(str(item))
        if val_gr.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")  
        
    f.write("val_f1: ")
    for item in val_f1:
        f.write(str(item))
        if val_f1.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")  
    
    f.write("val_kp: ")
    for item in val_kp:
        f.write(str(item))
        if val_kp.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")   
    
    f.write("tes_acc: ")
    for item in tes_acc:
        f.write(str(item))
        if tes_acc.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")  
        
    f.write("tes_gp: ")
    for item in tes_gp:
        f.write(str(item))
        if tes_gp.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")  
        
    f.write("tes_gr: ")
    for item in tes_gr:
        f.write(str(item))
        if tes_gr.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")  
            
    f.write("tes_f1: ")
    for item in tes_f1:
        f.write(str(item))
        if tes_f1.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")  
            
    f.write("tes_kp: ")
    for item in tes_kp:
        f.write(str(item))
        if tes_kp.index(item) == len(val_acc) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")
    
    f.write("train_time: ")
    for item in train_time:
        f.write(str(item))
        if train_time.index(item) == len(train_time) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")
    
    f.write("average_results: ")
    for item in avg_results:
        f.write(str(item))
        if avg_results.index(item) == len(avg_results) - 1: # Check if last item
            f.write(';')
        else:
            f.write(', ')
    f.write("\n")