In [None]:
# Fetch the competition assets from Google Drive into the working directory.
# `--fuzzy` lets gdown extract the file id from a full share link.
# Labelled training data.
!gdown --fuzzy "https://docs.google.com/spreadsheets/d/1xi7zilOJ8qxZgm5vLVjheVKziOfwqiIw/edit?usp=drive_link&ouid=103889639646613430352&rtpof=true&sd=true" -O train_df.xlsx
# Unlabelled test data used to build the submission.
!gdown --fuzzy "https://docs.google.com/spreadsheets/d/1ppJhP-wunCc69GTLA9Yc8jXPlHcjYo_l/edit?usp=drive_link&ouid=103889639646613430352&rtpof=true&sd=true" -O test_df.xlsx
# Stop-word list (not referenced by the cells visible in this notebook section).
!gdown --fuzzy "https://drive.google.com/file/d/1Aj4olw8KDH7NrKFLQZeSN_Hfxj8d4uuT/view?usp=drive_link" -O stopwords.txt
# Custom jieba user dictionary, loaded later via jieba.load_userdict('dict.txt').
!gdown --fuzzy "https://drive.google.com/file/d/1cDHLwy0ZgNxWN5nvCvqWWPNyLwMP1A0d/view?usp=drive_link" -O dict.txt

Downloading...
From: https://drive.google.com/uc?id=1xi7zilOJ8qxZgm5vLVjheVKziOfwqiIw
To: /content/train_df.xlsx
100% 2.14M/2.14M [00:00<00:00, 196MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ppJhP-wunCc69GTLA9Yc8jXPlHcjYo_l
To: /content/test_df.xlsx
100% 889k/889k [00:00<00:00, 159MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Aj4olw8KDH7NrKFLQZeSN_Hfxj8d4uuT
To: /content/stopwords.txt
100% 7.86k/7.86k [00:00<00:00, 23.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1cDHLwy0ZgNxWN5nvCvqWWPNyLwMP1A0d
To: /content/dict.txt
100% 1.17k/1.17k [00:00<00:00, 6.35MB/s]


In [None]:
!pip install openpyxl
!pip install jieba
!pip install opencc
!pip install keras_nlp



In [None]:
import os
import re
from typing import List
from pprint import pprint

import pandas as pd
import numpy as np
from opencc import OpenCC
import jieba


from sklearn.model_selection import train_test_split

In [None]:
def _to_text(value):
    """Return a cell value as text; missing cells become an empty string.

    A plain ``str(value)`` would turn an empty Excel cell (NaN) into the
    literal word "nan", which would then be segmented and tokenized as if
    it were real review text.
    """
    return "" if pd.isna(value) else str(value)


# Load the labelled training set and the unlabelled test set.
train_df = pd.read_excel("train_df.xlsx")
test_df = pd.read_excel("test_df.xlsx")

# Cells may have been parsed as numbers or left empty; normalise them to str.
train_df['text'] = train_df['text'].map(_to_text)
test_df['text'] = test_df['text'].map(_to_text)

# Keep only the columns used by the rest of the notebook.
train_df = train_df[['index', 'text', 'score']]
test_df = test_df[['index', 'text']]

In [None]:
# Hold out 20% of the labelled rows as a dev set, keeping the score
# distribution the same in both parts; the fixed seed makes the split repeatable.
_split_options = dict(test_size=0.2,
                      stratify=train_df['score'],
                      random_state=42)
train_df, dev_df = train_test_split(train_df, **_split_options)

In [None]:
# Normalise every split with OpenCC's 's2t' profile before segmentation.
cc = OpenCC('s2t')
for frame in (train_df, test_df, dev_df):
    frame['text'] = frame['text'].map(cc.convert)

# Load jieba's dictionary up front and extend it with the project word list.
jieba.initialize()
jieba.load_userdict('dict.txt')

# Segment each text into a list of tokens. The column keeps the name
# `ckip_ws` even though the segmentation is done with jieba.
for frame in (train_df, test_df, dev_df):
    frame['ckip_ws'] = frame['text'].map(jieba.lcut)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 1.212 seconds.
DEBUG:jieba:Loading model cost 1.212 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [None]:
# Collapse each token list into one space-delimited string, the form later
# passed to the Keras Tokenizer.
for frame in (train_df, test_df, dev_df):
    frame['ckip_ws'] = frame['ckip_ws'].map(" ".join)

In [None]:
# Give every split a clean 0..n-1 index. The train/dev frames carry shuffled
# row labels from the split; the test frame is reset as well so all three are
# consistent (the original reset dev_df twice and never touched test_df).
train_df.reset_index(drop=True, inplace=True)
dev_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [None]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Model inputs: the space-delimited token strings of each split.
train_X, dev_X, test_X = (frame['ckip_ws'] for frame in (train_df, dev_df, test_df))

# Map the raw scores to consecutive integer class ids. The encoder is fitted
# on the training labels only and reused for the dev split.
y_encoder = LabelEncoder()

train_y = y_encoder.fit_transform(train_df['score'])
train_y = train_y.reshape(-1, 1)

dev_y = y_encoder.transform(dev_df['score'])
dev_y = dev_y.reshape(-1, 1)

In [None]:
# Vocabulary cap handed to the tokenizer and the embedding layer.
MAX_WORDS = 20_000

# Longest training document, measured in space-separated pieces; used as the
# fixed sequence length for padding and for the model input.
_train_token_counts = train_df['ckip_ws'].str.split(" ").map(len)
MAX_LEN = _train_token_counts.max()

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(lower=True, num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_X)

In [None]:
# Convert each split's texts into lists of integer word ids using the
# vocabulary fitted on the training texts.
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_X, dev_X, test_X)
)

In [None]:
def _pad_to_max_len(id_sequences):
    """Pad/truncate integer sequences to MAX_LEN with pad_sequences' defaults."""
    return sequence.pad_sequences(id_sequences, maxlen=MAX_LEN)


# Fixed-width integer matrices, one row per document, for each split.
train_sequences_matrix = _pad_to_max_len(train_sequences)
dev_sequences_matrix = _pad_to_max_len(dev_sequences)
test_sequences_matrix = _pad_to_max_len(test_sequences)

In [None]:
from keras.layers import Bidirectional
from keras.layers import Flatten
from keras.layers import BatchNormalization

In [None]:
def LSTM_CLS_MODEL(max_len=None, max_words=None, num_classes=5):
    """Build the (uncompiled) stacked bidirectional-LSTM text classifier.

    Architecture: Embedding(128) -> BiLSTM(64) -> BiLSTM(32) -> BiLSTM(16),
    all returning full sequences -> Flatten -> Dense(1024, relu) ->
    Dropout(0.3) -> Dense(512, relu) -> Dropout(0.3) -> Dense(num_classes,
    softmax).

    Args:
        max_len: Length of the padded input sequences. Defaults to the
            notebook-level ``MAX_LEN``.
        max_words: Vocabulary size of the embedding. Defaults to the
            notebook-level ``MAX_WORDS``.
        num_classes: Number of output classes. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        A Keras ``Model`` mapping a ``(batch, max_len)`` matrix of word ids
        to a ``(batch, num_classes)`` matrix of class probabilities.
    """
    if max_len is None:
        max_len = MAX_LEN
    if max_words is None:
        max_words = MAX_WORDS
    # MAX_LEN comes from a pandas .max() and is a NumPy integer; shapes are
    # safest as plain Python ints.
    max_len = int(max_len)
    max_words = int(max_words)

    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(input_dim=max_words,
                      output_dim=128,
                      input_length=max_len)(inputs)

    # Three stacked bidirectional LSTMs of decreasing width. Every one keeps
    # the full sequence because the result is flattened below rather than
    # reduced to the last time step.
    for units in (64, 32, 16):
        layer = Bidirectional(LSTM(units, return_sequences=True))(layer)

    layer = Flatten()(layer)

    # Two fully connected blocks: linear projection, ReLU, then dropout.
    layer = Dense(1024,
                  name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.3)(layer)

    layer = Dense(512,
                  name='FC2')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.3)(layer)

    # Class-probability head.
    layer = Dense(num_classes,
                  name='output_layer')(layer)
    layer = Activation('softmax')(layer)

    model = Model(inputs=inputs,
                  outputs=layer)
    return model

In [None]:
# Instantiate the (still uncompiled) classifier defined by LSTM_CLS_MODEL.
model = LSTM_CLS_MODEL()

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
from keras.optimizers import AdamW

In [None]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see in this runtime.
_visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(_visible_gpus))

In [None]:
# import mlflow.tensorflow
# mlflow.tensorflow.autolog()

# mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# # Create a new MLflow Experiment
# mlflow.set_experiment("MLflow Quickstart")

In [None]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(optimizer=AdamW(0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Stop once the dev loss no longer improves by at least 1e-4 and roll back to
# the best weights seen. No `patience` is given, so the Keras default applies.
early_stopping = EarlyStopping(monitor='val_loss',
                               min_delta=0.0001,
                               restore_best_weights=True)
training_callbacks = [
    early_stopping,
    # mlflow.keras.MlflowCallback(run)
]

# train model
# run = mlflow.start_run()
model.fit(train_sequences_matrix,
          train_y,
          validation_data=(dev_sequences_matrix, dev_y),
          epochs=10,
          batch_size=64,
          callbacks=training_callbacks)
# mlflow.end_run()

In [None]:
# Score the trained model on the held-out dev split; the result is shown as
# the cell output.
model.evaluate(dev_sequences_matrix, dev_y)

In [None]:
# Class probabilities for every test document.
test_probabilities = model.predict(test_sequences_matrix)

# Take the most probable class per row and map its id back to the original
# score value with the fitted label encoder.
predicted_class_ids = test_probabilities.argmax(axis=1)
y_pred = y_encoder.inverse_transform(predicted_class_ids)

In [None]:
# Attach the predictions to a copy of the test frame, keep only the id and
# prediction columns, and write them out without the pandas row index.
submission_file = test_df.assign(pred=y_pred)[['index', 'pred']]
submission_file.to_csv('submission.csv', index=False)