In [2]:
!gdown --fuzzy "https://docs.google.com/spreadsheets/d/1xi7zilOJ8qxZgm5vLVjheVKziOfwqiIw/edit?usp=drive_link&ouid=103889639646613430352&rtpof=true&sd=true" -O train_df.xlsx
!gdown --fuzzy "https://docs.google.com/spreadsheets/d/1ppJhP-wunCc69GTLA9Yc8jXPlHcjYo_l/edit?usp=drive_link&ouid=103889639646613430352&rtpof=true&sd=true" -O test_df.xlsx
!gdown --fuzzy "https://drive.google.com/file/d/1Aj4olw8KDH7NrKFLQZeSN_Hfxj8d4uuT/view?usp=drive_link" -O stopwords.txt
!gdown --fuzzy "https://drive.google.com/file/d/1cDHLwy0ZgNxWN5nvCvqWWPNyLwMP1A0d/view?usp=drive_link" -O dict.txt

Downloading...
From: https://drive.google.com/uc?id=1xi7zilOJ8qxZgm5vLVjheVKziOfwqiIw
To: /content/train_df.xlsx
100% 2.14M/2.14M [00:00<00:00, 163MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ppJhP-wunCc69GTLA9Yc8jXPlHcjYo_l
To: /content/test_df.xlsx
100% 889k/889k [00:00<00:00, 131MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Aj4olw8KDH7NrKFLQZeSN_Hfxj8d4uuT
To: /content/stopwords.txt
100% 7.86k/7.86k [00:00<00:00, 18.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1cDHLwy0ZgNxWN5nvCvqWWPNyLwMP1A0d
To: /content/dict.txt
100% 1.17k/1.17k [00:00<00:00, 3.75MB/s]


In [3]:
!pip install -U python-dotenv
!pip install -U colab-xterm
%load_ext colabxterm



In [4]:
!pip install mlflow



In [5]:
!pip install openpyxl
!pip install jieba
!pip install opencc



In [6]:
import os
import re
from typing import List
from pprint import pprint

import pandas as pd
import numpy as np
from opencc import OpenCC
import jieba


from sklearn.model_selection import train_test_split

In [7]:
# Read the labelled training sheet and the unlabelled test sheet, keep only the
# columns used later in the notebook, and pass every `text` cell through str()
# so the text-processing steps below always receive Python strings.
train_df = (
    pd.read_excel("train_df.xlsx")[['index', 'text', 'score']]
    .assign(text=lambda frame: frame['text'].map(str))
)
test_df = (
    pd.read_excel("test_df.xlsx")[['index', 'text']]
    .assign(text=lambda frame: frame['text'].map(str))
)

In [8]:
# Carve a 20% dev set out of the labelled data. The split is stratified on
# `score` and uses a fixed random_state so it is repeatable.
# NOTE: this rebinds `train_df`, so re-running the cell alone splits the
# already-reduced training frame again.
train_df, dev_df = train_test_split(
    train_df,
    random_state=42,
    stratify=train_df['score'],
    test_size=0.2,
)

In [9]:
# Run every text through OpenCC with the 's2t' configuration before
# segmentation, in the same frame order as before (train, test, dev).
cc = OpenCC('s2t')
for frame in (train_df, test_df, dev_df):
    frame['text'] = frame['text'].apply(cc.convert)

# Load jieba's dictionary eagerly, then layer the custom user dictionary on top.
jieba.initialize()
jieba.load_userdict('dict.txt')

# Word-segment each text into a list of tokens. The column keeps its historical
# name `ckip_ws` although the segmentation is done by jieba.
for frame in (train_df, test_df, dev_df):
    frame['ckip_ws'] = frame['text'].apply(jieba.lcut)

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
DEBUG:jieba:Loading model from cache /tmp/jieba.cache
Loading model cost 1.341 seconds.
DEBUG:jieba:Loading model cost 1.341 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [10]:
# Collapse each token list into one space-separated string, the input format
# the Keras Tokenizer is fitted on further down.
for frame in (train_df, test_df, dev_df):
    frame['ckip_ws'] = frame['ckip_ws'].apply(" ".join)

In [11]:
# Give every frame a clean 0..n-1 index. After the shuffled train/dev split the
# train and dev frames carry scattered original row labels.
# Fix: the original cell reset `dev_df` twice and never touched `test_df`
# (copy-paste slip); all three frames are now reset exactly once.
for frame in (train_df, dev_df, test_df):
    frame.reset_index(drop=True, inplace=True)

In [12]:
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Model inputs: the space-joined token strings of each split.
train_X, dev_X, test_X = (
    frame['ckip_ws'] for frame in (train_df, dev_df, test_df)
)

# Targets: scores mapped to consecutive integer class ids. The encoder is
# fitted on the training labels only and reused for dev; both targets are
# reshaped into a single column.
y_encoder = LabelEncoder()
train_y = y_encoder.fit_transform(train_df['score']).reshape(-1, 1)
dev_y = y_encoder.transform(dev_df['score']).reshape(-1, 1)

In [15]:
# Vocabulary cap handed to the tokenizer (also used as the embedding input_dim).
MAX_WORDS = 20_000

# Sequence length used for padding: the largest number of space-separated
# pieces found in any training document.
MAX_LEN = train_df['ckip_ws'].str.split(" ").str.len().max()

# Fit the word index on the training texts only.
tokenizer = Tokenizer(lower=True, num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_X)

In [16]:
# Turn each split's texts into lists of integer word ids using the tokenizer
# fitted on the training texts.
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_X, dev_X, test_X)
)

In [17]:
# Pad/truncate every id sequence to MAX_LEN so each split becomes a
# rectangular matrix the model can consume.
train_sequences_matrix, dev_sequences_matrix, test_sequences_matrix = (
    sequence.pad_sequences(ids, maxlen=MAX_LEN)
    for ids in (train_sequences, dev_sequences, test_sequences)
)

In [18]:
from keras.layers import Bidirectional
from keras.layers import Flatten

In [37]:
def LSTM_CLS_MODEL(num_classes=5, max_len=None, max_words=None):
    """Build the (uncompiled) stacked-LSTM text classifier.

    Architecture: token ids -> 32-d embedding -> three LSTM layers
    (64, 32 and 16 units, each returning the full sequence) -> Flatten ->
    Dense(1024) + ReLU -> Dropout(0.3) -> Dense(512) + ReLU ->
    Dense(num_classes) + softmax.

    Args:
        num_classes: Width of the softmax output layer. Defaults to 5, the
            value that used to be hard-coded; pass
            ``len(y_encoder.classes_)`` to follow the label encoder.
        max_len: Length of the padded input sequences. ``None`` (default)
            falls back to the notebook-level ``MAX_LEN``.
        max_words: Vocabulary size used as the embedding's ``input_dim``.
            ``None`` (default) falls back to the notebook-level ``MAX_WORDS``.

    Returns:
        A Keras ``Model`` taking ``(batch, max_len)`` integer ids and
        producing ``(batch, num_classes)`` softmax outputs. The caller is
        responsible for compiling it.
    """
    # Resolve the defaults at call time so the notebook globals are honoured.
    if max_len is None:
        max_len = MAX_LEN
    if max_words is None:
        max_words = MAX_WORDS

    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(input_dim=max_words,
                      output_dim=32,
                      input_length=max_len)(inputs)

    # Every LSTM returns the whole sequence: the next LSTM needs a sequence
    # as input, and the last one feeds Flatten, which concatenates all steps.
    for units in (64, 32, 16):
        layer = LSTM(units, return_sequences=True)(layer)
    layer = Flatten()(layer)

    layer = Dense(1024, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.3)(layer)
    layer = Dense(512, name='FC2')(layer)
    layer = Activation('relu')(layer)

    layer = Dense(num_classes, name='output_layer')(layer)
    layer = Activation('softmax')(layer)

    return Model(inputs=inputs, outputs=layer)

In [38]:
# Instantiate the classifier built by LSTM_CLS_MODEL (not compiled yet).
model = LSTM_CLS_MODEL()

In [39]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 308)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 308, 32)           640000    
                                                                 
 lstm_3 (LSTM)               (None, 308, 64)           24832     
                                                                 
 lstm_4 (LSTM)               (None, 308, 32)           12416     
                                                                 
 lstm_5 (LSTM)               (None, 308, 16)           3136      
                                                                 
 flatten_1 (Flatten)         (None, 4928)              0         
                                                                 
 FC1 (Dense)                 (None, 1024)              5047

In [40]:
from keras.optimizers import AdamW

In [41]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see in this runtime.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [42]:
# import mlflow.tensorflow
# mlflow.tensorflow.autolog()

# mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# # Create a new MLflow Experiment
# mlflow.set_experiment("MLflow Quickstart")

In [43]:
# The targets are integer class ids, hence the sparse cross-entropy loss.
model.compile(
    optimizer=AdamW(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for up to 10 epochs, validating on the dev split after each one.
# EarlyStopping monitors val_loss (an improvement must exceed min_delta to
# count) and is asked to restore the best weights when it stops training.
# The commented-out lines are the optional MLflow tracking hooks.
# run = mlflow.start_run()
model.fit(
    x=train_sequences_matrix,
    y=train_y,
    validation_data=(dev_sequences_matrix, dev_y),
    epochs=10,
    batch_size=128,
    callbacks=[
        EarlyStopping(
            monitor='val_loss',
            min_delta=0.0001,
            restore_best_weights=True,
        ),
        # mlflow.keras.MlflowCallback(run)
    ],
)
# mlflow.end_run()

Epoch 1/10
Epoch 2/10
Epoch 3/10


<keras.src.callbacks.History at 0x7c10a1dc0280>

In [44]:
# Score the trained model on the dev split; with the compile settings used in
# this notebook the result is [loss, accuracy].
model.evaluate(dev_sequences_matrix, dev_y)



[0.791237473487854, 0.7578851580619812]

In [45]:
# One row of softmax outputs per test document.
class_scores = model.predict(test_sequences_matrix)
# Take the highest-scoring class id per row and map it back to the original
# score label through the fitted encoder.
y_pred = y_encoder.inverse_transform(class_scores.argmax(axis=1))



In [46]:
# Build the two-column submission (row id + predicted score) without touching
# test_df, then write it out with no pandas index column.
submission_file = test_df[['index']].assign(pred=y_pred)
submission_file.to_csv('submission.csv', index=False)