# 第二層過濾
使用 Bidirectional LSTM 模型進行判斷

In [1]:
import pandas as pd
import numpy as np

In [2]:
# stop words
stopwords_file = open("../stop_words.txt", "r")
stopwords = stopwords_file.readlines()
stopwords = [item.strip("\n") for item in stopwords]
print("stop words OK.")

stop words OK.


In [3]:
# dictionary
import jieba

jieba.set_dictionary('../dict.txt.big')
jieba.load_userdict('../chem_dict.txt')
jieba.load_userdict('../crop_dict.txt')
jieba.load_userdict('../pest_dict.txt')

print("dictionary loaded OK.")

Building prefix dict from /Users/hsiaoping.zhang/Desktop/AIDEA/dict.txt.big ...
Loading model from cache /var/folders/rd/0nr5tzsn2z17vy9fjcj5rlsc0000gn/T/jieba.u5524b13f3f9f1a3fca714e7a1c7506b3.cache
Loading model cost 0.509 seconds.
Prefix dict has been built successfully.


dictionary loaded OK.


In [4]:
def main_crop(fileNum):
    rows = get_file_rows(fileNum)
    item = rows[0].strip("\n").split(",")[0]
    row2 = [] if rows[1] == "\n" else rows[1].strip("\n").split(",")

    return item

In [5]:
def get_file_rows(fileNum):
    file = open(f"../{currentFolder}/data/{fileNum}.csv", "r")
    rows = file.readlines()
    file.close()
    return rows

In [6]:
currentFolder = "train"

### 先讀入帶有標籤(label)的檔案當作 training data

In [7]:
def read_label_file(path):
    file = open(path, "r")
    result = file.readlines()[1:]
    result = [item.strip("\n") for item in result]
    return result

In [8]:
# train label: 從第一層過濾得來的組合結果
ans = read_label_file("../train/lstm-train-label.csv")
ans[:5], len(ans)

(['747,673,0', '381,747,0', '381,351,0', '381,673,0', '351,747,1'], 5918)

In [9]:
# test data: 過濾 label 為 related 和 unrelated 的比例為 1:1
test = read_label_file("../train/re-train.csv")
test_input = test.copy()
test[:5], len(test)

(['747, 673, 0', '381, 747, 0', '381, 351, 0', '381, 673, 0', '351, 747, 1'],
 5906)

In [10]:
# public label
file = open("../private/submission/1214_0443.csv", "r")
public = file.readlines()[1:]
public = [item.strip("\n") for item in public]
public[:5], len(public)

(['683, 641', '639, 657', '639, 708', '402, 683', '551, 948'], 1823)

### 正式讀入各文章的斷詞結果

In [11]:
from os import listdir
from os.path import isfile, isdir, join

In [12]:
# 讀檔
def get_article_segment(fileNum):
    file = open(f"../{currentFolder}/TF-IDF/{fileNum}.txt", "r")
    sentances = file.readlines()[0].strip("\n")
    
    return sentances

In [13]:
def load_to_df(label_list, is_label):
    sentance1, sentance2, label = [], [], []
    
    # 兩兩文章組合各自讀檔
    for itemString in label_list:
        items = itemString.split(",")
        item1, item2 = int(items[0]), int(items[1])
        sentance1.append(get_article_segment(item1))
        sentance2.append(get_article_segment(item2))
        if(is_label):
            label.append(int(items[2]))
    
    df = pd.DataFrame()
    df["sentance_1"] = sentance1
    df["sentance_2"] = sentance2
    
    return df, label

In [14]:
# 先把文章前 100 字 load 進 df 當中
print("...")
currentFolder = "train"
df, train_label = load_to_df(ans, True)
df["label"] = train_label
df.head()

...


Unnamed: 0,sentance_1,sentance_2,label
0,柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期...,柑橘類 陸續 開花期 蚜蟲 薊馬 潛葉蛾 確保 品質 柑橘類 水果 種類 文旦 白柚 椪柑 ...,0
1,柑橘類 花期 農民 小型 害蟲 臺南 改場 防檢局 田邊 好幫手 關心 柑橘類 水果 種類 ...,柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期...,0
2,柑橘類 花期 農民 小型 害蟲 臺南 改場 防檢局 田邊 好幫手 關心 柑橘類 水果 種類 ...,柑橘類 盛花期 臺南 蚜蟲 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 盛花期 蚜蟲 ...,0
3,柑橘類 花期 農民 小型 害蟲 臺南 改場 防檢局 田邊 好幫手 關心 柑橘類 水果 種類 ...,柑橘類 陸續 開花期 蚜蟲 薊馬 潛葉蛾 確保 品質 柑橘類 水果 種類 文旦 白柚 椪柑 ...,0
4,柑橘類 盛花期 臺南 蚜蟲 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 盛花期 蚜蟲 ...,柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期...,1


In [15]:
df.iloc[0]["sentance_1"]

'柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期 蚜蟲 潛葉蛾 薊馬 確保 柑橘 品質 南區 確保 品質 蚜蟲 柑橘 吸取 時會 新芽 生長 誘發 潛葉蛾 柑橘 生長 時期 幼蟲 孵化 潛入 嫩葉 組織 葉肉 被害 停止 生育 枝條 發育 柑橘類 薊馬 小黃薊馬 花薊馬 新芽 生長期 開花期 幼果 此類 害蟲 體型 細小 習性 新芽 生長期 受害 皺縮 生長 開花期 薊馬 群集 花器 取食 花器 受害 授粉 幼果 薊馬 刺吸式 口器 柑橘 幼果 表皮 細胞 吸取 汁液 果皮 粗糙 不規則 發現 受害 最佳 時機 薊馬 潛葉蛾 把握 新芽 開花 小果 最佳 時機 確保 柑橘 品質 薊馬 繁殖 速度 初期 事半功倍 小黃薊馬 丁基加保扶乳劑 柑橘 潛葉蛾 種類 芬諾克可濕性粉劑 陶斯松 濕性 粉劑 佈飛賽滅寧乳劑 蚜蟲 免扶克乳劑 大滅松乳劑 福賜米松溶液 參考 植物保護 手冊 用藥 本場 植保 研究室 人員 '

In [16]:
print("...")
df2, test_label = load_to_df(test, True)
df2["label"] = test_label
df2.head()

...


Unnamed: 0,sentance_1,sentance_2,label
0,柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期...,柑橘類 陸續 開花期 蚜蟲 薊馬 潛葉蛾 確保 品質 柑橘類 水果 種類 文旦 白柚 椪柑 ...,0
1,柑橘類 花期 農民 小型 害蟲 臺南 改場 防檢局 田邊 好幫手 關心 柑橘類 水果 種類 ...,柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期...,0
2,柑橘類 花期 農民 小型 害蟲 臺南 改場 防檢局 田邊 好幫手 關心 柑橘類 水果 種類 ...,柑橘類 盛花期 臺南 蚜蟲 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 盛花期 蚜蟲 ...,0
3,柑橘類 花期 農民 小型 害蟲 臺南 改場 防檢局 田邊 好幫手 關心 柑橘類 水果 種類 ...,柑橘類 陸續 開花期 蚜蟲 薊馬 潛葉蛾 確保 品質 柑橘類 水果 種類 文旦 白柚 椪柑 ...,0
4,柑橘類 盛花期 臺南 蚜蟲 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 盛花期 蚜蟲 ...,柑橘類 開花期 蚜蟲 潛葉蛾 薊馬 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 生長 開花期...,1


In [17]:
currentFolder = "private"

print("...")
df3, _ = load_to_df(public, False)
df3.head()

...


Unnamed: 0,sentance_1,sentance_2
0,柑桔窄胸天牛 羽化 柑桔窄胸天牛 羽化 工作 應於 展開 南區 農業 改良場 確實 降低 柑...,柑桔窄胸天牛 幼蟲 孵化 盛期 南區 農業 改良場 籲請 把握 黃金 柑桔窄胸天牛 幼蟲 孵...
1,柑桔 果實 幼果 台南 改場 薊馬 確保 果實 品質 柑桔 果實 幼果 台南 改場 薊馬 確...,柑橘類 開花 時期 薊馬 潛葉蛾 確保 品質 柑橘類 水果 種類 文旦 白柚 椪柑 柳丁 茂...
2,柑桔 果實 幼果 台南 改場 薊馬 確保 果實 品質 柑桔 果實 幼果 台南 改場 薊馬 確...,小黃薊馬 密度 台南 改場 確保 果實 品質 南區 水果 芒果 文旦 幼果 果實 小黃薊馬 ...
3,柑橘窄胸天牛 羽化 籲請 把握 關鍵 時機 臺南 改場 防檢局 田邊 好幫手 關心 柑橘窄胸...,柑桔窄胸天牛 羽化 柑桔窄胸天牛 羽化 工作 應於 展開 南區 農業 改良場 確實 降低 柑...
4,台東 水稻 徒長病 新聞稿 本田 拔除 水稻 徒長病 減少 本縣 池上 鄉鎮 水稻 徒長病 ...,水稻 秧苗 病蟲害 管理 新聞稿 北部 地區 一期稻作 育苗 本田 初期 氣候 多變 濕度 ...


### 使用 tensorflow
將輸入的字詞們轉成模型能夠接受的型態

In [18]:
MAX_NUM_WORDS = 15000
MAX_SEQUENCE_LENGTH = 100  # 一個標題最長有幾個詞彙

In [19]:
import tensorflow as tf
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

corpus_x1 = df["sentance_1"]
corpus_x2 = df["sentance_2"]
corpus = pd.concat([corpus_x1, corpus_x2])
print(corpus.shape)

tokenizer.fit_on_texts(corpus)
print("-")

(11836,)
-


In [20]:
q1_train = tokenizer.texts_to_sequences(corpus_x1)
q2_train = tokenizer.texts_to_sequences(corpus_x2)
print("-")

-


In [21]:
X_train_q1 = tokenizer.texts_to_sequences(df['sentance_1'].values.astype(str))
X_train_q1 = tf.keras.preprocessing.sequence.pad_sequences(X_train_q1, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

X_train_q2 = tokenizer.texts_to_sequences(df['sentance_2'].values.astype(str))
X_train_q2 = tf.keras.preprocessing.sequence.pad_sequences(X_train_q2, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

Y_train = df["label"]

print(len(df.iloc[0]['sentance_1'].split(" ")))
print(X_train_q1[0])
print("-")

127
[1029 1164   76  722 1521  665   76   50   36  241 1131  353  110  341
   90   47  168  810  189  957  232  186   69 1164 1263  268  129  725
   92 1307  881 1118 1164 1263   45  903   76  268  232  535  808  366
  808   45  850  129  232 1221  830  665  129  267  642  867  508  258
 1058  504   15   45 1272  106  232 1521  184 1164  367 1036 1272  106
   39  665   38  232  228  598    7 1192  186 1023  665 1521  191 1705
  515  289  167 1706  230 1618 1619 1762   16   11   13  159  171  305
  375  120]
-


In [22]:
X_test_q1 = tokenizer.texts_to_sequences(df2['sentance_1'].values.astype(str))
X_test_q1 = tf.keras.preprocessing.sequence.pad_sequences(X_test_q1, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

X_test_q2 = tokenizer.texts_to_sequences(df2['sentance_2'].values.astype(str))
X_test_q2 = tf.keras.preprocessing.sequence.pad_sequences(X_test_q2, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

y_test = df2["label"]
print("-")

-


In [23]:
X_public_q1 = tokenizer.texts_to_sequences(df3['sentance_1'].values.astype(str))
X_public_q1 = tf.keras.preprocessing.sequence.pad_sequences(X_public_q1, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

X_public_q2 = tokenizer.texts_to_sequences(df3['sentance_2'].values.astype(str))
X_public_q2 = tf.keras.preprocessing.sequence.pad_sequences(X_public_q2, maxlen = MAX_SEQUENCE_LENGTH, padding='post')

print("-")

-


In [24]:
X_train_q1[0]

array([1029, 1164,   76,  722, 1521,  665,   76,   50,   36,  241, 1131,
        353,  110,  341,   90,   47,  168,  810,  189,  957,  232,  186,
         69, 1164, 1263,  268,  129,  725,   92, 1307,  881, 1118, 1164,
       1263,   45,  903,   76,  268,  232,  535,  808,  366,  808,   45,
        850,  129,  232, 1221,  830,  665,  129,  267,  642,  867,  508,
        258, 1058,  504,   15,   45, 1272,  106,  232, 1521,  184, 1164,
        367, 1036, 1272,  106,   39,  665,   38,  232,  228,  598,    7,
       1192,  186, 1023,  665, 1521,  191, 1705,  515,  289,  167, 1706,
        230, 1618, 1619, 1762,   16,   11,   13,  159,  171,  305,  375,
        120], dtype=int32)

In [25]:
X_train_q1.shape

(5918, 100)

In [26]:
from sklearn.model_selection import train_test_split

VALIDATION_RATIO = 0.2
RANDOM_STATE = 9527

x_train_q1, x_val_q1, \
x_train_q2, x_val_q2, \
y_train, y_val = \
    train_test_split(
        X_train_q1, X_train_q2, Y_train, 
        test_size=VALIDATION_RATIO, 
        random_state=RANDOM_STATE
)
print("-")

-


In [27]:
# 分兩類

y_train = tf.keras.utils.to_categorical(y_train)
print(y_train[:5])

print("-")

y_val = tf.keras.utils.to_categorical(y_val)
print(y_val[:5])

[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]
-
[[0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [28]:
NUM_EMBEDDING_DIM = 256  # 一個詞向量的維度
NUM_LSTM_UNITS = 128     # LSTM 輸出的向量維度
NUM_CLASSES = 2          # related / unrelated

In [29]:
from gensim.models import word2vec

model = word2vec.Word2Vec.load("20180309-wiki-model/20180309wiki_model.bin")
print("-")

-


### embedding 層
使用 word2vec 進行嵌入

In [30]:
import numpy as np

embedding_matrix = np.zeros((len(list(model.wv.index_to_key)) + 1, model.vector_size))
word2idx = {}

vocab_list = [(word, model.wv[word]) for word in list(model.wv.key_to_index.keys())]
i = 0
for vocab in enumerate(vocab_list):
    word = vocab
    word, vec = vocab_list[i][0], vocab_list[i][1]

    embedding_matrix[i + 1] = vec
    word2idx[word] = i + 1
    i += 1
print("-")

-


In [31]:
embedding_matrix.shape

(771279, 250)

In [32]:
from tensorflow.keras import layers

# embedding_layer = layers.Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
embedding_layer = layers.Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)
print("-")

-


In [33]:
MAX_SEQUENCE_LENGTH = 100

In [34]:
# 建立孿生 LSTM 架構（Siamese LSTM）
from tensorflow.keras import Input
from tensorflow.keras.layers import Embedding, LSTM, concatenate, Dense, Bidirectional, GRU, Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.models import Model

# 分別定義 2 個文章 q1 & q2 為模型輸入兩個標題都是一個長度為 100 的數字序列
q1_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
q2_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')


# 詞嵌入層：經過詞嵌入層的轉換，兩個文章都變成一個詞向量的序列，而每個詞向量的維度為 256

# 原本 keras 預設可支援的
# embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)

q1_embedded = embedding_layer(q1_input)
q2_embedded = embedding_layer(q2_input)

# LSTM 層：兩個文章經過此層後為一個 128 維度向量
shared_lstm = Bidirectional(LSTM(NUM_LSTM_UNITS, dropout=0.1))

q1_output = shared_lstm(q1_embedded)
q2_output = shared_lstm(q2_embedded)


# 串接層將兩個文章的結果串接單一向量方便跟全連結層相連
merged = concatenate(
    [q1_output, q2_output], 
    axis=-1)

outer_dense = Dense(32, activation="relu")

# 全連接層搭配 Softmax Activation 可以回傳 2 個文章屬於各類別的可能機率
dense =  Dense(
    units=NUM_CLASSES, 
    activation='softmax')

# predictions = dense(merged)
predictions = dense(outer_dense(merged))

# 模型就是將數字序列的輸入，轉換成 2 個分類的機率的所有步驟 / 層的總和
model = Model(
    inputs=[q1_input, q2_input], 
    outputs=predictions)
print("-")

-


In [35]:
from tensorflow.keras.utils import plot_model
plot_model(
    model, 
    to_file='model.png', 
    show_shapes=True, 
    show_layer_names=False, 
    rankdir='LR')

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [36]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 250)     192819750   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 256)          388096      embedding[0][0]              

In [49]:
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [38]:
X_test_q1.shape

(5906, 100)

In [50]:
# 決定一次要放多少成對標題給模型訓練
BATCH_SIZE = 100  # 70 (for the best score)

# 決定模型要看整個訓練資料集幾遍
NUM_EPOCHS = 10  # 20 (for the best score)

# 實際訓練模型
history = model.fit(
    # 輸入是兩個長度為 100 的數字序列
    x=[x_train_q1, x_train_q2], 
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    # 每個 epoch 完後計算驗證資料集上的 Loss 以及準確度
    validation_data=(
        [x_val_q1, x_val_q2], 
        y_val
    ),
    # 每個 epoch 隨機調整訓練資料集裡頭的數據以讓訓練過程更穩定
    shuffle=True
)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/

### Prediction for test data

In [51]:
# 利用已訓練的模型做預測
test_predictions = model.predict(
    [X_test_q1, X_test_q2])
test_predictions[:5]

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


array([[9.8459727e-01, 1.5402757e-02],
       [9.9427468e-01, 5.7253367e-03],
       [9.9985445e-01, 1.4554997e-04],
       [9.7935426e-01, 2.0645689e-02],
       [2.6339078e-02, 9.7366095e-01]], dtype=float32)

### 查看結果

In [41]:
def get_crop_list_index(name):
    if(name == "水稻"):
        return 0
    elif("蕉" in name):
        return 1
    else:
        return 2

In [45]:
currentFolder = "train"

In [53]:
errors = [0 for i in range(3)]  # pred=1 ans=0
losses = [0 for i in range(3)]  # pred=0 ans=1

# prediction
pred_true = [0 for i in range(3)]
pred_false = [0 for i in range(3)]

# answer
ans_true = [0 for i in range(3)]
ans_false = [0 for i in range(3)]

# article number count
crops = [0 for i in range(3)]

true_11 = [0 for i in range(3)]
true_00 = [0 for i in range(3)]

no = 0
threshold = 0.7
count = 0


for i in range(len(test_predictions)):
    row = test_input[i].split(",")
    item1, item2 = int(row[0]), int(row[1])
    
    crop = main_crop(item1)
    index = get_crop_list_index(crop)
    
    # add to crop list
    crops[index] += 1
    
    pred = test_predictions[i]
    pred_label = 0 if(pred[0] > threshold) else 1
    ans_label = df2.iloc[i]["label"]
    
    if(pred_label == 0 and ans_label == 1):
        pred_false[index] += 1
        ans_true[index] += 1
        losses[index] += 1
        # print(f"unlike: [{item1}, {item2}]", round(pred[0], 2))
            
    elif(pred_label == 1 and ans_label == 1):
        pred_true[index] += 1
        ans_true[index] += 1
        true_11[index] += 1
        # print(f"true related: [{item1}, {item2}] {main_crop(item1)} -> {main_crop(item2)}")

    elif(pred_label == 1 and ans_label == 0):
        pred_true[index] += 1
        ans_false[index] += 1
        errors[index] += 1
        # print(f"like: [{item1}, {item2}] {round(pred[0], 2)}")
        
    else:
        pred_false[index] += 1
        ans_false[index] += 1
        true_00[index] += 1
        # print(f"true unrelated: [{item1}, {item2}] {main_crop(item1)} -> {main_crop(item2)}")


right, total_guess, total_related = 0, 0, 0
for i in range(3):
    right += true_11[i]
    total_guess += pred_true[i]
    total_related += ans_true[i]
    
print("guess / answer:", threshold)
print("%-10s | %-10s | %-10s | %-10s | %-10s" % ("crop", "00", "01(unlike)", "10(like)", "11"))
print("- - - - - - - - - - - - - - - - - - - - - - - - - - - -")
names = ["rice", "banana", "others"]
for i in range(3):
    print("%-10s | %-10d | %-10d | %-10d | %-10d" % (names[i], true_00[i], losses[i], errors[i], true_11[i]))
    
precision = right / total_guess
recall = right / total_related
base = 0.85  # 假設上一層的 recall 最多只能達到 0.85
print(f"precision: {round(precision, 2)} | total guess: {total_guess} | right: {right}")
print(f"recall: {round(recall, 2)} | total related: {total_related}")
recall = recall * base
print(f"socre: {2*(precision * recall) / (precision + recall)} | recall: {recall}")
print(f"total candidate: {len(test_input)}")

guess / answer: 0.7
crop       | 00         | 01(unlike) | 10(like)   | 11        
- - - - - - - - - - - - - - - - - - - - - - - - - - - -
rice       | 3176       | 109        | 305        | 569       
banana     | 509        | 33         | 177        | 395       
others     | 474        | 4          | 58         | 97        
precision: 0.66 | total guess: 1601 | right: 1061
recall: 0.88 | total related: 1207
socre: 0.7024164184045019 | recall: 0.7471830985915493
total candidate: 5906


### Prediction for public data

In [54]:
currentFolder = "private"

In [55]:
# 利用已訓練的模型做預測
predictions = model.predict(
    [X_public_q1, X_public_q2])
predictions[:5]

array([[0.58139694, 0.41860312],
       [0.993046  , 0.00695405],
       [0.81041807, 0.18958198],
       [0.9186404 , 0.08135963],
       [0.8374131 , 0.16258694]], dtype=float32)

### 結果寫入檔案

In [59]:
# write to file
file = open(f"../{currentFolder}/result.csv", "w")
file.write("Test,Reference\n")
test, ref = [], []
threshold = 0.7
pred_true, pred_false = 0, 0


for i in range(len(predictions)):
    pred = predictions[i]
    crop = main_crop(int(public[i].split(",")[0]))
    
    pred_label = 0 if(pred[0] > threshold) else 1
    row = public[i].split(",")

    item1, item2 = int(row[0]), int(row[1])
    
    if(pred_label == 1):
        file.write(f"{item1}, {item2}\n")
        test.append(item1)
        ref.append(item2)
        pred_true += 1
        
    else:
        pred_false += 1
file.close()
print(f"related: {pred_true} | unrelated: {pred_false} | ratio: {pred_true/len(test_predictions)}")
print(len(predictions))

related: 609 | unrelated: 1214 | ratio: 0.10311547578733492
1823


In [None]:
def get_file(num1, num2):
    rows1 = get_file_rows(num1)
    rows2 = get_file_rows(num2)
    return rows1, rows2

In [None]:
def display_file(num1, num2):
    directory = f"../{currentFolder}/data{currentFolder.capitalize()}Complete/"
    file_1 = directory + str(num1) + ".txt"
    file = open(directory + str(num1) + ".txt", "r")
    print(file.read())
    file.close()
    
    print("- - -")
    
    file = open(directory + str(num2) + ".txt", "r")
    print(file.read())
    file.close()

In [None]:
def showSegment(num1, num2):
    directory = f"../{currentFolder}/TF-IDF/"
    file = open(directory + str(num1) + ".txt", "r")
    print(file.read())
    file.close()
    
    print("- - -")
    
    file = open(directory + str(num2) + ".txt", "r")
    print(file.read())
    file.close()

In [None]:
def check(num1, num2):
    display_file(num1, num2)
    showSegment(num1, num2)
    print(get_file_rows(num1))
    print(get_file_rows(num2))
    
currentFolder = "train"
check(1236, 1194)