# 資料前處理

將主辦方提供的資料進行處理(斷詞/分類)，以利後續處理分析

In [40]:
# Name of the dataset folder whose sub-folders the later cells read from and
# write to (used to build the ArticleSegment and data paths).
currentFolder = "train"

### 贅字錯誤排除

由於主辦方給的字典在第一次轉換後還是會產生 `\x7f` 字元，造成無法順利判讀，因此先針對 pest 和 chem 類別進行再處理。
並將同一種含意但不同詞的表示方法建立一個 dictionary 提供查找

In [2]:
def replace_abnormal_character(file, name_list, name_dict):
    """Load a keyword CSV into a canonical-name list and a synonym lookup.

    Every line of the CSV is treated as one group of comma-separated
    keywords that share a meaning. The trailing newline, leading/trailing
    commas and any DEL (0x7f) characters are removed before splitting.

    Args:
        file: Path of the CSV file; read as UTF-8, a leading BOM is dropped.
        name_list: List that receives the first keyword of each line, in
            file order. Mutated in place.
        name_dict: Dict that receives ``keyword -> index`` for every keyword
            on a line, where ``index`` is the position in ``name_list`` of
            that line's first keyword. Mutated in place; a keyword that
            appears on several lines keeps the index of the last one.

    Returns:
        None.
    """
    # The context manager releases the handle even if reading fails.
    # readlines() is kept on purpose: the original author noted that this is
    # the reading method that copes with the abnormal characters.
    with open(file, "r", encoding='utf-8-sig') as csv_file:
        rows = csv_file.readlines()

    for raw_row in rows:
        row = raw_row.strip("\n").strip(",").replace("\x7f", "").split(",")
        # Use the real position in name_list (not the file row number) so the
        # lookup stays valid even when name_list already holds entries.
        index = len(name_list)
        name_list.append(row[0])
        for item in row:
            name_dict[item] = index  # words with the same meaning share one index number

In [42]:
# Crop dictionary: crop_list receives the first name of every CSV row and
# crop_dict maps each name on a row to that row's index.
# NOTE(review): this path spells the folder "KeyWords" while the pest and chem
# cells use "Keywords"; on a case-sensitive file system only one spelling can
# exist -- confirm the real folder name.
crop_list, crop_dict = [], {}

replace_abnormal_character("train/KeyWords/02crop_list.csv", crop_list, crop_dict)
crop_list[:10]

['文旦柚', '水稻', '青蔥', '龍鬚菜', '韭菜', '甘藍', '西瓜', '芋頭', '香蕉', '甘藷']

In [9]:
## Disease and pest dictionary: pest_list receives the first name of every CSV
## row and pest_dict maps each name on a row to that row's index.

pest_list, pest_dict = [], {}

replace_abnormal_character("train/Keywords/02pest_list.csv", pest_list, pest_dict)
print(pest_list[:10])

['斜紋夜蛾', '甜菜夜蛾', '黑點病', '軟腐病', '疫病', '炭疽病', '潰瘍病', '白絹病', '黑腐病', '紫斑病']


In [61]:
# Spot check: show the row index stored in pest_dict for one pest name.
pest_dict["柑桔窄胸天牛"]

104

In [10]:
## Chemical dictionary: chem_list receives the first name of every CSV row and
## chem_dict maps each name on a row to that row's index.

chem_list, chem_dict = [], {}

replace_abnormal_character("train/Keywords/02chem_list.csv", chem_list, chem_dict)
print(chem_list[:10])

['貝芬硫醌', '腈硫醌', '鋅錳乃浦', '性費洛蒙', '蘇力菌', '比多農', '夏油', '亞磷酸', '乳化葵花油', '窄域油']


### 文章完全斷詞空格分開

In [41]:
import re
import jieba
import jieba.analyse
import jieba.posseg as pseg 

In [12]:
# Load the user-defined dictionaries so that segmentation can recognise the
# keywords and tag them with the newly added category parts of speech
# (e.g. crop, chem, pest...).

jieba.set_dictionary('dict.txt.big')
for userdict_path in ('chem_dict.txt', 'crop_dict.txt', 'pest_dict.txt'):
    jieba.load_userdict(userdict_path)

print("OK")

Building prefix dict from /Users/hsiaoping.zhang/Desktop/AIDEA/dict.txt.big ...
Loading model from cache /var/folders/rd/0nr5tzsn2z17vy9fjcj5rlsc0000gn/T/jieba.u5524b13f3f9f1a3fca714e7a1c7506b3.cache
Loading model cost 0.532 seconds.
Prefix dict has been built successfully.


OK


In [13]:
# Stop words: one entry per line of stop_words.txt.
# The file is opened with an explicit encoding (matching the other reads in
# this notebook) instead of the platform default, and the context manager
# closes the handle that was previously left open.

with open("stop_words.txt", "r", encoding="utf-8-sig") as stopwords_file:
    stopwords = [line.strip("\n") for line in stopwords_file]
print("stop words OK.")

stop words OK.


### 文章斷詞

將文章斷詞後以空格分隔字詞，並去除數字、`%`、`-` 與小寫英文字母(這裡視為沒有意義的詞，因為出現不多，且大多指示藥劑濃度)  
並在斷詞時因為有載入先前使用者定義字典幫助斷詞並分類詞性

In [25]:
from os import listdir
from os.path import isfile, isdir, join

In [39]:
def article_segment(full_file_path, fileNum):
    """Segment one article and save the kept words as a space-separated line.

    The article is read as UTF-8 (a BOM is dropped, undecodable bytes are
    ignored). DEL (0x7f) characters, digit runs, ``%``, ``-`` and lowercase
    ASCII letters are deleted, then the text is segmented with ``pseg.cut``.
    A word is kept when it is longer than one character, is not in the
    module-level ``stopwords`` and its part-of-speech flag is not in the
    exclusion set below. There is no allow-list of flags: every flag outside
    the exclusion set is kept.

    Args:
        full_file_path: Path of the article to read.
        fileNum: Identifier used as the output file name.

    Returns:
        None. The result is written to
        ``{currentFolder}/ArticleSegment/{fileNum}.txt``; every kept word is
        followed by one space and the line ends with a newline.
    """
    with open(full_file_path, "r", encoding='utf-8-sig', errors='ignore') as source:
        raw_text = source.read()

    # No alternative of the pattern can match a newline, so cleaning the whole
    # text at once gives the same result as cleaning it line by line.
    text = re.sub(r'[0-9]+|[%]|[-]|[a-z]', '', raw_text.replace("\x7f", ""))

    except_speech = {"m", "p", "d", "c", "eng"}
    # Build the set once per article so each membership test is O(1).
    stop_set = set(stopwords)
    result = "".join(
        word + " "
        for word, flag in pseg.cut(text)
        if len(word) > 1 and flag not in except_speech and word not in stop_set
    )

    # Saved under the ArticleSegment folder of the current dataset.
    # NOTE(review): the output uses the platform default encoding, the same one
    # display_file reads with; switch both together if UTF-8 is wanted.
    with open(f"{currentFolder}/ArticleSegment/{fileNum}.txt", "w") as target:
        target.write(f"{result}\n")

In [None]:
# Point the following cells at the "private" dataset folder.
currentFolder = "private"

In [40]:
# Segment every article found in the "Complete" folder of the current dataset.
mypath = f"{currentFolder}/data{currentFolder.capitalize()}Complete"
files = listdir(mypath)

count = 0
print("...")

for f in files:
    count += 1  # counts every directory entry, including skipped ones
    fullpath = join(mypath, f)
    fileNum = f.split(".")[0]
    # Dot-files such as .DS_Store give an empty fileNum; skip them instead of
    # segmenting them into a file named ".txt".
    if fileNum and isfile(fullpath):
        article_segment(fullpath, fileNum)
    if count % 100 == 0:
        print(count, end=" | ")

print("\nfinish")

100
200
300
400
500
finish


### 提取關鍵詞

只留下關鍵字作為第一層判斷依據，將 crop、pest、chem、city 四類關鍵字另存到 csv 檔案當中  
並在提取關鍵字的過程當中，將一些同義異字的詞轉換為同一個代表詞，以利於後續判斷

In [43]:
def extract_keywords_to_file(full_file_path, fileNum):
    """Extract the dictionary keywords of one article and save them as CSV.

    The article is read as UTF-8 (a BOM is dropped, undecodable bytes are
    ignored); DEL (0x7f) characters, digit runs, ``%`` and ``-`` are deleted
    and the text is segmented with ``pseg.cut``. Words flagged ``crop``,
    ``pest``, ``chem`` or ``city`` are collected without duplicates, in order
    of first appearance. Crop, pest and chem words are replaced by the first
    name of their synonym row (via the module-level ``*_dict`` / ``*_list``
    pairs); city words lose "市"/"縣" and have "台" rewritten to "臺".

    Args:
        full_file_path: Path of the article to read.
        fileNum: Identifier used as the output file name.

    Returns:
        None. Writes ``{currentFolder}/data/{fileNum}.csv`` with four lines
        (crop, pest, chem, city), each a comma-separated keyword list, and
        prints the collected keywords.

    Raises:
        KeyError: If a word flagged crop/pest/chem is missing from the
            matching lookup dict.
    """
    with open(full_file_path, "r", encoding='utf-8-sig', errors='ignore') as source:
        raw_text = source.read()

    # No alternative of the pattern can match a newline, so cleaning the whole
    # text at once gives the same result as cleaning it line by line.
    text = re.sub(r'[0-9]+|[%]|[-]', '', raw_text.replace("\x7f", ""))
    # The original line replaced one rendering of 寧 with a visually identical
    # one -- presumably a CJK compatibility ideograph with the unified U+5BE7.
    # Both compatibility code points are spelled out as escapes so the intent
    # survives copy/paste and Unicode normalisation by editors.
    text = text.replace("\uf95f", "\u5be7").replace("\uf9aa", "\u5be7")

    key_index_list = ["crop", "pest", "chem", "city"]
    keywords = [[] for _ in key_index_list]
    # flag -> (synonym lookup, canonical names); city has no synonym table.
    canonical_sources = {
        "crop": (crop_dict, crop_list),
        "pest": (pest_dict, pest_list),
        "chem": (chem_dict, chem_list),
    }

    # Segment with part-of-speech tags and keep only dictionary keywords.
    for word, flag in pseg.cut(text):
        if flag not in key_index_list:
            continue
        found = keywords[key_index_list.index(flag)]

        if flag == "city":
            word = word.replace("市", "").replace("縣", "").replace("台", "臺")

        if word in found:
            continue

        if flag in canonical_sources:
            lookup, names = canonical_sources[flag]
            word = names[lookup[word]]
            if word in found:  # the canonical name may already be recorded
                continue

        found.append(word)

    # Write one CSV line per category into the data folder.
    print(f"file: {fileNum}")
    with open(f"{currentFolder}/data/" + str(fileNum) + ".csv", "w") as target:
        print(keywords)
        for key_list in keywords:
            target.write(",".join(key_list).strip(",") + "\n")
    print("- - -")


print("updated")

updated


In [138]:
# Extract keywords from every article in the "Complete" folder of the current
# dataset.
mypath = f"{currentFolder}/data{currentFolder.capitalize()}Complete"
files = listdir(mypath)


for f in files:
    fullpath = join(mypath, f)
    fileNum = f.split(".")[0]
    # Dot-files such as .DS_Store give an empty fileNum (seen as a blank
    # "file:" entry in earlier runs); skip them instead of writing ".csv".
    if fileNum and isfile(fullpath):
        extract_keywords_to_file(fullpath, fileNum)

print("finish")

file: 1047
[['水稻', '雜草'], ['葉稻熱病', '水稻水象鼻蟲'], ['甲基多保淨', '嘉賜三賽唑', '護粒三賽唑', '保米熱斯', '三賽唑', '加普胺', '培丹', '免扶克', '加保扶', '丁基加保扶'], ['苗栗']]
- - -
file: 1090
[['水稻'], ['稻熱病', '細菌性條斑病', '葉稻熱病', '白葉枯病'], ['微生物製劑', '亞磷酸'], ['花蓮']]
- - -
file: 1279
[['香蕉', '雜草'], ['花薊馬', '水銹斑'], ['第滅寧', '陶斯松', '福瑞松'], []]
- - -
file: 1286
[['香蕉'], ['象鼻蟲', '假莖象鼻蟲', '球莖象鼻蟲'], ['凡殺護矽得', '加保扶'], []]
- - -
file: 15
[['水稻'], ['瘤野螟', '飛蝨類', '白背飛蝨', '褐飛蝨'], ['蘇力菌', '苦楝油', '菸草水'], ['花蓮']]
- - -
file: 855
[['芋頭', '雜草'], ['斜紋夜蛾'], ['美文松', '培丹', '阿巴汀', '蘇力菌', '性費洛蒙'], ['花蓮', '宜蘭']]
- - -
file: 100
[['水稻'], ['葉稻熱病', '稻熱病'], ['甲基多保淨', '芬諾尼', '嘉賜黴素', '克熱賜圃', '嘉賜三賽唑', '富米熱斯', '保米熱斯', '護粒三賽唑', '丙基喜樂松', '亞賜圃'], ['桃園']]
- - -
file: 882
[['鳳梨'], ['葉螨'], ['芬普寧', '克芬螨', '密滅汀'], ['花蓮']]
- - -
file: 316
[['番荔枝', '鳳梨釋迦', '雜草'], ['葉螨', '荔枝葉螨', '神澤氏葉螨', '二點葉螨'], ['芬普螨', '亞醌螨', '密滅汀', '芬普寧', '畢達本', '畢汰芬', '得芬瑞', '賜派芬', '賜滅芬'], ['臺東']]
- - -
file: 302
[['水稻'], ['稻熱病', '穗稻熱病'], ['護粒松', '喜樂克拉', '三賽唑'], ['臺東']]
- - -
file: 317
[['水稻', '雜草'], ['

file: 11
[['荔枝', '龍眼', '無患子科', '台灣欒樹'], ['荔枝椿象'], ['芬殺松', '加保利', '加保扶', '第滅寧'], ['宜蘭', '花蓮']]
- - -
file: 1241
[['蕉農'], ['花薊馬'], ['第滅寧', '陶斯松', '福瑞松'], []]
- - -
file: 931
[['水稻'], ['葉稻熱病', '稻熱病'], ['克熱賜圃', '加普胺', '嘉賜三賽唑', '富米熱斯', '保米熱斯', '護粒三賽唑'], ['桃園']]
- - -
file: 266
[['葡萄'], ['白粉病'], ['依瑞菲克利', '滅芬農', '邁克尼'], ['臺中', '彰化']]
- - -
file: 500
[['水稻'], ['葉稻熱病'], ['撲殺熱', '嘉賜三賽唑', '護粒三賽唑', '保米熱斯', '三賽唑', '加普胺'], ['苗栗']]
- - -
file: 1043
[['水稻'], ['葉稻熱病', '稻熱病', '二化螟'], ['撲殺熱', '三賽唑', '護粒松', '喜樂克拉', '亞賜圃', '加普胺', '嘉賜三賽唑', '培丹', '芬普尼', '撲滅松', '陶斯松'], ['臺南']]
- - -
file: 1057
[['檬果'], ['小黃薊馬', '薊馬'], ['丁基加保扶', '滅賜克', '賽洛寧', '賜諾特'], ['高雄', '屏東']]
- - -
file: 1030
[['落花生'], ['秋行軍蟲'], [], ['彰化']]
- - -
file: 1018
[['水稻'], ['稻熱病'], ['加普胺', '保米熱斯', '保米黴素', '嘉賜黴素', '熱必斯', '三賽唑'], ['花蓮']]
- - -
file: 567
[['水稻'], ['白葉枯病'], ['克枯爛', '撲殺熱'], []]
- - -
file: 942
[['水稻', '雜草'], ['瘤野螟'], ['布得芬諾', '矽護芬', '賽洛寧', '毆殺松'], ['臺北']]
- - -
file: 981
[['水稻'], ['白葉枯病', '紋枯病'], ['賓得克利', '達滅淨', '維利黴素', '鏈四環黴素', '撲殺

file: 827
[['水稻'], ['紋枯病', '白葉枯病'], ['待克利', '賓克隆', '滅普寧', '福多寧', '滅紋', '鐵甲砷酸銨', '維利黴素', '撲殺熱', '克枯爛', '鏈四環黴素'], ['花蓮']]
- - -
file: 1343
[['蕉農'], [], [], ['嘉義']]
- - -
file: 614
[['檬果', '荔枝', '龍眼', '番石榴', '楊桃', '果樹'], ['東方果實蠅', '果實蠅'], ['含毒甲基丁香油', '蛋白質水解物', '馬拉松', '芬殺松', '三氯松'], ['臺南']]
- - -
file: 1394
[['水稻'], ['稻熱病', '穗稻熱病'], [], ['雲林', '臺南']]
- - -
file: 98
[['水稻'], ['白葉枯病'], [], ['桃園']]
- - -
file: 166
[['番茄', '茄科', '馬鈴薯'], ['晚疫病', '早疫病'], ['亞磷酸', '氫氧化銅', '鹼性氯氧化銅', '嘉賜銅', '亞托敏', '凡殺克絕', '達滅芬'], ['高雄']]
- - -
file: 1169
[['蕉農', '豆科', '茄科', '雜草'], ['嵌紋病', '胡瓜嵌紋病毒', '蚜蟲'], ['益達胺'], []]
- - -
file: 364
[['水稻'], ['稻熱病', '葉稻熱病'], ['撲殺熱', '三賽唑', '亞賜圃', '喜樂克拉', '加普胺', '護粒松'], ['臺南']]
- - -
file: 402
[['文旦', '白柚', '芸香科', '果樹'], ['柑橘窄胸天牛'], [], ['臺南']]
- - -
file: 1183
[['芒果', '雜草'], ['小黃薊馬', '薊馬'], ['丁基加保扶', '芬殺松', '滅大松', '納乃得', '加保扶'], ['臺南']]
- - -
file: 1154
[['果樹', '荔枝', '檸檬', '番石榴', '木瓜', '紅龍果', '柑桔'], ['露疫病', '酸腐病', '潰瘍病', '炭疽病', '黑星病', '瘡痂病', '疫病', '根腐病', '褐腐病', '黑腐病'], ['中性化亞磷酸', '

file: 1189
[['蕉農'], ['黑星病'], ['普克利', '鋅錳乃浦', '礦物油'], []]
- - -
file: 384
[['木瓜', '柑桔', '蔬果類', '水稻'], ['果疫病', '褐腐病', '疫病', '炭疽病', '蒂腐病', '果腐病', '細菌性斑點病', '軟腐病', '果斑病', '潰瘍病', '白葉枯病'], [], ['臺南']]
- - -
file: 637
[['水稻'], ['穗稻熱病', '紋枯病', '稻熱病'], ['三賽唑', '保米熱斯', '嘉賜三賽唑', '加普胺', '克熱淨', '保米賜圃', '賓克隆', '福多寧', '菲克利', '維利熱必斯'], []]
- - -
file: 623
[['洋香瓜'], ['白粉病'], ['克熱淨', '快諾芬', '賽普洛', '平克座', '賽普待克利', '四克利', '克收欣', '得克利', '達克利', '普得松', '依滅列', '芬瑞莫'], ['臺南']]
- - -
file: 1374
[['蓮霧'], ['炭疽病', '果腐病', '疫病'], ['亞托敏', '克熱淨', '白列克敏', '三氟敏', '賽座滅', '凡殺克絕'], ['高雄', '屏東']]
- - -
file: 2
[['雜草'], ['斜紋夜蛾', '甜菜夜蛾'], ['性費洛蒙'], ['宜蘭']]
- - -
file: 804
[['水稻', '雜草'], ['螟蟲', '二化螟', '三化螟', '水稻水象鼻蟲', '象鼻蟲', '螟蛾'], ['培丹', '芬普尼', '撲滅松', '賽達松', '加保扶', '芬殺松', '益滅松', '免扶克', '丁基加保扶'], ['苗栗']]
- - -
file: 1348
[['蕉株', '雜草'], ['花薊馬'], ['第滅寧', '陶斯松'], []]
- - -
file: 743
[['馬鈴薯', '番茄'], ['晚疫病'], ['滅達樂'], []]
- - -
file: 757
[['水稻'], ['葉稻熱病', '稻熱病'], ['撲殺熱', '三賽唑', '護粒松', '喜樂克拉', '亞賜圃', '加普胺', '嘉賜三賽唑'], []]
- - -
file:

file: 1138
[['水稻'], ['穗稻熱病', '稻熱病'], ['護粒三賽唑', '亞賜圃', '嘉賜黴素'], ['高雄']]
- - -
file: 1104
[['水稻'], ['白葉枯病'], ['撲殺熱', '鏈四環黴素', '克枯爛'], ['彰化', '南投']]
- - -
file: 1110
[['水稻'], ['穗稻熱病', '葉稻熱病', '稻熱病', '紋枯病'], ['三賽唑', '護粒松', '加普胺', '嘉賜三賽唑', '克熱賜圃', '富米熱斯', '亞賜圃', '賓克隆', '福多寧', '維利黴素', '菲克利'], ['彰化', '南投']]
- - -
file: 36
[['水稻'], ['葉稻熱病'], ['甲基多保淨', '嘉賜三賽唑', '護粒三賽唑', '保米熱斯', '三賽唑', '加普胺', '撲殺培丹'], ['苗栗']]
- - -
file: 22
[['果樹'], ['東方果實蠅'], ['含毒甲基丁香油', '賜諾殺', '酵母球'], ['花蓮']]
- - -
file: 686
[['果樹', '芒果', '荔枝', '龍眼', '番石榴', '楊桃'], ['東方果實蠅', '果實蠅'], ['含毒甲基丁香油', '蛋白質水解物', '馬拉松', '芬殺松', '賜諾殺'], []]
- - -
file: 1312
[['蕉農'], ['黑星病', '葉斑病'], ['鋅錳乃浦', '普克利', '凡殺護矽得', '亞托敏'], []]
- - -
file: 719
[['葉菜類', '茼蒿', '落花生', '田菁', '毛豆', '豌豆', '大蒜', '青蔥', '甘藷', '玉米', '蓮花', '花卉'], ['斜紋夜蛾', '斑螟蛾'], ['性費洛蒙', '蘇力菌'], []]
- - -
file: 902
[['蓮花', '田菁', '雜草', '綠肥作物'], ['斜紋夜蛾', '小黃薊馬'], ['性費洛蒙', '芬普尼', '亞滅培', '益達胺', '蘇力菌'], ['臺北']]
- - -
file: 1272
[['香蕉', '雜草'], ['水銹斑', '花薊馬'], ['第滅寧', '陶斯松', '福瑞松'], []]
- - -
file:

In [141]:
# Debug cell: inspect how one training article (no. 289) is segmented/tagged.
with open("train/dataTrainComplete/" + str(289) + ".txt", "r", encoding='utf-8-sig', errors='ignore') as file:
    sentances = file.readlines()

text = "".join(
    re.sub(r'[0-9]+|[%]|[-]', '', item.replace("\x7f", "")) for item in sentances
)

jieba.analyse.set_stop_words('stop_words.txt')
# tags is not used further in this cell; kept so it can be inspected manually.
tags = jieba.analyse.extract_tags(text, topK=20, withWeight=True)

# Segment once and reuse the (word, flag) pairs for both reports below.
tagged_words = [(word, flag) for word, flag in pseg.cut(text)]

key_index_list = ["crop", "pest", "chem", "city"]
keywords = [[] for _ in key_index_list]

# Collect the dictionary keywords per category, without duplicates.
for word, flag in tagged_words:
    if flag in key_index_list:
        index = key_index_list.index(flag)
        if word not in keywords[index]:
            keywords[index].append(word)
print(keywords)
print(text)

# List every remaining word with its flag, skipping stop words and the
# excluded flags.
for word, flag in tagged_words:
    if word not in stopwords and flag not in ("m", "p", "d", "c", "eng"):
        print(word, flag)

[['水稻'], ['穗稻熱病', '稻熱病', '吊穗', '吊狗'], [], ['臺東']]
連日降雨且溫差大濕度高，請農友加強防範穗稻熱病，避免產量損失。
行政院農業委員會臺東區農業改良場（以下簡稱臺東場）表示，目前轄區水稻生長情形多已達孕穗期至抽穗期，惟近日受梅雨鋒面影響，連日有雨，氣溫落差大，為「穗稻熱病」好發時期。本病害對水稻產量影響甚劇，臺東場籲請農友務必做好防治工作，避免病害發生及蔓延。「穗稻熱病」主要發生於穗頸、枝梗、穀粒及護穎等部位，在水稻抽穗前，稻熱病孢子隨著雨水或露水飛落在劍葉葉舌及葉節上，抽穗時經過該部位即被孢子感染。罹病初期穗頸及枝梗上之患部呈暗褐色，病原菌侵入組織壞死以致養分不能向上輸送，影響整穗發育，並自罹病部位彎曲，俗稱「吊穗」或「吊狗」，若穀粒受感染，病斑則呈暗灰色，嚴重時穀粒無法充實或不稔，俗稱「空包彈」。若發生於穗頸未及時做好防治工作，對稻米產量影響最為嚴重，農友應特別注意防範。
本病害防治時機，應把握抽穗前及齊穗期進行穗稻熱病預防性施藥，建議於抽穗前~天完成施藥，若天氣不穩定或病害持續發生，於齊穗期再進行第二次施藥，以確保防治成效，施藥時應確實遵守植保手冊相關規定，尤其生育後期更應注意安全採收期等注意事項，藥劑種類請參考「農藥資訊服務網」（http://pesticide.baphiq.gov.tw）或「植物保護手冊」（http://www.tactri.gov.tw），或洽詢臺東場植物保護研究室（電話）。臺東場表示，去年因水稻抽穗後遇連日有雨，部分農友未及時做好預防工作，造成損失；今年抽穗後又遇梅雨，為避免疫情發生，務必請農友做好預防工作。另，農委會動植物防疫檢疫局亦於植物疫情管理資訊網(http://phis.baphiq.gov.tw/WebEvery.nsf/index)設置「稻熱病疫情現況」，提供民眾上網參考。

降雨 event
溫差 x
大濕度 n
高 a
請 zg
農友 x
防範 x
穗稻熱病 pest
產量 x
損失 x

 x
行政院 n
農業 x
委員會 x
臺東區 x
農業 x
改良場 x
簡稱 x
臺東 city
場 zg
轄區 x
水稻 crop
生長 x
情形 n
達 v
孕穗期 t
抽穗期 event
惟 b
近日 t
受 v
梅雨 nr
鋒面 event
影響 x
雨 n
氣溫 x
落差 

### 檢視結果
對每個檔案查看斷詞結果的前五個字詞

In [22]:
# Point the following cells back at the "train" dataset folder.
currentFolder = "train"

In [None]:
def display_file(path):
    """Print the first five space-separated tokens of a file's first line.

    Only the first line is read. An empty file prints ``['']`` instead of
    raising ``IndexError``.

    Args:
        path: Path of the text file to preview (opened with the platform
            default encoding).

    Returns:
        None.
    """
    with open(path, "r") as segmented:
        first_line = segmented.readline()
    print(first_line.split(" ")[:5])

In [37]:
# Preview the first five words of every segmented file.
# The directory that is listed is now `path` (the ArticleSegment folder); the
# previous version assigned `path` but listed the stale `mypath` left over
# from an earlier cell, i.e. the raw article folder.
path = f"{currentFolder}/ArticleSegment"
files = listdir(path)

for f in files:
    fullpath = join(path, f)
    if isfile(fullpath):
        fileNum = f.split(".")[0]
        if not fileNum:  # dot-files / ".txt" have no file number
            continue
        print(fileNum, end=": ")
        display_file(fullpath)

289: ['降雨', '溫差', '大濕度', '農友', '防範']
504: ['危害', '多種', '作物', '木瓜', '秀粉介殼蟲']
262: ['關心', '肺炎', '疫情', '留心', '西瓜']
276: ['臺東', '地區', '水稻', '瘤野螟', '縱捲葉蟲']
510: ['苗栗', '水稻', '發白', '葉枯病', '籲請']
921: ['發佈', '水稻', '葉稻熱病', '疫情', '警報']
1292: ['冬末', '初春', '種植', '香蕉', '組培']
29: ['發布', '水稻', '葉稻熱病', '防治', '警報']
1331: ['水稻', '飛蝨類', '害蟲', '密度', '攀升']
1319: ['蕉農', '務必', '採行', '正確', '香蕉葉']
841: ['寒流', '到來', '各類', '作物', '作好']
672: ['紫斑病', '薊馬', '危害', '日益嚴重', '台南']
1127: ['中部', '地區', '一期稻作', '葉稻熱病', '發生']
1133: ['氣候', '多變', '溫差', '籲請', '農友']
470: ['發佈', '穗稻熱病', '白葉枯病', '警報', '警報']
458: ['水稻', '稻熱病', '發生', '花蓮', '農業']
303: ['巨爵', '颱風', '影響', '近日', '風雨']
465: ['梨木蝨', '危害', '籲請', '防治', '苗栗縣']
471: ['氣候', '高溫', '乾燥', '苗栗', '改場']
1132: ['天氣', '變化', '花蓮', '籲請', '農友']
1126: ['氣候', '多變', '溫差', '臺南', '農業']
129: ['豪雨', '水稻', '白葉枯病', '發生', '農友']
897: ['近期', '果實蠅', '密度', '農友', '果實蠅']
667: ['檬果', '栽培', '薊馬', '密度', '農友']
673: ['柑橘類', '陸續', '開花期', '農友', '蚜蟲']
698: ['田園', '管理', '起工', '柑橘類', '採收']
14: ['花蓮', '地區', '發布', 