# Mount Google Drive files

In [1]:
# Mount Google Drive at /content/gdrive so the dataset archives stored there
# can be copied into the local working directory by a later cell.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Data

In [2]:
# Make sure the `unzip` command-line tool used by a later cell is installed.
!apt-get install unzip

Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-21ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 37 not upgraded.


In [3]:
import os
from shutil import copyfile

# Local working directory and the Google Drive folder holding the raw inputs.
path = "./data/"
gpath = "/content/gdrive/MyDrive/Colab Notebooks/農業文字/data/"

# Create the local directory idempotently: unlike `!mkdir`, this does not
# complain when the cell is re-run and does not depend on a shell.
os.makedirs(path, exist_ok=True)

# Copy every required input from Drive into the local working directory.
for filename in (
    "Train.zip",
    "Keywords.zip",
    "TrainLabel.csv",
    "submission_example.csv",
    "dataPrivateComplete.zip",
):
    copyfile(os.path.join(gpath, filename), os.path.join(path, filename))

'./data/dataPrivateComplete.zip'

In [4]:
# Remove any previous extraction first so that re-running this cell starts
# clean, then unpack the keyword spreadsheets (the archive creates ./Keywords/).
!rm -rf ./Keywords
!unzip ./data/Keywords.zip

Archive:  ./data/Keywords.zip
   creating: Keywords/
  inflating: Keywords/02chem.list.xlsx  
  inflating: Keywords/02crop.list.xlsx  
  inflating: Keywords/02local.xlsx   
  inflating: Keywords/02pest.list.xlsx  
  inflating: Keywords/local.xlsx     
  inflating: Keywords/season.xlsx    


In [5]:
# Sanity check: list the copied inputs and the extracted keyword spreadsheets.
!ls -R ./data
!ls -R ./Keywords

./data:
dataPrivateComplete.zip  submission_example.csv  Train.zip
Keywords.zip		 TrainLabel.csv
./Keywords:
02chem.list.xlsx  02local.xlsx	    local.xlsx
02crop.list.xlsx  02pest.list.xlsx  season.xlsx


# Read csv/xlsx files

In [6]:
import pandas as pd

# Training labels: per the displayed frame, each row pairs a `Test` article id
# with a `Reference` article id.
df_label = pd.read_csv('./data/TrainLabel.csv')
df_label.head()

Unnamed: 0,Test,Reference
0,3,415
1,3,649
2,9,5
3,25,32
4,25,41


In [7]:
# Chemical keyword table. header=None: the sheet has no header row, so the
# first row is data and columns are numbered 0..n. Each row appears to hold one
# chemical followed by alternative names; shorter rows are padded with blanks.
df_key_chem = pd.read_excel('./Keywords/02chem.list.xlsx', engine='openpyxl',header=None)
df_key_chem

Unnamed: 0,0,1,2,3,4,5,6
0,貝芬硫醌,貝芬硫可濕性粉劑,貝芬硫琨,,,,
1,腈硫醌,硫水懸劑,硫水分散性粒劑,硫可濕性粉劑,腈硫醌水懸劑,睛硫琨水懸劑,腈硫醌可濕性粉劑
2,鋅錳乃浦,鋅錳乃浦水懸劑,鋅錳乃浦可濕性粉劑,鋅錳乃普水懸劑,,,
3,性費洛蒙,費洛蒙,性費洛,費洛蒙緩釋劑,,,
4,蘇力菌,蘇力菌水分散性粒劑,蘇力菌可濕性粉劑,生物製劑蘇力菌,蘇力菌製劑,,
...,...,...,...,...,...,...,...
364,嘉賜圃,嘉賜圃可濕性粉劑,,,,,
365,百快隆,百快隆粒劑,,,,,
366,嘉賜米松,嘉賜米松可濕性粉劑,,,,,
367,撲殺賜圃,撲殺賜圃粒劑,,,,,


In [8]:
# Crop keyword table, read without a header row; each row appears to hold one
# crop followed by alternative names (e.g. 文旦柚 / 文旦 / 麻豆文旦).
df_key_crop = pd.read_excel('./Keywords/02crop.list.xlsx', engine='openpyxl',header=None)
df_key_crop

Unnamed: 0,0,1,2,3
0,文旦柚,文旦,麻豆文旦,
1,水稻,,,
2,青蔥,蔥,,
3,龍鬚菜,,,
4,韭菜,,,
...,...,...,...,...
183,果菜類,,,
184,無患子科,,,
185,洛神葵,,,
186,冬瓜,,,


In [9]:
# Pest/disease keyword table, read without a header row; each row appears to
# hold one pest or disease followed by alternative names.
df_key_pest = pd.read_excel('./Keywords/02pest.list.xlsx', engine='openpyxl',header=None)
df_key_pest

Unnamed: 0,0,1,2,3,4,5
0,斜紋夜蛾,斜紋夜盜蟲,黑蟲,行軍蟲,黑肚蟲,夜盜蟲
1,甜菜夜蛾,蔥仔管蟲,管仔蟲,,,
2,黑點病,,,,,
3,軟腐病,,,,,
4,疫病,,,,,
...,...,...,...,...,...,...
202,姬黃薊馬,,,,,
203,猝倒病,,,,,
204,路易氏始葉螨,,,,,
205,大螟,,,,,


In [10]:
# Season keyword table, read without a header row; per the displayed frame it
# has four rows (夏/秋/春/冬), each followed by related terms such as months.
df_key_season = pd.read_excel('./Keywords/season.xlsx', engine='openpyxl',header=None)
df_key_season

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,夏,乾旱,7月,8月,9月,炎熱,二期,乾燥,高溫
1,秋,10月,11月,12月,,,,,
2,春,梅雨,4月,5月,6月,,,,
3,冬,1月,2月,3月,寒冷,,,,


In [11]:
# Location keyword table, read without a header row; each row appears to hold
# one place name plus a variant spelling where one exists (e.g. 台北 / 臺北).
df_key_local = pd.read_excel('./Keywords/local.xlsx', engine='openpyxl',header=None)
df_key_local

Unnamed: 0,0,1
0,台北,臺北
1,新北,
2,桃園,
3,台中,臺中
4,台南,臺南
5,高雄,
6,新竹,
7,苗栗,
8,彰化,
9,南投,


In [12]:
# Second location keyword table, read without a header row; the displayed rows
# contain names of local governments / agricultural stations and their aliases.
df_key_02local = pd.read_excel('./Keywords/02local.xlsx', engine='openpyxl',header=None)
df_key_02local

Unnamed: 0,0,1,2
0,臺北市政府,,
1,新北市政府,,
2,桃園區農,桃園農改場,桃園場
3,臺中區農,臺中農改場,臺中場
4,臺南區農,臺南農改場,臺南場
5,高雄區農,高雄農改場,高雄場
6,新竹縣政府,,
7,苗栗區農,苗栗農改場,苗栗場
8,彰化縣政府,,
9,南投縣政府,,


# Extract data from ZIP file

In [13]:
import pandas as pd
import numpy as np
import zipfile
import os
import re

#Extract data from ZIP file
def ZipFileProcessing (ZipfilePath):
    """Read every ``.txt`` member of a ZIP archive into a DataFrame.

    Args:
        ZipfilePath: Path (or file-like object) of the ZIP archive.

    Returns:
        pandas.DataFrame with two columns:
          * ``Id``       - integer formed from the digits of the member's file
                           name (directory components are ignored).
          * ``Sentence`` - the member's content decoded as UTF-8.
        Rows follow the archive's member order. If ``ZipfilePath`` is not a
        valid ZIP file the returned frame is empty.

    Raises:
        ValueError: if a ``.txt`` member's file name contains no digit.
        UnicodeDecodeError: if a member is not valid UTF-8.
    """
    ids = []
    txtlist = []

    if zipfile.is_zipfile(ZipfilePath):
        # The context manager closes the archive even if a read/decode fails.
        with zipfile.ZipFile(ZipfilePath, 'r') as archive:
            for member in archive.namelist():
                # Only real .txt files; a substring test would also match
                # names such as "a.txt.bak" or a directory called "x.txt/".
                if not member.endswith('.txt'):
                    continue
                # Derive the id from the file name only, so that digits in a
                # parent directory name cannot leak into the id.
                basename = member.rsplit('/', 1)[-1]
                ids.append(int(re.sub("[^0-9]", "", basename)))
                txtlist.append(archive.read(member).decode())

    return pd.DataFrame({'Id': ids, 'Sentence': txtlist})

In [14]:
# Load the training articles, order them by Id and renumber the index from 0.
df = (
    ZipFileProcessing('./data/Train.zip')
    .sort_values(by='Id')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Id,Sentence
0,1,梅雨季來臨，文旦黑點病易發生，請注意病徵，以及早加強防治措施。\n5月已進入梅雨季節，近日連...
1,3,夏季為斜紋夜盜及甜菜夜蛾發生盛期，請加強注意，及時防治避免損失。\n夏日乾燥炎熱的氣候，正是...
2,5,花蓮地區水稻褐飛蝨、白背飛蝨及斑飛蝨已陸續發生，請立即加強防治措施，避免損失。花蓮農改場、防...
3,6,做好梨木蝨及黑星病防治，維護食安少用藥。花蓮農改場、防檢局及田邊好幫手關心您。\n梨木蝨是梨...
4,9,請加強水稻飛蝨類害蟲防治工作。花蓮區農改場、防檢局及田邊好幫手關心您。\n近日於花蓮縣富里鄉...


In [15]:
# Load the private test articles, order them by Id and renumber the index from 0.
test_df = (
    ZipFileProcessing('./data/dataPrivateComplete.zip')
    .sort_values(by='Id')
    .reset_index(drop=True)
)
test_df.head()

Unnamed: 0,Id,Sentence
0,2,田間斜紋夜蛾及甜菜夜蛾密度有增加趨勢且幼蟲危害蔓延迅速，請加強防治措施。\n宜蘭縣各鄉鎮蔥田...
1,4,水稻瘤野螟已於田間陸續發生，請農友加強防治措施，避免產量受損。花蓮農改場、防檢局及田邊好幫手...
2,8,天乾物燥小心夜盜，夜蛾疫情拉警報，籲請農友注意防範。花蓮區農改場、防檢局及田邊好幫手關心您。...
3,11,荔枝椿象入侵宜蘭，請民眾注意。花蓮區農改場、防檢局及田邊好幫手關心您。\n近日於宜蘭縣發現一...
4,15,瘤野螟與飛蝨現蹤，水稻二期作慎防蟲蟲危機。花蓮區農改場、防檢局及田邊好幫手關心您。\n花蓮縣...


# Rule Based Method-1

In [16]:
def get_list(sentence , keyword_db):
    """Flag which rows of a keyword table are mentioned in ``sentence``.

    Every row of ``keyword_db`` is treated as one keyword group: the row is
    considered present if any of its cells occurs as a substring of
    ``sentence``.

    Args:
        sentence: Text to search.
        keyword_db: pandas DataFrame with one keyword group per row. Cells
            that are not non-empty strings (NaN padding, ``None``, numbers)
            are ignored instead of being passed to the substring search,
            where they would raise ``TypeError``.

    Returns:
        list[int] with one entry per row of ``keyword_db``: 1 if the row is
        present in ``sentence``, otherwise 0.
    """
    keyword_flags = []
    for row in keyword_db.itertuples(index=False, name=None):
        # isinstance() is used rather than an identity test against np.nan:
        # a NaN is not guaranteed to be the np.nan singleton.
        found = any(
            isinstance(cell, str) and cell and cell in sentence
            for cell in row
        )
        keyword_flags.append(1 if found else 0)
    return keyword_flags

In [17]:
# For every test article, compute one 0/1 flag list per keyword table
# (one flag per row of the table).
test_sentences = test_df['Sentence']#test_df
key_pest_num_list = [get_list(text, df_key_pest) for text in test_sentences]
key_crop_num_list = [get_list(text, df_key_crop) for text in test_sentences]
key_chem_num_list = [get_list(text, df_key_chem) for text in test_sentences]
key_season_num_list = [get_list(text, df_key_season) for text in test_sentences]
key_local_num_list = [get_list(text, df_key_local) for text in test_sentences]
key_02local_num_list = [get_list(text, df_key_02local) for text in test_sentences]

In [18]:
# One row per test article: its Id plus the flag list for each keyword table.
data = {
    'ID': test_df['Id'],
    'key_pest_num': key_pest_num_list,
    'key_crop_num': key_crop_num_list,
    'key_chem_num': key_chem_num_list,
    'key_season_num': key_season_num_list,
    'key_local_num': key_local_num_list,
    'key_02local_num': key_02local_num_list,
}
df_keynum = pd.DataFrame(data)
df_keynum.head()

Unnamed: 0,ID,key_pest_num,key_crop_num,key_chem_num,key_season_num,key_local_num,key_02local_num
0,2,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,8,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...","[1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,11,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,15,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [19]:
from tqdm.notebook import tqdm 


def _active_keyword_sets(flag_column):
    """Convert each 0/1 flag list into the frozenset of positions whose flag is 1."""
    return [
        frozenset(pos for pos, flag in enumerate(flags) if flag == 1)
        for flags in flag_column
    ]


def _missing_ratio(test_hits, reference_hits, empty_default):
    """Measure how much of the test article's keyword set the reference lacks.

    Args:
        test_hits: Set of keyword-row positions found in the test article.
        reference_hits: Set of keyword-row positions found in the reference.
        empty_default: Score reported when the test article has no keyword of
            this category (there is nothing to divide by).

    Returns:
        (score, has_keywords). ``score`` is ``|test - reference| / |test|``:
        0.0 when the reference contains every keyword row of the test article,
        1.0 when it contains none of them. ``has_keywords`` is 1 when the test
        article has at least one keyword of this category, otherwise 0.
    """
    if not test_hits:
        return empty_default, 0
    return len(test_hits - reference_hits) / len(test_hits), 1


Ids = df_keynum['ID']

key_pest_num = df_keynum['key_pest_num']
key_crop_num = df_keynum['key_crop_num']
key_chem_num = df_keynum['key_chem_num']
key_season_num = df_keynum['key_season_num']
key_local_num = df_keynum['key_local_num']
key_02local_num = df_keynum['key_02local_num']

# Pre-compute, once per article, which keyword rows are present. The pair loop
# below then only needs set differences instead of re-scanning full flag lists.
id_values = Ids.tolist()
pest_hits = _active_keyword_sets(key_pest_num)
crop_hits = _active_keyword_sets(key_crop_num)
chem_hits = _active_keyword_sets(key_chem_num)
season_hits = _active_keyword_sets(key_season_num)
local_hits = _active_keyword_sets(key_local_num)
local02_hits = _active_keyword_sets(key_02local_num)

id_test_list = []
id_reference_list = []
chem_score_list = []
chem_zero_list = []
crop_score_list = []
crop_zero_list = []
pest_score_list = []
season_score_list = []
local_score_list = []
local02_score_list = []

n_articles = len(id_values)
for i in tqdm(range(n_articles)):
    id_test = id_values[i]
    for j in range(n_articles):
        id_reference = id_values[j]

        # An article is never scored against itself.
        if id_test == id_reference:
            continue

        # Pest defaults to 0 when the test article mentions no pest, whereas
        # every other category defaults to 1 in that situation.
        pest, _ = _missing_ratio(pest_hits[i], pest_hits[j], 0)
        # NOTE: despite their names, the *_divide_by_zero flags are 1 when the
        # test article HAS keywords of that category (no division by zero)
        # and 0 when it has none.
        crop, crop_divide_by_zero = _missing_ratio(crop_hits[i], crop_hits[j], 1)
        chem, chem_divide_by_zero = _missing_ratio(chem_hits[i], chem_hits[j], 1)
        season, _ = _missing_ratio(season_hits[i], season_hits[j], 1)
        local, _ = _missing_ratio(local_hits[i], local_hits[j], 1)
        local02, _ = _missing_ratio(local02_hits[i], local02_hits[j], 1)

        local02_score_list.append(local02)
        local_score_list.append(local)
        season_score_list.append(season)
        chem_score_list.append(chem)
        chem_zero_list.append(chem_divide_by_zero)
        crop_score_list.append(crop)
        crop_zero_list.append(crop_divide_by_zero)
        pest_score_list.append(pest)
        id_test_list.append(id_test)
        id_reference_list.append(id_reference)

  0%|          | 0/420 [00:00<?, ?it/s]

In [20]:
# Collect the per-pair results (one row per ordered Test/Reference pair) into
# a single frame.
data = {
    'Test': id_test_list,
    'Reference': id_reference_list,
    'chem_score': chem_score_list,
    'crop_score': crop_score_list,
    'pest_score': pest_score_list,
    'season_score': season_score_list,
    'local_score': local_score_list,
    'local02_score': local02_score_list,
    'chem_zero': chem_zero_list,
    'crop_zero': crop_zero_list,
}
score_list = pd.DataFrame(data)

score_list


Unnamed: 0,Test,Reference,chem_score,crop_score,pest_score,season_score,local_score,local02_score,chem_zero,crop_zero
0,2,4,1.0,1.0,1.000000,1.0,1.0,1.0,1,1
1,2,8,0.0,0.5,0.000000,1.0,0.0,1.0,1,1
2,2,11,1.0,1.0,1.000000,1.0,0.0,1.0,1,1
3,2,15,1.0,1.0,1.000000,1.0,1.0,1.0,1,1
4,2,16,1.0,1.0,1.000000,1.0,0.0,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...
175975,1397,1379,1.0,1.0,1.000000,1.0,1.0,1.0,0,1
175976,1397,1386,1.0,0.0,1.000000,1.0,1.0,1.0,0,1
175977,1397,1394,1.0,0.0,0.714286,1.0,0.0,0.5,0,1
175978,1397,1395,1.0,0.0,0.571429,1.0,0.0,0.0,0,1


In [21]:
# Select the (Test, Reference) pairs to submit. A pair is kept when it matches
# at least one of the rules below; the rules are evaluated as vectorized
# boolean masks over all rows at once instead of a Python loop with
# per-element Series lookups. Row order is preserved and each pair is kept at
# most once.
chem_score = score_list['chem_score']
crop_score = score_list['crop_score']
pest_score = score_list['pest_score']
season_score = score_list['season_score']
local_score = score_list['local_score']
local02_score = score_list['local02_score']
chem_zero = score_list['chem_zero']

chem_none = chem_score <= 0
crop_none = crop_score <= 0
pest_none = pest_score <= 0
crop_half = crop_score == 0.5

selected = (
    # Rule 1 (original note: "most important", 0.8): chem, crop and pest
    # scores are all zero.
    (chem_none & crop_none & pest_none)
    # Rule 2 (original note: 642): chem and pest scores are zero, crop score
    # lies strictly between 0.3 and 0.4.
    | (chem_none & pest_none & (crop_score > 0.3) & (crop_score < 0.4))
    # Rule 3 (original note: 642): chem and crop scores are zero, pest score
    # lies strictly between 0.3 and 0.4.
    | (chem_none & crop_none & (pest_score > 0.3) & (pest_score < 0.4))
    # Rule 4 (original note: 0.8): this one reads the chem_zero flag, not
    # chem_score; crop, pest and both location scores must be zero.
    | ((chem_zero <= 0) & crop_none & pest_none
       & (local_score <= 0) & (local02_score <= 0))
    # Rule 5 (original note: 684): crop score exactly 0.5, chem, pest and
    # season scores zero.
    | (chem_none & crop_half & pest_none & (season_score <= 0))
    # Rule 6 (original note: 693): crop score exactly 0.5, chem, pest and
    # location scores zero.
    | (chem_none & crop_half & pest_none & (local_score <= 0))
)
# The numeric notes above are copied from the original code; their meaning is
# not documented there.

Test = score_list.loc[selected, 'Test'].tolist()
Reference = score_list.loc[selected, 'Reference'].tolist()


In [22]:
# Write the selected pairs to the submission file.
data = {'Test': Test, 'Reference': Reference}
submission = pd.DataFrame(data)
submission.to_csv('./not_00_05_03_684.csv', index=False)

# Debug dump: print every keyword flag list of the article at index 86.
for flag_column in (
    key_chem_num,
    key_crop_num,
    key_pest_num,
    key_season_num,
    key_local_num,
    key_02local_num,
):
    print(flag_column[86])

submission

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

Unnamed: 0,Test,Reference
0,2,8
1,4,15
2,11,47
3,17,8
4,22,428
...,...,...
679,1396,848
680,1396,890
681,1396,1020
682,1396,1035
