In [None]:
from ckiptagger import data_utils, construct_dictionary

In [1]:
import tensorflow as tf
import keras

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc


Using TensorFlow backend.


In [8]:
def word_proc(data_path,column_name,
              tt_begin = 0,tt_end = 50,
              output_name="df_test_out.csv",
              model_dir="./data"):
    """
    Run ckiptagger word segmentation, POS tagging and NER on a slice of a CSV column.

    The CSV at ``data_path`` is read with pandas, rows ``tt_begin`` (inclusive)
    to ``tt_end`` (exclusive) of ``column_name`` are taken as the input texts,
    and the WS, POS and NER models are loaded and applied one after another
    (each model is dropped before the next one is loaded). The results are
    collected in a DataFrame that is also written to ``output_name``.

    Args:
        data_path (str): path of the CSV file to read.
        column_name (str): name of the column holding the text to process.
        tt_begin (int): first row of the slice (inclusive).
        tt_end (int): end row of the slice (exclusive).
        output_name (str): path of the CSV file the result table is written to.
        model_dir (str): directory passed to the WS/POS/NER constructors.

    Returns:
        tuple: ``(df, word_sentence_list, pos_sentence_list, entity_sentence_list)``
        where ``df`` has the columns ``Text``, ``WS``, ``POS`` and ``NER`` with
        one row per input text, and the three lists are the raw model outputs.
    """
    # Kept local so ckiptagger is only required when the function is called.
    from ckiptagger import WS, POS, NER

    data = pd.read_csv(data_path)
    # Empty cells are read back as NaN (a float); replace them with "" so the
    # taggers only ever receive strings.
    ls_Test = data[column_name][tt_begin:tt_end].fillna("").astype(str).tolist()
    print("Got list\n Loading WS data...")

    ws = WS(model_dir)
    print("============================\n WS begun\n============================\n")
    # ws() is called with its default options. Keyword arguments that were
    # left disabled in this notebook: sentence_segmentation,
    # segment_delimiter_set, recommend_dictionary, coerce_dictionary.
    word_sentence_list = ws(ls_Test)
    del ws
    gc.collect()
    print("============================\n WS finished\n============================\nLoading POS data...")

    print("============================\n POS begun\n============================\n")
    pos = POS(model_dir)
    pos_sentence_list = pos(word_sentence_list)
    del pos
    gc.collect()
    print("============================\n POS finished\n============================\nLoading NER data...")

    ner = NER(model_dir)
    entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
    del ner
    gc.collect()
    print("============================\n NER finished\n============================\n")

    df_Test = pd.DataFrame({"Text": ls_Test})
    # Each cell holds a whole per-sentence result (a list or a set). Going
    # through np.array() would either fail on ragged lists or build a 2-D
    # array when all sentences have the same token count, so the values are
    # stored as 1-D object Series instead.
    per_sentence_results = (
        ("WS", word_sentence_list),
        ("POS", pos_sentence_list),
        ("NER", entity_sentence_list),
    )
    for column, values in per_sentence_results:
        df_Test[column] = pd.Series(list(values), index=df_Test.index, dtype=object)
    print("============================\n Dataframe Created \n============================\n")

    df_Test.to_csv(output_name)
    return df_Test,word_sentence_list,pos_sentence_list,entity_sentence_list

In [9]:
# Settings for the demo run below: the CSV the tagged result is written to,
# the column that holds the text, and the source CSV to read.
output_name_a = "mod_test.csv"
column_name_a = "Text"
data_path_a = "KCC_Data/KCCNews11554.csv"

In [10]:
# Tag rows 100 (inclusive) to 180 (exclusive) of the configured column and
# keep both the result table and the three raw model outputs.
demo_df, ws_list, pos_list, ner_list = word_proc(
    data_path_a,
    column_name_a,
    tt_begin=100,
    tt_end=180,
    output_name=output_name_a,
)

Got list
 Loading WS data...
 WS begun

 WS finished
Loading POS data...
 POS begun

 POS finished
Loading NER data...
 NER finished

 Dataframe Created 



In [11]:
# Quick look at the NER output: number of processed texts, then the type and
# size of the first text's result.
(
    len(ner_list),
    type(ner_list[0]),
    len(ner_list[0]),
)

(80, set, 17)

In [15]:
# Container type and length of the NER output, followed by the results for
# the first two texts.
(
    type(ner_list),
    len(ner_list),
    ner_list[:2],
)

(list,
 80,
 [{(1, 6, 'ORG', '高雄市議會'),
   (6, 10, 'ORG', '民進黨團'),
   (16, 19, 'PERSON', '馬英九'),
   (24, 27, 'PERSON', '陳水扁'),
   (28, 31, 'ORG', '親民黨'),
   (38, 40, 'PERSON', '阿扁'),
   (66, 69, 'ORG', '親民黨'),
   (87, 90, 'PERSON', '馬英九'),
   (95, 98, 'PERSON', '陳水扁'),
   (153, 156, 'ORG', '親民黨'),
   (158, 159, 'PERSON', '馬'),
   (190, 192, 'GPE', '高雄'),
   (194, 196, 'PERSON', '陳菊'),
   (217, 220, 'ORG', '親民黨'),
   (231, 234, 'PERSON', '陳水扁'),
   (235, 236, 'CARDINAL', '一'),
   (263, 266, 'ORG', '親民黨')},
  {(1, 19, 'EVENT', '高雄環狀輕軌捷運建設第二階段統包工程'),
   (20, 22, 'DATE', '昨天'),
   (29, 37, 'ORG', '西班牙CAF公司'),
   (60, 62, 'ORDINAL', '第二'),
   (92, 98, 'ORG', '高市府捷運局'),
   (106, 108, 'ORDINAL', '第二'),
   (135, 137, 'ORDINAL', '第二'),
   (183, 185, 'ORG', '長鴻'),
   (204, 207, 'ORG', '捷運局'),
   (236, 238, 'CARDINAL', '二階'),
   (247, 253, 'QUANTITY', '十三．四公里'),
   (258, 261, 'CARDINAL', '二十三'),
   (270, 272, 'ORDINAL', '第一'),
   (278, 284, 'FAC', '捷運橘線O1'),
   (285, 289, 'FAC', '西子灣站'),
   (293, 

In [16]:
# Type of a single NER result, the result sizes for the first two texts, and
# the full result for the first text.
(
    type(ner_list[0]),
    len(ner_list[0]),
    len(ner_list[1]),
    ner_list[0],
)

(set,
 17,
 39,
 {(1, 6, 'ORG', '高雄市議會'),
  (6, 10, 'ORG', '民進黨團'),
  (16, 19, 'PERSON', '馬英九'),
  (24, 27, 'PERSON', '陳水扁'),
  (28, 31, 'ORG', '親民黨'),
  (38, 40, 'PERSON', '阿扁'),
  (66, 69, 'ORG', '親民黨'),
  (87, 90, 'PERSON', '馬英九'),
  (95, 98, 'PERSON', '陳水扁'),
  (153, 156, 'ORG', '親民黨'),
  (158, 159, 'PERSON', '馬'),
  (190, 192, 'GPE', '高雄'),
  (194, 196, 'PERSON', '陳菊'),
  (217, 220, 'ORG', '親民黨'),
  (231, 234, 'PERSON', '陳水扁'),
  (235, 236, 'CARDINAL', '一'),
  (263, 266, 'ORG', '親民黨')})