### Vectorize Data Sets

In [2]:
from numba import jit, autojit # accelerate for loops
import numpy as np
import numpy.random as npr
import re # regular expressions
import glob
import pandas as pd

In [10]:
###--- Methods ---###

#### bootstrapping: resampling (virtual example)

nb = 100  # how many samples ?
ln = 3000 # how many words for each sample ? (3000 ~ length of each text)

def btp_ori(tot,nb,ln):
    if len(tot) == 0: return np.empty(0)
    else:
        idx = npr.randint(0, len(tot), (nb,ln)) # randomly pick indices from the 'at' array, 'nb' rows
        return np.array(tot[idx])

#### extract function words
# function word list: Bei Yu 2012: Function Words for Chinese Authorship Attribution
fw1 = ["的","是","不","了","在","有","這","為","地","也","得","就","那","以","著","之","可","于"
       "麼","而","然","没","於","還","只","無","又","如","但","其","此","與","把","全","被","卻"]

# difficult word list: 司法院裁判書類通俗化小組
fw2 = ["殆","似","難認無過失","難認有理由","尚難謂為於法無違","非無理由","非有理由","尚非無疑","尚非不可能",
       "可徵","徵諸","洵堪","迭於","係經","係指","洵屬","伊","渠","致","爰","遽認","矧","略以","按","第按","是","茲",
       "揆諸","皆無該條適用","即","無庸","縱",
       "上開","足資","可稽","堪以認定","業據","云云","等語","之","次查","復查","再查","系爭","固非無見","旋","惟查","均係"]
fwt = fw1+fw2

@jit
def ext_fw(txt,fw):
    # txt = input texts; fw = function word list
    j = np.empty(0)
    for i in fw:
        l = re.findall(i,txt)
        j = np.concatenate((j,np.array(l)))
    return j

In [14]:
#### generate csv files
global file_idx
file_idx = 0
def fw_csv(path,typ):
    list_of_files = glob.glob(path)

    for file_n in list_of_files:
        with open(file_n,'r') as myfile:         
            sl = myfile.read().replace('\n','')   # returns a string
            at = np.array(re.findall(r'\S+', sl)) # returns an array of the above string
            
            ## all texts as one
            # at = np.concatenate((at,sa))
        
        if   typ == 0: btp = btp_ori(at,nb,ln,0) # virtual examples: # = nb, length = ln
        elif typ == 1: btp = np.array(at,ndmin=2) # for test data
            
        for i in range(btp.shape[0]):
            bts = " ".join(btp[i])
            fwr = ext_fw(bts,fw1+fw2)

            # frequency table
            fwr_str = "\t".join(fwr).split("\t")
            fwr_tb  = pd.Series(fwr_str).value_counts()

            # store in dictionary
            dict1 = {}
            dict1.update(fwr_tb)
            dict1.update({"NAME": "%s" %f})
            dict1.update({"file_n": file_idx})
            tp.append(dict1)
        global file_idx
        file_idx+=1
       
    return tp
    

  global file_idx


In [8]:
#### read corpus
list_of_jud   = ["翁岳生","城仲模","林永謀","王和雄","余雪明","廖義男","曾有田","楊仁壽",
                 "彭鳳至","賴英照","謝在全","徐璧湖","林子儀","許宗力","許玉秀","林錫堯",
                 "池啟明","李震山","蔡清遊","賴浩敏","蘇永欽","黃茂榮","陳新民","陳春生","陳敏","葉百修"]

tp = [] # an empty list
file_idx = 0
for f in list_of_jud:
    # for each author ...
    path = './segmented_txt_2/%s/*.txt' %f 
    rs   = fw_csv(path,0)
    
df = pd.DataFrame(rs) 
# output csv
df.to_csv("./vectorized.csv", encoding="utf-8")

---
### Examine results:

In [9]:
import pandas as pd
test = pd.read_csv("./vectorized.csv", encoding="utf-8")
test

Unnamed: 0.1,Unnamed: 0,NAME,file_n,上開,不,之,也,了,云云,以,...,致,與,茲,著,被,足資,這,遽認,還,那
0,0,翁岳生,0,1,67,340,,,,43,...,3,32,,4,1,,,,,
1,1,翁岳生,0,3,49,350,,,,43,...,4,34,,1,2,,,,,
2,2,翁岳生,0,1,52,374,,,,40,...,5,24,,2,,,,,,
3,3,翁岳生,0,,48,340,,,,35,...,2,54,,3,,,,,,
4,4,翁岳生,0,1,60,400,,,,41,...,1,36,,1,,,,,,
5,5,翁岳生,0,2,53,348,,,,40,...,6,34,,1,1,,,,,
6,6,翁岳生,0,2,46,370,,,,39,...,2,30,,3,1,,,,,
7,7,翁岳生,0,2,55,346,,,,40,...,2,26,,3,2,,,,,
8,8,翁岳生,0,2,48,348,,,,42,...,,41,,5,,,,,,
9,9,翁岳生,0,5,45,358,,,,52,...,4,46,,1,1,,,,,


---
### Generate testing data from judicial Yuan (cf. parser.ipynb)

In [15]:
tp = [] # an empty list
path = './test_d/*.txt'
f    = "test data"
rs   = fw_csv(path,1)
    
df = pd.DataFrame(rs) 
# output csv
df.to_csv("./test_ds.csv", encoding="utf-8")

In [16]:
import pandas as pd
test = pd.read_csv("./test_ds.csv", encoding="utf-8")
test

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,NAME,file_n,上開,不,之,了,云云,以,...,系爭,縱,而,致,與,茲,著,被,足資,還
0,0,,test data,0,,8,44,,,12,...,1,,2,1,3,,1,,,
1,1,,test data,1,1,9,66,,,12,...,,1,3,3,2,,1,,,
2,2,,test data,2,,9,26,,,12,...,,1,3,1,1,1,,,,
3,3,,test data,3,,16,38,,,6,...,1,,3,1,3,,,1,,
4,4,,test data,4,,10,46,1,,11,...,,1,6,3,1,,,,1,
5,5,,test data,5,,4,46,,,9,...,,,3,,2,,,,,
6,6,,test data,6,,7,38,,,7,...,1,1,4,,1,,,1,,
7,7,,test data,7,1,13,50,,,15,...,,2,6,5,2,,,,,
8,8,,test data,8,1,8,16,,,5,...,1,,,1,1,,,,,
9,9,,test data,9,,15,78,,,21,...,2,,7,1,1,1,,3,1,
