In [38]:
import json
import os
import re
import numpy as np
import shutil
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker
import torch
from flask import Flask, request
from flask import render_template
from flask_ngrok import run_with_ngrok
ws_driver = CkipWordSegmenter(level=3,device=0)

In [39]:
with open('fin_word.json', 'r', encoding='utf-8') as f:
    word2idx = json.load(f)
with open('fin_tag.json', 'r', encoding='utf-8') as f:
    tag2idx = json.load(f)

In [40]:
word2idx['<pad>'] = 0
idx2word = {word2idx[w]:w for w in word2idx}
idx2tag = {tag2idx[t]:t for t in tag2idx}

In [41]:
def clean_str(string):
    string = re.sub(r"\u3000", "", string)
    string = re.sub(r"\s{1,}", "", string)
    string = re.sub(r"\r", "", string)
    string = re.sub(r"\n", "", string)
    string = re.sub(r"【[^【】]+】","",string)
    string = re.sub(r"（[^（）]+）","",string)
    string = re.sub(r"「[^「」]+」","",string)
    string = re.sub(r"\([^\(\)]+\)","",string)
    if '審判長' in string:
        string = string.split('審判長')[0]
    else:
        string = string.split('書記官')[0]
    return string.strip()

In [43]:
j_fs = []

for file in os.listdir('金融判決'):
    with open(f'金融判決/{file}' , 'r' , encoding = 'utf-8') as f:
        j_f = json.load(f)
        try:
            if len(clean_str(j_f['judgement'])) <3000 and len(j_f['relatedIssues']) < 15 and '由於裁判書全文大於' not in j_f['judgement']:
                j_fs.append(file)
        except:
            pass

In [45]:
for file in os.listdir('驗證的判決'):
    with open(f'驗證的判決/{file}' , 'r' , encoding = 'utf-8') as f:
        j_f = json.load(f)
        with open(f'驗證的判決/{file}.txt','w',encoding = 'utf-8') as fw:
            fw.write(j_f['judgement'])

In [30]:
idx = 3
text = j_fs[idx]['judgement']
process_text = clean_str(text)
len(process_text)

IndexError: list index out of range

In [None]:
tags = []

for law in j_fs[idx]['relatedIssues']:
    str_ = law['lawName']+law['issueRef']
    str_ = str_.strip()
    if str_ in tags:
        continue
    if str_ in tag2idx:
        tags.append(str_)
tags

In [None]:
tags_idx = [tag2idx[t] for t in tags]
tags_idx

In [31]:
def inp_len(x):
    if len(x) > 0:
        return x

doc = []
for t in process_text.split('，'):
    if '。' in t:
        doc.extend(t.split('。'))
    else:
        doc.append(t)
doc = list(filter(inp_len,doc))
docs = [doc]

In [32]:
docs_p = []
sentence_length = []
doc_length = []
# ws = ws_driver(docs , batch_size=16)
for doc in docs:
    ws = ws_driver(doc)
    d_p = []
    s_len = []
    for voc in ws:
        d_p.append(voc)
        s_len.append(len(voc))
    sentence_length.append(s_len)
    docs_p.append(d_p)
    doc_length.append(len(s_len))

Tokenization: 100%|█████████████████████████████████████████████████████████████████| 55/55 [00:00<00:00, 27443.10it/s]
Inference: 100%|█████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 25.33it/s]


In [13]:
docs_w2i = [[list(map( lambda s:word2idx.get(s,word2idx['<unk>']) ,sent )) for sent in doc_p] for doc_p in docs_p ]

dict_han = {
    'docs':docs_w2i,
    'sentences_per_document':doc_length,
    'words_per_sentence':sentence_length,
    'labels':[tags_idx],
}

In [14]:
from torch.utils.data import TensorDataset, DataLoader ,SubsetRandomSampler ,ConcatDataset ,Dataset

word_limit = 40 ## 每句話的最長長度
sentence_limit = 80 ## 每篇判決的句子數目

class HANDataset(Dataset):
    def __init__(self,data,word_pad_idx,num_classes , word_limit = 40 ,sentence_limit = 80 ):
        # Load data
        self.data = data
        self.word_limit = word_limit
        self.sentence_limit = sentence_limit
        self.word_pad_idx = word_pad_idx
        self.num_classes = num_classes

    def __getitem__(self, id_doc):
        return (self.data['docs'][id_doc], \
               self.data['sentences_per_document'][id_doc], \
               self.data['words_per_sentence'][id_doc], \
               self.data['labels'][id_doc])

    def __len__(self):
        return len(self.data['labels'])
    
    def one_hot(self,tag):
        bs_oh_label = torch.tensor([0.0] * self.num_classes)
        for t in tag:
            bs_oh_label += torch.eye(self.num_classes)[t]
        return bs_oh_label.tolist()
    
    def turncut_dlen(self,x):
        if len(x) > self.max_doc_len:
            return x[:self.max_doc_len]
        elif len(x) < self.max_doc_len:
            return x + [0] * (self.max_doc_len-len(x))
        else:
            return x
        
    def max_slen(self,x):
        if x > self.max_sent_len:
            return self.max_sent_len
        else:
            return x
        
    def max_dlen(self,x):
        if x > self.max_doc_len:
            return self.max_doc_len
        else:
            return x
        
    def collate_fn(self, col_datasets):
        bs_doc ,bs_dlen,bs_slen,bs_label = [],[],[],[]
        for c_data in col_datasets:
            bs_doc.append(c_data[0])
            bs_dlen.append(c_data[1])
            bs_slen.append(c_data[2])
            bs_label.append(c_data[3])
        self.max_sent_len = min( max([lens for slen in bs_slen for lens in slen ]) ,self.word_limit )
        self.max_doc_len = min( self.sentence_limit , max(bs_dlen))
        pad_docs = []
        pad_labels = []
        for doc,label in zip(bs_doc,bs_label):
            pad_doc = []
            for sent in doc:
                if len(sent) > self.max_sent_len:
                    pad_doc.append(sent[:self.max_sent_len])
                else:
                    pad_doc.append(sent + (self.max_sent_len-len(sent))*[self.word_pad_idx])
            if len(pad_doc) > self.max_doc_len:
                pad_doc = pad_doc[:self.max_doc_len]
            else:
                pad_doc.extend((self.max_doc_len-len(pad_doc)) * [[self.word_pad_idx] * self.max_sent_len])
            pad_docs.append(pad_doc)
            pad_labels.append(self.one_hot(label))
        bs_slen_p = []
        for b_s in list(map(self.turncut_dlen ,bs_slen)):
            bs_slen_p.append(list(map(self.max_slen ,b_s)))
        bs_dlen2 = list(map(self.max_dlen,bs_dlen))
        return torch.LongTensor(pad_docs) , torch.LongTensor(bs_dlen2), torch.LongTensor(bs_slen_p), torch.tensor(pad_labels)     

In [15]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence


class HierarchialAttentionNetwork(nn.Module):
    def __init__(self, n_classes, vocab_size, emb_size, word_rnn_size, sentence_rnn_size, word_rnn_layers,
                 sentence_rnn_layers, word_att_size, sentence_att_size, dropout=0.5):
        super(HierarchialAttentionNetwork, self).__init__()
        self.sentence_attention = SentenceAttention(vocab_size, emb_size, word_rnn_size, sentence_rnn_size,
                                                    word_rnn_layers, sentence_rnn_layers, word_att_size,
                                                    sentence_att_size, dropout)

        self.fc = nn.Linear(2 * sentence_rnn_size, n_classes)
        self.softmax = nn.Softmax()
        self.sigmoid = nn.Sigmoid()
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, documents, sentences_per_document, words_per_sentence):
        document_embeddings, word_alphas, sentence_alphas = self.sentence_attention(documents, sentences_per_document,
                                                                                    words_per_sentence)  
        scores = self.sigmoid(self.fc(self.dropout(document_embeddings)) ) 
        
        return scores, word_alphas, sentence_alphas


class SentenceAttention(nn.Module):
    def __init__(self, vocab_size, emb_size, word_rnn_size, sentence_rnn_size, word_rnn_layers, sentence_rnn_layers,
                 word_att_size, sentence_att_size, dropout):
        super(SentenceAttention, self).__init__()

        self.word_attention = WordAttention(vocab_size, emb_size, word_rnn_size, word_rnn_layers, word_att_size,
                                            dropout)

        self.sentence_rnn = nn.GRU(2 * word_rnn_size, sentence_rnn_size, num_layers=sentence_rnn_layers,
                                   bidirectional=True, dropout=dropout, batch_first=True)

        self.sentence_attention = nn.Linear(2 * sentence_rnn_size, sentence_att_size)

        self.sentence_context_vector = nn.Linear(sentence_att_size, 1,
                                                 bias=False) 

        self.dropout = nn.Dropout(dropout)

    def forward(self, documents, sentences_per_document, words_per_sentence):
        packed_sentences = pack_padded_sequence(documents,
                                                lengths=sentences_per_document.tolist(),
                                                batch_first=True,
                                                enforce_sorted=False) 
        packed_words_per_sentence = pack_padded_sequence(words_per_sentence,
                                                         lengths=sentences_per_document.tolist(),
                                                         batch_first=True,
                                                         enforce_sorted=False)  
        sentences, word_alphas = self.word_attention(packed_sentences.data,
                                                     packed_words_per_sentence.data)  
        
        sentences = self.dropout(sentences)
        
        packed_sentences, _ = self.sentence_rnn(PackedSequence(data=sentences,
                                                               batch_sizes=packed_sentences.batch_sizes,
                                                               sorted_indices=packed_sentences.sorted_indices,
                                                               unsorted_indices=packed_sentences.unsorted_indices))  
        att_s = self.sentence_attention(packed_sentences.data)  
        att_s = torch.tanh(att_s)  
        
        att_s = self.sentence_context_vector(att_s).squeeze(1)  

        max_value = att_s.max()  
        att_s = torch.exp(att_s - max_value) 

        att_s, _ = pad_packed_sequence(PackedSequence(data=att_s,
                                                      batch_sizes=packed_sentences.batch_sizes,
                                                      sorted_indices=packed_sentences.sorted_indices,
                                                      unsorted_indices=packed_sentences.unsorted_indices),
                                       batch_first=True)  # (n_documents, max(sentences_per_document))

  
        sentence_alphas = att_s / torch.sum(att_s, dim=1, keepdim=True)

        documents, _ = pad_packed_sequence(packed_sentences,
                                           batch_first=True)  
        documents = documents * sentence_alphas.unsqueeze(2)  
        documents = documents.sum(dim=1)
        word_alphas, _ = pad_packed_sequence(PackedSequence(data=word_alphas,
                                                            batch_sizes=packed_sentences.batch_sizes,
                                                            sorted_indices=packed_sentences.sorted_indices,
                                                            unsorted_indices=packed_sentences.unsorted_indices),
                                             batch_first=True) 
        return documents, word_alphas, sentence_alphas


class WordAttention(nn.Module):
    def __init__(self, vocab_size, emb_size, word_rnn_size, word_rnn_layers, word_att_size, dropout):
        super(WordAttention, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, emb_size)
        self.word_rnn = nn.GRU(emb_size, word_rnn_size, num_layers=word_rnn_layers, bidirectional=True,
                               dropout=dropout, batch_first=True)

        self.word_attention = nn.Linear(2 * word_rnn_size, word_att_size)
        self.word_context_vector = nn.Linear(word_att_size, 1, bias=False)

        self.dropout = nn.Dropout(dropout)

    def init_embeddings(self, embeddings):
        self.embeddings.weight = nn.Parameter(embeddings)

    def fine_tune_embeddings(self, fine_tune=False):
        for p in self.embeddings.parameters():
            p.requires_grad = fine_tune

    def forward(self, sentences, words_per_sentence):
        sentences = self.dropout(self.embeddings(sentences))
        packed_words = pack_padded_sequence(sentences,
                                            lengths=words_per_sentence.tolist(),
                                            batch_first=True,
                                            enforce_sorted=False) 
        # a PackedSequence object, where 'data' is the flattened words (n_words, word_emb)

        packed_words, _ = self.word_rnn(packed_words)  
        att_w = self.word_attention(packed_words.data)
        att_w = torch.tanh(att_w) 
        att_w = self.word_context_vector(att_w).squeeze(1)  # (n_words)

        max_value = att_w.max()  
        att_w = torch.exp(att_w - max_value)  
        
        att_w, _ = pad_packed_sequence(PackedSequence(data=att_w,batch_sizes=packed_words.batch_sizes,
                                    sorted_indices=packed_words.sorted_indices,
                                    unsorted_indices=packed_words.unsorted_indices),
                                       batch_first=True)
        word_alphas = att_w / torch.sum(att_w, dim=1, keepdim=True) 

        sentences, _ = pad_packed_sequence(packed_words,
                                           batch_first=True)  

        sentences = sentences * word_alphas.unsqueeze(2) 
        sentences = sentences.sum(dim=1) 

        return sentences, word_alphas

In [16]:
n_classes = len(tag2idx) 
word_rnn_size = 100  ## 超參數設定 可以調整這裡的參數
sentence_rnn_size = 100 
word_rnn_layers = 2  
sentence_rnn_layers = 2  
word_att_size = 200
sentence_att_size = 200  
dropout = 0.3

BS = 16
lr = 1e-3

## 到這裡之前

PAD_IDX = word2idx['<pad>']
dataset = HANDataset(dict_han,word2idx['<pad>'],len(tag2idx))
test_loader = DataLoader(dataset, batch_size=1,
                        collate_fn=dataset.collate_fn)
model = HierarchialAttentionNetwork(n_classes=n_classes,
                                    vocab_size=len(word2idx),
                                    emb_size=300,
                                    word_rnn_size=word_rnn_size,
                                    sentence_rnn_size=sentence_rnn_size,
                                    word_rnn_layers=word_rnn_layers,
                                    sentence_rnn_layers=sentence_rnn_layers,
                                    word_att_size=word_att_size,
                                    sentence_att_size=sentence_att_size,
                                    dropout=dropout)

In [17]:
model.load_state_dict(torch.load('legaltech_han_model.pt'))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

HierarchialAttentionNetwork(
  (sentence_attention): SentenceAttention(
    (word_attention): WordAttention(
      (embeddings): Embedding(37743, 300)
      (word_rnn): GRU(300, 100, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
      (word_attention): Linear(in_features=200, out_features=200, bias=True)
      (word_context_vector): Linear(in_features=200, out_features=1, bias=False)
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (sentence_rnn): GRU(200, 100, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
    (sentence_attention): Linear(in_features=200, out_features=200, bias=True)
    (sentence_context_vector): Linear(in_features=200, out_features=1, bias=False)
    (dropout): Dropout(p=0.3, inplace=False)
  )
  (fc): Linear(in_features=200, out_features=484, bias=True)
  (softmax): Softmax(dim=None)
  (sigmoid): Sigmoid()
  (dropout): Dropout(p=0.3, inplace=False)
)

In [36]:
    for test_data in test_loader:
        documents = test_data[0].to(device)
        print("Judgement : ")
        for sent in documents[0]:
            for word in sent:
                if word == 0:
                    continue
#                 print(idx2word[word.cpu().item()],end = '')
#             print()
        sentences_per_document = test_data[1].to(device)
        words_per_sentence = test_data[2].to(device)
        print(documents.shape)
        print(sentences_per_document)
        print(words_per_sentence)
        scores, word_alphas, sentence_alphas = model(documents, sentences_per_document,
                                                                 words_per_sentence)

Judgement : 
torch.Size([1, 55, 35])
tensor([55], device='cuda:0')
tensor([[20, 15,  2,  8,  8, 17,  5,  6,  6, 10,  7, 25,  7, 20, 11,  6, 18, 12,
          2,  3,  5,  4, 26,  1, 14, 14, 13, 21,  3,  8,  6,  6, 18, 15, 35, 12,
         15,  9, 11,  4,  5, 14, 13, 12,  5, 12, 13, 10,  6,  3,  6,  3, 10,  3,
          7]], device='cuda:0')


In [26]:
TEMPLATE = 'templates'
STATIC = 'static'

app = Flask(__name__,template_folder=TEMPLATE,static_folder=STATIC)
run_with_ngrok(app)
input_text = ''
@app.route('/')
def index():
    return render_template('form.html')
@app.route('/submit', methods=['POST'])
def submit():
    for test_data in test_loader:
        documents = test_data[0].to(device)
        print("Judgement : ")
        for sent in documents[0]:
            for word in sent:
                if word == 0:
                    continue
#                 print(idx2word[word.cpu().item()],end = '')
#             print()
        sentences_per_document = test_data[1].to(device)
        words_per_sentence = test_data[2].to(device)
        print(documents)
        scores, word_alphas, sentence_alphas = model(documents, sentences_per_document,
                                                                 words_per_sentence)
    print('\nPredict lawNames : ')
    predicts = ''
    for idx,s in enumerate(scores[0]):
        if s>0.5:
            print(idx2tag[idx])
            predicts += idx2tag[idx] + '\n'
    print('\nTrue lawNames : ')
    anwsers = []
    for idx,s in enumerate(test_data[3][0]):
        if s != 0:
            print(idx2tag[idx])
            anwsers.append(idx2tag[idx])
    input_text = request.form['text']
    if len(input_text) < 100:
        vis2 = 1
        s = 'Unable to send your message. Please fix errors then try again.'
        return render_template('form.html',vis2=vis2,input_text=input_text,s = s)
    elif len(input_text) > 100:
        vis1 = 1
        return render_template('form.html',vis1=vis1,input_text=input_text,s = predicts)

app.run()

 * Serving Flask app '__main__' (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [11/Sep/2021 11:14:14] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [11/Sep/2021 11:14:14] "GET /static/css/nicepage.css HTTP/1.1" 304 -
127.0.0.1 - - [11/Sep/2021 11:14:14] "GET /static/css/Page-1.css HTTP/1.1" 304 -
127.0.0.1 - - [11/Sep/2021 11:14:14] "GET /static/css/bootstrap/bootstrap.css HTTP/1.1" 304 -
127.0.0.1 - - [11/Sep/2021 11:14:14] "GET /static/images/desktop_top-banner.png HTTP/1.1" 304 -
127.0.0.1 - - [11/Sep/2021 11:14:14] "GET /static/images/banner-bg.png HTTP/1.1" 304 -


 * Running on http://7c82-140-127-114-22.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


Judgement : 
福建金門地方法院刑事裁定104年度金訴字第1號上訴人即被告蔡銀英上列被告因違反銀行法案件
不服本院民國104年12月31日104年度金訴字第1號第一審刑事判決
提起上訴
本院裁定如下：主文上訴駁回
理由一、按上訴期間為10日
自送達判決後起算；原審法院認為上訴不合法律上之程式者
應以裁定駁回之
刑事訴訟法第349條前段
第362條前段定有明文
次按送達於在監獄或看守所之人
應囑託該監所長官為之
刑事訴訟法第56條第2項亦定有明文；又送達於應受送達人之住、居所、事務所或營業所
不獲會晤應受送達人
亦無法準用民事訴訟法第137條規定將文書付與有辨別事理能力之同居人或受僱人者
得將文書寄存送達地之自治或警察機關
並作送達通知書2份
1份黏貼於應受送達人住居所、事務所、營業所或其就業處所門首
另一份置於該送達處所信箱或其他適當位置
以為送達
又寄存送達
自寄存之日起
經10日發生效力
刑事訴訟法第62條準用民事訴訟法第136條至138條規定甚明；而民事訴訟法第138條第2項所稱之
係指
二、經查：本件上訴人即被告蔡銀英違反銀行法案件
其於本院審理時陳明其住所為金門縣○○鎮○○路00號
有本院歷次準備程序筆錄、審判筆錄附卷可稽
而本院於民國104年12月31日以104年度金訴字第1號刑事判決判處有期徒刑1年8月
緩刑3年
並應於判決確定1年內
向公庫支付新臺幣8萬元
犯罪所得新臺幣<unk>元沒收
該判決於105年1月6日寄存送達被告住所所在之金門縣警察局金湖分局金湖派出所
被告本人於105年1月7日15時14分許前往該警局領取等情
有個人戶籍資料查詢結果、本院送達證書、入出境資訊連結作業、金門縣警察局金湖分局金湖派出所受理司法文書寄存登記簿各1份在卷可查
又被告於斯時並未因案在監執行或羈押
此有臺灣高等法院在監在押全國紀錄表1份附卷可據
是被告並非在監獄或看守所之人
其主張該判決應囑託監所長官為送達云云
顯非可採
則依上開送達情形
該判決正本既已經被告於105年1月7日前往金湖派出所具領
是本件上訴期間應自105年1月8日起算10日
再依法院訴訟當事人在途期間標準第2條之規定
加計在途期間1日
是其上訴期間之末日應為105年1月18日屆滿
詎上訴人遲至107年3月13日始向本院提起上訴
有刑事上訴狀上本院收狀戳章為憑
顯已逾越法定上訴期間
且無從補