# 此腳本用於測試 sentence-transformers 文章向量的效果

In [2]:
from pathlib import Path
import json
import pickle

import pandas as pd
from sentence_transformers import SentenceTransformer, util

In [3]:
# Filesystem locations of the judgment corpus; everything hangs off root_path.
root_path = Path('/Users/zoe/Desktop/Kevin_project/judicial_analysis')
# Sub-directory '108/upload10812' (not referenced by the cells visible here).
input_path = root_path.joinpath('108/upload10812')
# Directory globbed below for Supreme Court civil judgment files.
supreme_court_path = root_path.joinpath('199601/最高法院民事')

#### step1. load the civil judgments that mention 車禍 (Supreme Court + 10812 normal courts)

In [4]:
# Collect every judgment under supreme_court_path whose full text ('JFULL')
# contains the keyword "車禍". Each kept item is the parsed JSON object of
# one file.
accident_judicial = []

for file in supreme_court_path.glob('*'):
    # glob('*') also yields sub-directories, which open() cannot read.
    if not file.is_file():
        continue

    # Explicit encoding: the platform default is not UTF-8 everywhere.
    # Assumes the judgment files are UTF-8 encoded JSON -- TODO confirm.
    with open(file, 'r', encoding='utf-8') as f:
        judicial_json = json.load(f)

    # A missing or null 'JFULL' is treated as "keyword not present" instead
    # of aborting the whole scan.
    if '車禍' in (judicial_json.get('JFULL') or ''):
        accident_judicial.append(judicial_json)

#### step2. check the cosine similarity of the query against the whole judgment vs. the judgment split into sentences on the full stop (。)

In [5]:
# Load the pickled judgments and append those whose
# JFULLX.JFULLCONTENT mentions "車禍", re-shaped to {'JFULL': <text>} so they
# look like the entries already in accident_judicial.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on a pickle you produced yourself.
with open('all_judicial.pickle', 'rb') as f:
    normal_judicial_json = pickle.load(f)

normal_accident_cases = []
for record in normal_judicial_json:
    full_text_holder = record.get('JFULLX')
    # Skip records without a (truthy) 'JFULLX' entry.
    if not full_text_holder:
        continue
    full_text = full_text_holder['JFULLCONTENT']
    if '車禍' in full_text:
        normal_accident_cases.append({'JFULL': full_text})

accident_judicial.extend(normal_accident_cases)

#### 1. 經測試後發現，將有提到車禍的判決與 query（車禍）進行比對時，會因為判決很長而稀釋了語義；若用句號切斷，相似度會較為明顯。如下第一個案例，
#### 整體文本相似度為 0.1126，但部分段落的相似度高達 0.6 以上，所以未來應該會取段落最大相似度作為代表，作為下一階段測試。

#### 2. 大部分高相似度的結果，會提到「車」與負面意涵（事故／痛苦等），符合「車禍」＝「車」＋「禍害」的文字意涵，但也有不準的時候，
#### 因此需要隨機抽樣約 30 個判決（15 個含有車禍 ＆ 15 個不含車禍），以評估準確率。

In [6]:
# Score every sentence of every collected judgment against the query "車禍"
# and gather the per-sentence cosine similarities in all_vecs.
#
# all_vecs holds [similarity, sentence] pairs where similarity is a 1x1
# tensor, the same shape util.cos_sim returns for a single sentence.
#
# NOTE(review): the sentence-transformers docs list all-MiniLM-L6-v2 as an
# English-trained model; a multilingual checkpoint may represent Chinese
# judgments better -- confirm before drawing conclusions from these scores.
# NOTE(review): encode() presumably truncates inputs longer than the model's
# maximum sequence length, so the whole-judgment score may only reflect the
# beginning of the text -- verify against the model card.
SIMILARITY_THRESHOLD = 0.6  # only sentences scoring above this are printed

model = SentenceTransformer('all-MiniLM-L6-v2')
query_embedding = model.encode('車禍', convert_to_tensor=True)
all_vecs = []

for case in accident_judicial:
    # Split on the Chinese full stop and drop blank fragments (for example
    # the empty string after a trailing "。"); they carry no meaning and
    # would otherwise be embedded and counted in the statistics.
    paragraphs = [p for p in case['JFULL'].split('。') if p.strip()]

    if paragraphs:
        # One batched encode per judgment instead of one call per sentence.
        paragraph_embeddings = model.encode(paragraphs, convert_to_tensor=True)
        scores = util.cos_sim(query_embedding, paragraph_embeddings)[0]
        # Sort on plain floats (ties broken by the text) rather than relying
        # on element-wise comparison of lists that contain tensors.
        text_sim_sort = sorted(
            ([score.reshape(1, 1), paragraph]
             for score, paragraph in zip(scores, paragraphs)),
            key=lambda pair: (pair[0].item(), pair[1]),
            reverse=True,
        )
    else:
        text_sim_sort = []

    print('Whole corpus similarity: ')
    print(util.cos_sim(query_embedding, model.encode(case['JFULL'], convert_to_tensor=True)))

    for x in text_sim_sort:
        if x[0].item() > SIMILARITY_THRESHOLD:
            print('Split corpus similarity: ', x)
    print('*'*100)

    all_vecs.extend(text_sim_sort)

Whole corpus similarity: 
tensor([[0.1126]])
Split corpus similarity:  [tensor([[0.6458]]), '事故所以發生，純係范陽宏高速行車，疏於注意，未為剎車\r\n等必要安全措施所肇致，而非甲○○過失之結果']
Split corpus similarity:  [tensor([[0.6310]]), '第查被上訴人因系爭車禍而截肢，\r\n其肉體、精神必然受有極大痛苦']
Split corpus similarity:  [tensor([[0.6090]]), '臺灣省桃園縣區車輛行車事故鑑定委員會桃鑑字第八三一○\r\n六號鑑定意見書亦同此見解']
****************************************************************************************************
Whole corpus similarity: 
tensor([[0.0633]])
Split corpus similarity:  [tensor([[0.6432]]), '足見本件係行車事\r\n故，依鐵路法第六十二條第一項規定，被上訴人對於詹叁妹、張辰之傷害，自應負損\r\n害賠償責任']
Split corpus similarity:  [tensor([[0.6173]]), '如遮斷器未放下或看守人員未表示停止時，仍應「\r\n看、聽」鐵路兩方確無火車駛來，始得通過']
****************************************************************************************************
Whole corpus similarity: 
tensor([[0.2189]])
Split corpus similarity:  [tensor([[0.6083]]), '而林德山所駕自\r\n用小客車擦撞被上訴人乙○○所駕大貨車左前車門脚踏板失控後，始追撞同向前行由\r\n連炳謙所駕大貨車，以致連炳謙車毀人亡，為原審認定之事實']
*************************************************

In [7]:
# Flatten the [1x1 similarity tensor, sentence] pairs in all_vecs into plain
# floats and summarise their distribution in a single-column DataFrame.
similarity_scores = [float(score[0][0]) for score, _ in all_vecs]
all_vecs_df = pd.DataFrame(similarity_scores, columns=['vec'])
all_vecs_df.describe()

Unnamed: 0,vec
count,36445.0
mean,0.355068
std,0.113749
min,0.005748
25%,0.278167
50%,0.330786
75%,0.409391
max,0.758359


# 含有車禍的判決，其段落相似度分佈大致呈現常態分佈（平均數略高於中位數，略為右偏）

![](vec_bins.png)

In [89]:
# Persist the per-sentence similarity scores to vecs.xlsx in the working
# directory.
all_vecs_df.to_excel(excel_writer='vecs.xlsx')

# 未實作部分

![](dev_spec.png)