In [1]:
import pickle
import numpy as np
from sklearn.metrics import (accuracy_score, classification_report, f1_score,
                             precision_score, recall_score)
import torch
from torch import optim
from torch.utils.data.dataloader import DataLoader

from utils.optim import ScheduledOptim

from transformers import AutoTokenizer

from gensim import corpora
from gensim.summarization import bm25

In [2]:
# Load the THULAC word -> id vocabulary and build the inverse id -> word table.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('/data/ganleilei/law/ContrastiveLJP/w2id_thulac.pkl', 'rb') as f:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    word2id_dict = pickle.load(f)
print(word2id_dict['无故'])
# Invert the mapping; if two words shared an id, the later one would win.
id2word_dict = {word_id: word for word, word_id in word2id_dict.items()}

1853


In [None]:
# Count how often each charge / law-article / term label occurs in the training split.
# NOTE(review): `load_dataset` is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
data_path = "/data/home/ganleilei/law/ContrastiveLJP/"
train_data, valid_data, test_data = load_dataset(data_path)
train_size = len(train_data["accu_label_lists"])

# Despite the "_set" suffix these are plain dicts: str(label) -> occurrence count.
accu_labels_set, law_labels_set, term_labels_set = {}, {}, {}
label_counters = (
    ("accu_label_lists", accu_labels_set),
    ("law_label_lists", law_labels_set),
    ("term_lists", term_labels_set),
)

for idx in range(train_size):
    # Read the three labels of this example first, then update the counters.
    accu_label = train_data["accu_label_lists"][idx]
    law_label = train_data["law_label_lists"][idx]
    term_label = train_data["term_lists"][idx]
    for label, (_, counter) in zip((accu_label, law_label, term_label), label_counters):
        label_key = str(label)
        counter[label_key] = counter.get(label_key, 0) + 1

# Show each label table ordered from most to least frequent.
count_of = lambda pair: pair[1]
print("accu labels set:", sorted(accu_labels_set.items(), key=count_of, reverse=True))
print("law labels set:", sorted(law_labels_set.items(), key=count_of, reverse=True))
print("term labels set:", sorted(term_labels_set.items(), key=count_of, reverse=True))


In [None]:
# Read the per-label HARNN results into `res`: label (str) -> tuple of the
# remaining four fields (kept as strings).
# NOTE(review): despite the .csv extension the file is split on whitespace,
# not commas — confirm this matches how the file is written.
harnn_res_path = 'harnn_res.csv'
res = {}
# Use a context manager so the file handle is closed even if the assert fires.
with open(harnn_res_path, mode='r') as res_file:
    for line in res_file:
        parts = line.strip().split()
        assert len(parts) == 5, f"Wrong line, {line}"
        res[str(parts[0])] = tuple(parts[1:])

print(res)

In [None]:
# Re-order the charge-label counts from least to most frequent. A dict keeps
# insertion order, so iterating it afterwards follows ascending frequency.
ascending_by_count = sorted(accu_labels_set.items(), key=lambda pair: pair[1])
sorted_accu_labels_set = dict(ascending_by_count)
print(sorted_accu_labels_set)

In [None]:
# Average the per-charge HARNN metrics over the `top` most frequent charges.
print(len(sorted_accu_labels_set))
top = 30

# sorted_accu_labels_set is ordered by ascending training frequency, so the most
# frequent charges sit at the tail. Slice relative to the actual number of labels
# instead of a hard-coded 119 so the selection stays correct if the label set
# changes; max(..., 0) keeps the start index valid when fewer than `top` exist.
ordered_labels = list(sorted_accu_labels_set)
top_labels = ordered_labels[max(len(ordered_labels) - top, 0):]

p, r, f1 = 0, 0, 0
for accu_label in top_labels:
    # The first three stored fields are read as precision, recall and f1.
    metrics = res[accu_label]
    p += float(metrics[0])
    r += float(metrics[1])
    f1 += float(metrics[2])

# Divide by the number of labels actually averaged (equals `top` when enough exist).
n_averaged = len(top_labels)
if n_averaged:
    print(f"top {top}, average precision: {p/n_averaged}, recall: {r/n_averaged}, f1: {f1/n_averaged}")
else:
    print(f"top {top}: no labels available to average")

In [None]:
import numpy as np
# Rebuild each training document as a flat list of words for BM25 indexing.
print(train_data.keys())

# True wherever the token id differs from 164672.
# NOTE(review): 164672 is presumably the padding ('BLANK') id — confirm against word2id_dict.
mask = np.array(train_data['fact_list']) != 164672
# Number of non-164672 tokens in every sentence of every document.
seq_len = mask.sum(2)
print("seq len:", seq_len[0])
# A sentence counts as present when it has at least one such token.
sent_num_mask = seq_len != 0
sent_num = sent_num_mask.sum(1)
print("sent num:", sent_num[0])

train_corpus = []
for doc, doc_seq_len, cur_sent_num in zip(train_data['fact_list'], seq_len, sent_num):
    token_ids = []
    # Keep the first `cur_sent_num` sentences, each cut to its token count.
    # NOTE(review): this assumes padding only ever trails real content — confirm.
    for sent, cur_seq_len in zip(doc[:cur_sent_num], doc_seq_len):
        token_ids.extend(sent[:cur_seq_len])
    train_corpus.append([id2word_dict[token_id] for token_id in token_ids])

print("train_corpus size:", len(train_corpus))

In [None]:
# Build a gensim vocabulary and bag-of-words view of the training corpus.
# Neither is passed to the BM25 index below, which is built from the token lists.
dictionary = corpora.Dictionary(train_corpus)
corpus = list(map(dictionary.doc2bow, train_corpus))
# NOTE(review): `gensim.summarization` (and its bm25 module) was removed in
# gensim 4.0, so this cell needs gensim < 4 — confirm the pinned version.
bm25_obj = bm25.BM25(train_corpus)

In [None]:
# Pull the test-split fields and rebuild each test document as a flat word list.
test_facts = test_data["fact_list"]
test_accu_labels = test_data["accu_label_lists"]
test_law_labels = test_data["law_label_lists"]
test_term_labels = test_data["term_lists"]

# True wherever the token id differs from 164672.
# NOTE(review): 164672 is presumably the padding ('BLANK') id — confirm against word2id_dict.
mask = np.array(test_facts) != 164672
# Number of non-164672 tokens in every sentence of every document.
seq_len = mask.sum(2)
print("seq len:", seq_len[0])
# A sentence counts as present when it has at least one such token.
sent_num_mask = seq_len != 0
sent_num = sent_num_mask.sum(1)
print("sent num:", sent_num[0])

test_corpus = []
for doc, doc_seq_len, cur_sent_num in zip(test_facts, seq_len, sent_num):
    token_ids = []
    # Keep the first `cur_sent_num` sentences, each cut to its token count.
    # NOTE(review): this assumes padding only ever trails real content — confirm.
    for sent, cur_seq_len in zip(doc[:cur_sent_num], doc_seq_len):
        token_ids.extend(sent[:cur_seq_len])
    test_corpus.append([id2word_dict[token_id] for token_id in token_ids])

print("test_corpus size:", len(test_corpus))


In [None]:
# Score one test document against the BM25 index over the training corpus and
# keep the ten highest-scoring training indices.
query_index = 21264
query_text = test_corpus[query_index]
print("query text:", query_text)
print(f"query accu label: {test_accu_labels[query_index]}, law label: {test_law_labels[query_index]}, term label: {test_term_labels[query_index]}")

scores = bm25_obj.get_scores(query_text)
# Document indices ordered by ascending score; the stable sort keeps index order on ties.
ranked_doc_ids = sorted(range(len(scores)), key=scores.__getitem__)
# The tail holds the ten best matches, with the best one last.
best_docs = ranked_doc_ids[-10:]
print("best docs:", best_docs)

In [None]:
# Short aliases for the three label lists of the training split.
train_accu_labels, train_law_labels, train_term_labels = (
    train_data[field]
    for field in ("accu_label_lists", "law_label_lists", "term_lists")
)


In [None]:
# Inspect one training document by index, together with its three labels.
# NOTE(review): 62983 is presumably one of the BM25 hits printed as "best docs"
# for the query above — confirm, since the index is hard-coded.
retr_index = 62983
retr_fact = train_corpus[retr_index]
print("retrieved text:", retr_fact)
print(f"retrieved accu label: {train_accu_labels[retr_index]}, law label: {train_law_labels[retr_index]}, term label: {train_term_labels[retr_index]}")

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load a tokenizer from a local checkpoint directory (no hub download).
# NOTE(review): the absolute path is machine-specific — consider a configurable model dir.
bert_tokenizer = AutoTokenizer.from_pretrained("/data/ganleilei/bert/bert-base-chinese/")

In [None]:
# Sanity check: tokenize a short phrase that contains a number and print the
# tokens that the produced input ids map back to.
seg_texts = bert_tokenizer("价值1150元")
print(bert_tokenizer.convert_ids_to_tokens(seg_texts["input_ids"]))

In [None]:
import pickle as pk
import json
from tqdm import tqdm
# Convert LADAN label indices to NeurJudge label indices and re-export every
# split with facts flattened to a fixed-length token sequence.


def _build_index_mapping(label_path, name2id):
    """Map line numbers of ``label_path`` to NeurJudge ids.

    Line ``i`` (0-based) of the file is mapped to ``name2id[name]`` where
    ``name`` is the stripped line; lines whose name is not a key of
    ``name2id`` are left out of the mapping.
    """
    mapping = {}
    with open(label_path, 'r') as label_file:
        for line_no, line in enumerate(label_file):
            name = line.strip()
            if name in name2id:
                mapping[line_no] = name2id[name]
    return mapping


with open('/data/ganleilei/law/ContrastiveLJP/NeurJudge_config_data/charge2id.json') as charge_file:
    neur_judge_charge2id = json.load(charge_file)
ladan_to_neurjudge = _build_index_mapping(
    '/data/ganleilei/workspace/ContrastiveLJP/data/new_big_accu.txt', neur_judge_charge2id)
print(ladan_to_neurjudge)

with open('/data/ganleilei/law/ContrastiveLJP/NeurJudge_config_data/article2id.json') as law_file:
    neur_judge_law2id = json.load(law_file)
print(neur_judge_law2id)
ladan_to_neurjudge_law = _build_index_mapping(
    '/data/ganleilei/workspace/ContrastiveLJP/data/new_big_law.txt', neur_judge_law2id)
print(ladan_to_neurjudge_law)
###############################################

# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open('/data/ganleilei/law/ContrastiveLJP/w2id_thulac.pkl', 'rb') as vocab_file:
    word2id_dict = pk.load(vocab_file)
print(len(word2id_dict))
id2word_dict = {word_id: word for word, word_id in word2id_dict.items()}

# Every fact is padded / truncated to this many token ids.
MAX_FACT_LEN = 300
# NOTE(review): only the first 100 examples of each split are converted and the
# output is written with a ".bak" suffix, which looks like a debugging run.
# Set this to None to convert whole splits.
MAX_EXAMPLES_PER_SPLIT = 100
blank_id = word2id_dict['BLANK']

file_list = ["train", "valid", "test"]
#file_list = ["test"]
for file in file_list:
    fact_lists = []
    law_label_lists = []
    accu_label_lists = []
    # Per-example side fields. They are filtered together with the facts so that
    # index i refers to the same example in every list of the saved dict.
    # NOTE(review): money_amount_lists / drug_weight_lists are assumed to hold
    # one entry per example, like term_lists and raw_facts_list — confirm.
    term_lists = []
    raw_facts = []
    money_amount_lists = []
    drug_weight_lists = []

    src_path = '/data/ganleilei/law/ContrastiveLJP/big/{}_processed_thulac_Legal_basis_with_fyb_annotate_number_field.pkl'.format(file)
    with open(src_path, 'rb') as src_file:
        split_data = pk.load(src_file)
    print(split_data.keys())

    for idx, fact in enumerate(tqdm(split_data["fact_list"][:MAX_EXAMPLES_PER_SPLIT])):
        # Skip examples whose law article or charge has no NeurJudge counterpart.
        if split_data['law_label_lists'][idx] not in ladan_to_neurjudge_law or split_data['accu_label_lists'][idx] not in ladan_to_neurjudge:
            continue
        law_label = split_data['law_label_lists'][idx]
        accu_label = split_data['accu_label_lists'][idx]

        print("raw fact:", split_data['raw_facts_list'][idx])
        print("law label lists:", law_label)
        print("law label lists:", ladan_to_neurjudge_law[law_label])

        print("accu label lists:", accu_label)
        print("accu label lists:", ladan_to_neurjudge[accu_label])

        law_label_lists.append(ladan_to_neurjudge_law[law_label])
        accu_label_lists.append(ladan_to_neurjudge[accu_label])
        term_lists.append(split_data["term_lists"][idx])
        raw_facts.append(split_data["raw_facts_list"][idx])
        money_amount_lists.append(split_data["money_amount_lists"][idx])
        drug_weight_lists.append(split_data["drug_weight_lists"][idx])

        # Flatten the per-sentence id arrays into one sequence without BLANK ids,
        # then truncate and right-pad with BLANK to exactly MAX_FACT_LEN ids.
        sentence = [token_id for sent in fact for token_id in sent.tolist() if token_id != blank_id]
        sentence = sentence[:MAX_FACT_LEN]
        sentence = sentence + [blank_id] * (MAX_FACT_LEN - len(sentence))

        fact_lists.append(sentence)

    data_dict = {'fact_list': fact_lists, 'law_label_lists': law_label_lists,
                 'accu_label_lists': accu_label_lists, 'term_lists': term_lists, 'raw_facts_list': raw_facts,
                 'money_amount_lists': money_amount_lists, 'drug_weight_lists': drug_weight_lists}

    dst_path = '/data/ganleilei/law/ContrastiveLJP/big/NeurJudge/{}_processed_thulac_Legal_basis_with_fyb_annotate_number_field.pkl.bak'.format(file)
    # Close the output file explicitly so the pickle is fully flushed to disk.
    with open(dst_path, 'wb') as dst_file:
        pk.dump(data_dict, dst_file)
    print('{}_dataset is processed over'.format(file)+'\n')

In [3]:
# Print the charge names behind the "number sensitive" class indices.
charges =  [83, 11, 55, 16, 37, 102, 52, 107, 61, 12, 58, 75, 78, 38, 69, 60, 54, 94, 110, 88, 19, 30, 59, 26, 51, 118, 86, 49, 7] # number sensitive classes
# Line i of the file is looked up as the name of class index i.
# Read it inside a context manager so the handle is closed right away.
with open('data/new_accu.txt', 'r') as accu_file:
    lines = accu_file.readlines()
print([lines[t].strip() for t in charges])

['生产、销售伪劣产品', '合同诈骗', '诈骗', '持有、使用假币', '行贿', '虚开增值税专用发票、用于骗取出口退税、抵扣税款发票', '侵犯著作权', '信用卡诈骗', '抢夺', '抢劫', '挪用资金', '挪用公款', '故意毁坏财物', '非法吸收公众存款', '集资诈骗', '出售、购买、运输假币', '贷款诈骗', '保险诈骗', '盗窃', '持有伪造的发票', '违法发放贷款', '骗取贷款、票据承兑、金融票证', '非法收购、运输盗伐、滥伐的林木', '对非国家工作人员行贿', '票据诈骗', '职务侵占', '贪污', '走私普通货物、物品', '销售假冒注册商标的商品']


In [2]:
import os
def load_dataset(path):
    """Load the pickled train / valid / test splits stored under ``path``.

    Each split is read from
    ``<split>_processed_thulac_Legal_basis_with_fyb_annotate_number_field.pkl``.
    The first raw fact of the training split and the number of its law labels
    are printed as a quick sanity check.

    Args:
        path: directory that contains the three pickle files.

    Returns:
        Tuple ``(train_dataset, valid_dataset, test_dataset)`` of the unpickled
        objects; the training object must support ``['raw_facts_list']`` and
        ``['law_label_lists']`` lookups.

    Raises:
        OSError: if one of the split files cannot be opened.
    """
    file_suffix = "_processed_thulac_Legal_basis_with_fyb_annotate_number_field.pkl"

    loaded = []
    for split in ("train", "valid", "test"):
        split_path = os.path.join(path, split + file_suffix)
        # The context manager closes each file once it has been read.
        # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
        with open(split_path, mode='rb') as split_file:
            loaded.append(pickle.load(split_file))
    train_dataset, valid_dataset, test_dataset = loaded

    print("train dataset sample:", train_dataset['raw_facts_list'][0])
    print("train dataset sample len:", len(train_dataset['law_label_lists']))
    return train_dataset, valid_dataset, test_dataset

In [4]:
# Load the fyb-annotated splits used by the statistics cells that follow.
path = "/data/ganleilei/law/ContrastiveLJP/datasets/fyb_annotate/"
train_dataset, valid_dataset, test_dataset = load_dataset(path)
# Print the number of charge labels rather than the list itself: the list has
# one entry per training example, and the recorded output of this cell is the
# count (101619), not the list contents.
print(len(train_dataset['accu_label_lists']))

train dataset sample: 2014年4月19日下午16时许，被告人段某驾拖车经过鸡飞乡澡塘街子，时逢堵车，段某将车停在“冰凉一夏”冷饮店门口，被害人王某的侄子王2某示意段某靠边未果，后上前敲打车门让段某离开，段某遂驾车离开，但对此心生怨愤。同年4月21日22时许，被告人段某酒后与其妻子王1某一起准备回家，走到鸡飞乡澡塘街富达通讯手机店门口时停下，段某进入手机店内对被害人王某进行吼骂，紧接着从手机店出来拿得一个石头又冲进手机店内朝王某头部打去，致王某右额部粉碎性骨折、右眼眶骨骨折。经鉴定，被害人王某此次损伤程度为轻伤一级。
train dataset sample len: 101619
101619


In [5]:
# Share of training examples whose charge label falls into each class group.
num_charges =  [83, 11, 55, 16, 37, 102, 52, 107, 61, 12, 58, 75, 78, 38, 69, 60, 54, 94, 110, 88, 19, 30, 59, 26, 51, 118, 86, 49, 7] # number sensitive classes
# NOTE(review): presumably the "confusing" target classes — confirm the meaning of this list.
conf_target_classes = [1, 3, 5, 6, 11, 12, 15, 18, 22, 24, 25, 26, 27, 30, 33, 38, 42, 44, 45, 48, 54, 55, 61, 68, 69, 74, 77, 78, 79, 82, 86, 91, 93, 100, 105, 108, 110, 111, 112, 113, 118]

# Sets give constant-time membership checks inside the loop.
num_charge_set = set(num_charges)
conf_target_set = set(conf_target_classes)

num_count, conf_count = 0, 0
for label in train_dataset['accu_label_lists']:
    if label in num_charge_set:
        num_count += 1
    if label in conf_target_set:
        conf_count += 1

# Use the real dataset size as the denominator instead of a hard-coded 101619,
# so the ratios stay correct if the dataset changes. An empty dataset reports
# ratios of 0 instead of raising ZeroDivisionError.
total = len(train_dataset['accu_label_lists'])
num_ratio = num_count / total if total else 0.0
conf_ratio = conf_count / total if total else 0.0

print("%d, %.4f" % (num_count, num_ratio))
print("%d, %.4f" % (conf_count, conf_ratio))

22608, 0.2225
37727, 0.3713
