In [1]:
# !pip install transformers torch shap
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import shap
import json
import csv
import pandas as pd
import re

In [2]:
# CSV of requirement texts with concern columns; read later with the columns
# RequirementText, Function, Data and Behavior
data_path = 'textLabel.csv'
# Annotation CSV that is parsed by readLabel
labeled_path = 'output1.csv'
# Semicolon-separated CSV read with the columns RequirementText and NFR
promise_path = 'promise_nfr.csv'

In [3]:
# Load only the requirement text and the NFR column from the semicolon-separated file
data = pd.read_csv(promise_path,sep=';',usecols=["RequirementText","NFR"])

In [4]:
# Split the dataset: 80% training, 20% validation.
# random_state is fixed so that Restart & Run All reproduces the same split.
train_text, val_text, train_labels, val_labels = train_test_split(
    data['RequirementText'], data['NFR'], test_size=0.2, random_state=42
)
# Full-range slice: keeps every training row as-is (narrow the slice to subsample).
train_text = train_text.iloc[:]

**DistilBERT**

In [5]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased").to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

In [6]:
# use the custom function
import scipy as sp
def f(x):
    """
    Score a batch of texts with the classification model, in logit space.

    Each text is tokenized (padded/truncated to 128 tokens), passed through
    `model`, the outputs are turned into softmax probabilities and those
    probabilities are mapped through the logit function.

    Args:
    x: iterable of str -- the texts to score.

    Returns:
    numpy.ndarray -- logit-transformed class probabilities, one row per text
    and one column per model output.
    """
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    token_ids = torch.tensor(
        [tokenizer.encode(text, padding='max_length', max_length=128, truncation=True) for text in x],
        device=device,
    )
    # Padding positions must not be attended to.
    attention_mask = (token_ids != tokenizer.pad_token_id).type(torch.int64)
    # Inference only: skip building the autograd graph for every SHAP batch.
    with torch.no_grad():
        outputs = model(token_ids, attention_mask=attention_mask)[0].cpu().numpy()
    # Numerically stable softmax: shifting by the row maximum leaves the result
    # unchanged but keeps np.exp from overflowing on large outputs.
    shifted = outputs - outputs.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    scores = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
    return sp.special.logit(scores)

In [7]:
#Compute the important features; return each feature together with its importance
def important(shap_values):
    """
    Select, for every sample, the features whose SHAP value stands out.

    Only output column 0 of the SHAP values is considered. A feature is kept
    when the absolute value of its SHAP value is at least the mean absolute
    SHAP value of that sample.

    Args:
    shap_values: iterable of explanation objects -- each item exposes `.values` (one row of SHAP values per feature) and `.data` (the matching feature names/tokens).

    Returns:
    List[List[List]] -- one list per sample; each entry is a two-element list `[feature, shap_value]` in the original feature order. A sample without any feature yields an empty list.
    """
    reason = []
    for ele in shap_values:
        # SHAP value of every feature for output column 0
        first_output = [row[0] for row in ele.values]
        if not first_output:
            # No features: nothing can be important (avoids dividing by zero)
            reason.append([])
            continue
        threshold = sum(abs(value) for value in first_output) / len(first_output)
        reason.append(
            [[feature, value]
             for value, feature in zip(first_output, ele.data)
             if abs(value) >= threshold]
        )
    return reason

In [8]:
#Read a CSV file
def read_file(file_name):
    """
    Read a UTF-8 CSV file and return all of its rows.

    Args:
    file_name: str -- path of the CSV file to read.

    Returns:
    List[List[str]] -- every row of the file (header included), each row being the list of its string fields.
    """
    # The context manager closes the file even when parsing fails; newline=""
    # lets the csv module handle line endings itself, which it needs to parse
    # quoted fields that contain line breaks.
    with open(file_name, encoding="utf-8", newline="") as csv_file:
        return list(csv.reader(csv_file))

In [9]:
#Read the annotation file and return the concerns
def readLabel(file_path):
    """
    Extract the annotated words and their labels from an annotation CSV file.

    The first row is skipped. In every remaining row each cell is searched
    for a `"text": "..."` fragment and for a `"labels": ["..."]` fragment;
    the first hit of each kind per cell is collected.

    Args:
    file_path: str -- path of the annotation CSV file (presumably each cell holds a JSON-formatted annotation -- TODO confirm against the export format).

    Returns:
    Tuple[List[List[str]], List[List[str]]] -- `(words, labels)`; both lists hold one entry per data row: the extracted texts and the extracted label names of that row.
    """
    # NOTE(review): `(.*)` is greedy, so the captured text runs up to the LAST
    # double quote in the cell -- confirm this is intended for the real data.
    text_pattern = r'"text": "(.*)"'
    label_pattern = r'"labels": \["(\w+)"\]'

    words, labels = [], []
    # Data starts on the second row; the first one is skipped
    for row in read_file(file_path)[1:]:
        cells = [cell.strip() for cell in row]
        text_hits = [re.search(text_pattern, cell) for cell in cells]
        label_hits = [re.search(label_pattern, cell) for cell in cells]
        words.append([hit.group(1).strip() for hit in text_hits if hit])
        labels.append([hit.group(1).strip() for hit in label_hits if hit])
    return words, labels

In [10]:
def shapValues(data):
    """
    Explain the predictions of `f` for a batch of texts with SHAP.

    Args:
    data: sequence of str -- the text samples to explain (the notebook passes a pandas Series of requirement texts).

    Returns:
    The object returned by calling the shap.Explainer on `data`, holding the SHAP values of every sample.
    """
    # `f` is the prediction function, the tokenizer is handed to SHAP as the
    # second argument, and the two outputs are named FR and NFR.
    text_explainer = shap.Explainer(f, tokenizer, output_names=["FR", "NFR"])
    explanation = text_explainer(data)
    return explanation

In [11]:
# Load the requirement texts together with the Function / Data / Behavior columns
concernsData = pd.read_csv(data_path,sep=',',usecols=["RequirementText","Function","Data","Behavior"])
# Only the requirement text is handed to the explainer
sample = concernsData['RequirementText']
# SHAP explanation for every requirement text
shap_values = shapValues(sample)
# Per sample: the features whose |SHAP value| reaches that sample's mean |SHAP value|
importance = important(shap_values)
# Annotated words and their labels parsed from the labeled file
words,labs = readLabel(labeled_path)

Partition explainer: 51it [01:13,  1.52s/it]                        


评估特征

In [12]:
#Detect anomalous features
def errCheck(importance,words):
    """
    Report, per sample, the important features that match none of the concerns.

    A feature counts as matched when its whitespace-stripped name is a
    substring of at least one concern string of the same sample. Samples are
    paired positionally; pairing stops at the shorter of the two inputs.

    Args:
    importance: List[List[[str, float]]] -- one list per sample holding `[feature_name, weight]` entries.
    words: List[List[str]] -- one list per sample holding the concern strings of that sample.

    Returns:
    List[List[str]] -- one list per sample with the stripped names of the features that were not found in any concern.
    """
    def _unmatched(features, concerns):
        # Feature names are stripped of surrounding whitespace before the lookup
        stripped = (entry[0].strip() for entry in features)
        return [name for name in stripped
                if not any(name in concern for concern in concerns)]

    return [_unmatched(features, concerns)
            for features, concerns in zip(importance, words)]

In [14]:
# Important features that do not occur in any annotated word of their sample
errlist = errCheck(importance,words)
# Inspect the requirement stored under index label 0 and the features flagged for the first sample
print(sample[0])
print(errlist[0])

The product shall ensure that only supervisors can view schedule of all callers.The product must ensure that supervisors are allowed to access advertise empty time slots.'


['product', 'that', 'can', 'of', 'product', 'must', 'are', 'to', "'"]