In [57]:
# !pip install transformers torch shap
import pandas as pd
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import shap
import json
import csv
import pandas as pd
import re

In [58]:
# Input files, given relative to the notebook's working directory.
data_path = "textLabel.csv"        # requirement texts with Function/Data/Behavior columns
labeled_path = "output1.csv"       # annotation export parsed by readLabel
promise_path = "promise_nfr.csv"   # ';'-separated file with RequirementText and NFR columns

In [59]:
# Load only the requirement text and its NFR label from the ';'-separated file.
data = pd.read_csv(
    promise_path,
    sep=";",
    usecols=["RequirementText", "NFR"],
)

In [60]:
# Split the dataset: 80% for training, 20% for validation.
# A fixed random_state keeps the split identical across kernel restarts, so
# results produced further down the notebook stay reproducible.
train_text, val_text, train_labels, val_labels = train_test_split(
    data['RequirementText'],
    data['NFR'],
    test_size=0.2,
    random_state=42,
)

**DistilBERT**

In [61]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Place the model on the GPU when one is available and fall back to the CPU
# otherwise, so this cell no longer crashes on machines without CUDA.
# NOTE(review): this loads the base checkpoint, whose classification head is
# newly initialised (see the warning printed by this cell). No fine-tuning step
# is visible in this part of the notebook -- confirm trained weights are in
# place before interpreting any explanation of the model's outputs.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [62]:
# use the custom function
import scipy as sp
def f(x):
    """Score a batch of texts with the global model, in the form SHAP expects.

    Each text is encoded with the global ``tokenizer`` (padded/truncated to
    128 token ids), passed through the global ``model``, and the first element
    of the model output is turned into softmax probabilities and then into
    log-odds.

    Args:
        x: iterable of strings to score.

    Returns:
        numpy array with one row per text and one column per model output,
        holding ``logit(softmax(output))``.
    """
    # Follow the device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    tv = torch.tensor(
        [tokenizer.encode(text, padding='max_length', max_length=128, truncation=True) for text in x],
        device=device,
    )
    # Token id 0 is treated as padding and masked out of attention.
    attention_mask = (tv != 0).type(torch.int64)
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        outputs = model(tv, attention_mask=attention_mask)[0].cpu().numpy()
    # Numerically stable softmax: shifting by the row maximum leaves the result
    # unchanged mathematically but prevents exp() from overflowing.
    shifted = outputs - outputs.max(axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    scores = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
    val = sp.special.logit(scores)
    return val

In [118]:
# Compute the important features; return each feature with its importance.
def important(shap_values):
    """Select, per explanation, the tokens with above-average attribution.

    For every element of ``shap_values`` the mean absolute value of the first
    output column (``values[i][0]``) is computed, and every token whose
    absolute attribution is at least that mean is kept.

    Args:
        shap_values: iterable of objects exposing ``.values`` (sequence of
            per-token attribution rows) and ``.data`` (the matching tokens).

    Returns:
        A list with one entry per explanation; each entry is a list of
        ``[token, attribution]`` pairs in original token order. An explanation
        with no tokens yields an empty list.
    """
    reason = []
    for ele in shap_values:
        n_tokens = len(ele.values)
        if n_tokens == 0:
            # Nothing to average over; avoid dividing by zero.
            reason.append([])
            continue
        # Accumulate under a local name that does not shadow the builtin sum().
        total = 0
        for row in ele.values:
            total += abs(row[0])
        avg = total / n_tokens
        res = [
            [token, row[0]]
            for row, token in zip(ele.values, ele.data)
            if abs(row[0]) >= avg
        ]
        reason.append(res)
    return reason

In [80]:
# Read a CSV file.
def read_file(file_name):
    """Read a UTF-8 CSV file and return all of its rows.

    Args:
        file_name: path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of string cells (the header row,
        if any, is included as the first element).
    """
    # The context manager closes the handle even if parsing raises.
    # newline="" lets the csv module handle line endings itself, which keeps
    # newlines embedded in quoted fields intact.
    with open(file_name, encoding="utf-8", newline="") as csv_file:
        return list(csv.reader(csv_file))

In [134]:
# Read the annotation file and return the annotated concerns.
def readLabel(file_path):
    """Extract annotated text spans and their labels from an annotation CSV.

    The file is loaded with ``read_file``; the first row is skipped. Every
    remaining cell is searched for a ``"text": "..."`` fragment and a
    ``"labels": ["..."]`` fragment.

    Args:
        file_path: path of the annotation CSV.

    Returns:
        ``(words, labels)``: two lists with one entry per data row, holding the
        extracted text fragments and the extracted label names respectively.
    """
    rows = read_file(file_path)
    words = []
    labels = []
    # Data starts on the second row.
    for row in rows[1:]:
        row_texts = []
        row_labels = []
        for cell in row:
            stripped = cell.strip()
            # NOTE(review): `.*` is greedy, so the capture runs to the LAST
            # double quote in the cell -- confirm that matches the export format.
            text_hit = re.search(r'"text": "(.*)"', stripped)
            label_hit = re.search(r'"labels": \["(\w+)"\]', stripped)
            if text_hit is not None:
                row_texts.append(text_hit.group(1).strip())
            if label_hit is not None:
                row_labels.append(label_hit.group(1).strip())
        words.append(row_texts)
        labels.append(row_labels)
    return words, labels

In [66]:
def shapValues(data):
    """Explain ``data`` with a SHAP explainer built around ``f``.

    A new ``shap.Explainer`` is created on every call, using the global
    ``tokenizer`` as masker and naming the two outputs "FR" and "NFR".

    Args:
        data: the texts to explain; passed straight to the explainer.

    Returns:
        Whatever the explainer returns for ``data``.
    """
    return shap.Explainer(f, tokenizer, output_names=["FR","NFR"])(data)

In [69]:
# Load the requirement texts together with their Function/Data/Behavior columns.
concernsData = pd.read_csv(data_path,sep=',',usecols=["RequirementText","Function","Data","Behavior"])
# Only the raw requirement text is passed to the explainer.
sample = concernsData['RequirementText']
# SHAP explanations for every requirement text (this is the slow step of the cell).
shap_values = shapValues(sample)
# Per text: the tokens whose absolute attribution is at least the per-text mean.
importance = important(shap_values)
# Annotated text fragments and label names parsed from the annotation file.
words,labs = readLabel(labeled_path)

Partition explainer: 51it [01:04,  1.55s/it]                        


评估特征

In [191]:
# Detect anomalous features.
def errCheck(importance,words):
    """List the important tokens that do not occur in any annotated concern.

    Samples are paired positionally (``zip``), so extra entries in the longer
    argument are ignored. A token counts as covered when it is a substring of
    at least one concern string of the same sample.

    Args:
        importance: per sample, a list of ``[token, attribution]`` pairs.
        words: per sample, a list of annotated concern strings.

    Returns:
        A list with one entry per sample, holding the stripped tokens that were
        not found in any concern, in their original order.
    """
    errList = []
    # Walk the samples pairwise.
    for pairs, concerns in zip(importance, words):
        # Strip each important token before looking it up in the concerns.
        tokens = (pair[0].strip() for pair in pairs)
        uncovered = [
            token
            for token in tokens
            if not any(token in concern for concern in concerns)
        ]
        errList.append(uncovered)
    return errList

In [193]:
# Per sample: the important tokens that are not part of any annotated concern.
errlist = errCheck(importance,words)

[['the',
  'product',
  'shall',
  'that',
  'can',
  'the',
  'product',
  'must',
  'that',
  'are',
  "'"],
 ['only', 'can', '.'],
 ['the', 'product', 'shall', 'have', 'to', 'to', '.'],
 ['the',
  'product',
  'shall',
  'gui',
  'based',
  'monitoring',
  'services',
  '.',
  'system'],
 ['the', 'will', 'by'],
 ['leads', 'that', 'system', 'and'],
 ['users', 'shall', 'be', 'able', 'to', 'nfl', 'and', '.'],
 ['when', 'the', 'the', 'of', 'sunk', 'on', 'the'],
 ['the',
  'system',
  'will',
  'affected',
  'parties',
  'when',
  'occur',
  'including',
  'but',
  'and'],
 ['the', 'product', 'shall', 'whether', 'the', 'or'],
 ['the', 'will', 'by'],
 ['the', 'must', 'to', '.'],
 ['the', 'must', 'to', 'that', 'to', '.'],
 ['the', 'shall', 'based', 'on', 'the'],
 ['the', 'shall', 'that', 'can', '.', 'product', 'must', 'that', 'are', "'"],
 ['disputes',
  'must',
  'maintain',
  'that',
  '.',
  'this',
  'ensures',
  'arise',
  'with'],
 ['the', 'of', 'the', 'product', 'shall', 'or', '.'],