In [1]:
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Toy corpus: a single sentence is enough to demonstrate the pipeline.
text_data = "This is a sample text for Word2Vec model training with NLTK and Gensim."

# English stopword list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Lower-case and tokenize, then keep only alphanumeric tokens that are not stopwords.
words = word_tokenize(text_data.lower())
filtered_words = list(
    filter(lambda token: token not in stop_words and token.isalnum(), words)
)

# Train a Word2Vec model with Gensim; the corpus is a list holding one tokenized sentence.
# min_count=1 keeps every token, since each one occurs only once in this tiny corpus.
model = Word2Vec(
    sentences=[filtered_words],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Query the trained model.
vector = model.wv['sample']  # embedding vector of the word 'sample'
similar_words = model.wv.most_similar('text')  # words most similar to 'text'

print(vector)

print(similar_words)

[ 8.13227147e-03 -4.45733406e-03 -1.06835726e-03  1.00636482e-03
 -1.91113955e-04  1.14817743e-03  6.11386076e-03 -2.02715401e-05
 -3.24596534e-03 -1.51072862e-03  5.89729892e-03  1.51410222e-03
 -7.24261976e-04  9.33324732e-03 -4.92128357e-03 -8.38409644e-04
  9.17541143e-03  6.74942741e-03  1.50285603e-03 -8.88256077e-03
  1.14874600e-03 -2.28825561e-03  9.36823711e-03  1.20992784e-03
  1.49006362e-03  2.40640994e-03 -1.83600665e-03 -4.99963388e-03
  2.32429506e-04 -2.01418041e-03  6.60093315e-03  8.94012302e-03
 -6.74754381e-04  2.97701475e-03 -6.10765442e-03  1.69932481e-03
 -6.92623248e-03 -8.69402662e-03 -5.90020278e-03 -8.95647518e-03
  7.27759488e-03 -5.77203138e-03  8.27635173e-03 -7.24354526e-03
  3.42167495e-03  9.67499893e-03 -7.78544787e-03 -9.94505733e-03
 -4.32914635e-03 -2.68313056e-03 -2.71289347e-04 -8.83155130e-03
 -8.61755759e-03  2.80021061e-03 -8.20640661e-03 -9.06933658e-03
 -2.34046578e-03 -8.63180775e-03 -7.05664977e-03 -8.40115082e-03
 -3.01328895e-04 -4.56429

In [20]:
# Print the texts handed to the SHAP explainer. `texts` is assigned in the
# explainer cell (texts = loaded_list[:2]), so that cell must have run first.
print(texts)



In [34]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
import shap


# Choose the compute device: prefer a CUDA GPU, fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained sentiment model and its tokenizer from the same checkpoint.
CHECKPOINT = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)
model.to(device)  # move the model onto the selected device

def predict_sentiment(text):
    """Return softmax class probabilities from the sentiment model.

    Args:
        text: A single string, a list of strings, or a NumPy array of
            strings (arrays are converted with ``tolist()`` first).

    Returns:
        numpy.ndarray: Softmax over the last axis of the model's logits,
        copied back to the CPU. Presumably one row per input text and one
        column per class -- confirm against the model's output shape.

    Raises:
        ValueError: If ``text`` is not a string or a non-empty list of
            strings. Empty lists/arrays are rejected here instead of being
            forwarded to the tokenizer.
    """
    if isinstance(text, np.ndarray):
        text = text.tolist()

    if isinstance(text, str):
        text = [text]
    elif (
        not isinstance(text, list)
        or not text  # all([]) is True, so an empty list needs its own check
        or not all(isinstance(t, str) for t in text)
    ):
        raise ValueError("The input must be a non-empty list of strings.")

    # Tokenize as one padded batch; inputs longer than 512 tokens are truncated.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Move every input tensor to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradient tracking is needed.
    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Bring the result back to the CPU so it can be converted to NumPy.
    return probabilities.cpu().numpy()

def format_shap_output(shap_values):
    """Group absolute SHAP values by token across all explained samples.

    For each sample index ``i``, the tokens in ``shap_values.data[i]`` are
    paired with the rows of ``shap_values.values[i]``. Tokens that are empty
    after ``strip()`` are skipped; for every other token the absolute value
    of the first element of its row is recorded.

    Args:
        shap_values: Object exposing parallel ``.data`` and ``.values``
            sequences (presumably a ``shap.Explanation`` -- confirm).

    Returns:
        dict: Maps each token string (kept exactly as found in ``.data``,
        including trailing whitespace) to the list of recorded values, in
        encounter order.
    """
    collected = {}

    for row, token_scores in enumerate(shap_values.values):
        tokens = shap_values.data[row]

        for token, scores in zip(tokens, token_scores):
            # Only tokens with visible content are kept.
            if token.strip() != '':
                # Magnitude of the first entry of this token's value row.
                collected.setdefault(token, []).append(abs(scores[0]))

    return collected

def average_shap_values(shap_dict):
    """Replace each word's list of values with its arithmetic mean.

    Args:
        shap_dict: Mapping from word to a non-empty list of numeric values.

    Returns:
        dict: A new mapping with the same keys, in the same order, whose
        values are ``sum(values) / len(values)``.

    Raises:
        ZeroDivisionError: If any list in ``shap_dict`` is empty.
    """
    return {
        token: sum(scores) / len(scores)
        for token, scores in shap_dict.items()
    }


# Wrap predict_sentiment in a SHAP explainer; the tokenizer is handed over as
# the second argument (presumably SHAP's text masker -- confirm against the
# shap.Explainer signature).
# NOTE: predict_sentiment moves its inputs to `device`, so the model call runs
# on the GPU when one is available, not necessarily on the CPU.
explainer = shap.Explainer(predict_sentiment, tokenizer)

# Explain only the first two entries of loaded_list. loaded_list is read from
# texts.pkl in a separate cell, which must be run before this one.
texts = loaded_list[:2]
shap_values = explainer(texts)
    

# Collapse the explanation into {token: mean absolute first-column SHAP value}
# and print the result.
formatted_output = format_shap_output(shap_values)
final_output = average_shap_values(formatted_output)
print(final_output)

{'I ': 0.0066507097333669656, 'was ': 0.008087154710665345, 'a ': 0.015024118336168164, 'state ': 0.015717149595730007, 'champion ': 0.07220575853716582, 'debate': 0.02634751098230481, 'r ': 0.006576301995664835, 'in ': 0.0024457303807139397, 'high ': 0.0017532678321003914, 'school ': 0.016099850879982114, 'and ': 0.005540101423279942, 'debated ': 0.003138192929327488, 'college': 0.0033451293129473925, '. ': 0.009413322705857029, 'The ': 0.017459155526012182, 'lack ': 0.15240813931450248, 'of ': 0.005184737528651201, 'manners ': 0.003655369859188795, 'et': 0.004381593646636853, 'ique': 0.004590899082055936, 'tte ': 0.0006861519844581684, 'Trump ': 0.009330650752720732, 'showed ': 0.009330650752720732, 'tonight ': 0.0096994365643089, 'is ': 0.01854309054459633, 'ab': 0.05136215287105491, 'hor': 0.04505466502935936, 'rent': 0.028074485793088872, 'Regardless ': 0.02079591908388668, 'your ': 0.007183378722725643, 'political ': 0.006810125589577688, 'ideology': 0.028376081420315637, ', ': 0

In [10]:
# Display the explanation object returned by explainer(texts).
shap_values

.values =
array([array([[-7.44603312e-09,  3.72529030e-09],
              [-1.31278692e-02,  1.31278839e-02],
              [-3.79765582e-02,  3.79765662e-02],
              [-2.10849993e-02,  2.10850029e-02],
              [-2.72199114e-01,  2.72199103e-01],
              [-4.15091405e-03,  4.15091217e-03],
              [-9.85267476e-02,  9.85267549e-02],
              [-6.60143618e-02,  6.60143816e-02],
              [-2.00773300e-01,  2.00773298e-01],
              [-1.45475690e-02,  1.45475687e-02],
              [ 3.14730415e-02, -3.14730555e-02],
              [-2.97404767e-08,  2.23517418e-08]]),
       array([[-1.11758709e-08,  6.07542461e-09],
              [-9.58597735e-02,  9.58597800e-02],
              [-7.43020299e-02,  7.43020371e-02],
              [-3.85644672e-02,  3.85644750e-02],
              [ 4.46814919e-01, -4.46814917e-01],
              [-1.67025030e-02,  1.67025147e-02],
              [ 4.45553660e-03, -4.45554358e-03],
              [ 5.03658056e-02, -5.036

In [11]:
# Scratch check: the literal 1e-4 evaluates to 0.0001.
1e-4

0.0001

In [None]:
{
to: 20%, [1e-9,1e-8........]
    
word: frequency, contribution
    
}

High frequency, high contribution
High frequency, low contribution (most stopwords are here)
Low frequency, high contribution
Low frequency, low contribution

-7e-09, 7e-09 = 0
-7e-02, 7e-02 = 0

In [7]:
import pickle

# Open the pickle file in binary mode and read the stored object back.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('texts.pkl', 'rb') as pickle_file:
    loaded_list = pickle.load(pickle_file)

print(loaded_list)  # print the list that was just loaded




In [18]:
# Persist the averaged per-token values to disk as a pickle (binary mode).
with open('final_output.pkl', 'wb') as pickle_file:
    pickle.dump(final_output, pickle_file)

In [19]:
import json

filename = 'data.json'

# Write final_output as JSON; indent=4 pretty-prints it so it is easier to read.
with open(filename, 'w') as json_file:
    json.dump(final_output, json_file, indent=4)