In [4]:
import os
import json
import random
import openai
import requests
from openai import OpenAI

client = OpenAI()

from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

# 设置 OpenAI API 密钥
openai.api_key = "your_openai_api_key"


class ChatGPT:
    def __init__(self, api_key, proxies=None):
        self.api_key = api_key
        self.proxies = proxies
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        self.url = "https://api.openai.com/v1/chat/completions"

    def ask(self, prompt):
        data = {
            "model": "gpt-3.5-turbo",  # 根据需要选择合适的模型
            "messages": [{"role": "user", "content": prompt}],
        }
        response = requests.post(
            self.url, headers=self.headers, data=json.dumps(data), proxies=self.proxies
        )
        return response.json()  # 返回JSON解析后的响应


def load_data(data_file):
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def create_prompt(example, labels, max_length=256):
    text = example["text"][:max_length]
    prompt = f"请对以下文本进行命名实体识别,将实体类型标注在相应实体的前面,用[]括起来。实体类型包括:{', '.join(labels)}\n\n文本:{text}\n\n标注结果:"
    return prompt


def extract_entities(response, labels):
    entities = []
    for label in labels:
        label_parts = response.split(f"[{label}]")
        if len(label_parts) > 1:
            for part in label_parts[1:]:
                entity = part.split("[")[0].strip()
                if entity:
                    start = text.find(entity)
                    end = start + len(entity)
                    entities.append((start, end, label))
    return entities


# def evaluate(true_labels, pred_labels, label_map):
#     true_labels = [label_map[label] for label in true_labels]
#     pred_labels = [label_map[label] for label in pred_labels]

#     precision = precision_score(true_labels, pred_labels, average="macro")
#     recall = recall_score(true_labels, pred_labels, average="macro")
#     f1 = f1_score(true_labels, pred_labels, average="macro")

#     return precision, recall, f1


def evaluate(true_labels, pred_labels, label_map):
    # 使用.get方法访问字典，并提供默认值以避免KeyError
    true_labels_mapped = [
        label_map.get(label, -1) for label in true_labels
    ]  # -1或其他值作为未知标签的标识
    pred_labels_mapped = [label_map.get(label, -1) for label in pred_labels]

    # 根据需要处理未知标签（例如：过滤掉标签为-1的项）
    # 注意：这可能需要根据具体情况调整
    true_labels_filtered = [label for label in true_labels_mapped if label != -1]
    pred_labels_filtered = [label for label in pred_labels_mapped if label != -1]

    precision = precision_score(
        true_labels_filtered, pred_labels_filtered, average="macro"
    )
    recall = recall_score(true_labels_filtered, pred_labels_filtered, average="macro")
    f1 = f1_score(true_labels_filtered, pred_labels_filtered, average="macro")

    return precision, recall, f1


# 加载数据集
train_file = "data/cluener/train_new.json"
dev_file = "data/cluener/dev_new.json"
train_data = load_data(train_file)
dev_data = load_data(dev_file)

# 设置标签
labels = [
    "address",
    "book",
    "company",
    "game",
    "government",
    "movie",
    "name",
    "organization",
    "position",
    "scene",
]
label_map = {label: i for i, label in enumerate(labels)}

# 设置 few-shot 示例数量
num_shots = 10

# 随机选择 few-shot 示例
random.shuffle(train_data)
few_shot_examples = train_data[:num_shots]

# 创建 few-shot 提示
few_shot_prompt = ""
for example in few_shot_examples:
    text = example["text"]
    label_entities = example["label"]
    labeled_text = text
    for label_type in label_entities:  # 这里是 'address', 'name' 等
        for entity, positions in label_entities[label_type].items():
            for position in positions:
                start, end = position
                entity_text = text[start : end + 1]  # 修正为正确的字符串截取
                labeled_text = labeled_text.replace(
                    entity_text, f"[{label_type}]{entity_text}[/{label_type}]"
                )
    few_shot_prompt += f"文本:{text}\n标注:{labeled_text}\n\n"

# 对测试集进行预测
true_labels = []
pred_labels = []

for example in tqdm(dev_data, desc="Evaluating"):
    text = example["text"]
    true_entities = example["label"]

    prompt = few_shot_prompt + create_prompt(example, labels)
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt="",
        temperature=1,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    pred_entity_text = response.choices[0].text.strip()
    pred_entities = extract_entities(pred_entity_text, labels)

    true_labels.extend([entity[2] for entity in true_entities])
    pred_labels.extend([entity[2] for entity in pred_entities])

# 评估模型性能
precision, recall, f1 = evaluate(true_labels, pred_labels, label_map)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

Evaluating:   0%|          | 1/1343 [00:07<2:53:02,  7.74s/it]


KeyboardInterrupt: 

In [5]:
import os
import json
import random
import openai
import requests
from openai import OpenAI

client = OpenAI()

from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

# 设置 OpenAI API 密钥
openai.api_key = "your_openai_api_key"


class ChatGPT:
    def __init__(self, api_key, proxies=None):
        self.api_key = api_key
        self.proxies = proxies
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        self.url = "https://api.openai.com/v1/chat/completions"

    def ask(self, prompt):
        data = {
            "model": "gpt-3.5-turbo",  # 根据需要选择合适的模型
            "messages": [{"role": "user", "content": prompt}],
        }
        response = requests.post(
            self.url, headers=self.headers, data=json.dumps(data), proxies=self.proxies
        )
        return response.json()  # 返回JSON解析后的响应


def load_data(data_file):
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data


def create_prompt(example, labels, max_length=256):
    text = example["text"][:max_length]
    prompt = f"请对以下文本进行命名实体识别,将实体类型标注在相应实体的前面,用[]括起来。实体类型包括:{', '.join(labels)}\n\n文本:{text}\n\n标注结果:"
    return prompt


def extract_entities(response, labels):
    entities = []
    for label in labels:
        label_parts = response.split(f"[{label}]")
        if len(label_parts) > 1:
            for part in label_parts[1:]:
                entity = part.split("[")[0].strip()
                if entity:
                    start = text.find(entity)
                    end = start + len(entity)
                    entities.append((start, end, label))
    return entities


# def evaluate(true_labels, pred_labels, label_map):
#     true_labels = [label_map[label] for label in true_labels]
#     pred_labels = [label_map[label] for label in pred_labels]

#     precision = precision_score(true_labels, pred_labels, average="macro")
#     recall = recall_score(true_labels, pred_labels, average="macro")
#     f1 = f1_score(true_labels, pred_labels, average="macro")

#     return precision, recall, f1


def evaluate(true_labels, pred_labels, label_map):
    # 使用.get方法访问字典，并提供默认值以避免KeyError
    true_labels_mapped = [
        label_map.get(label, -1) for label in true_labels
    ]  # -1或其他值作为未知标签的标识
    pred_labels_mapped = [label_map.get(label, -1) for label in pred_labels]

    # 根据需要处理未知标签（例如：过滤掉标签为-1的项）
    # 注意：这可能需要根据具体情况调整
    true_labels_filtered = [label for label in true_labels_mapped if label != -1]
    pred_labels_filtered = [label for label in pred_labels_mapped if label != -1]

    precision = precision_score(
        true_labels_filtered, pred_labels_filtered, average="macro"
    )
    recall = recall_score(true_labels_filtered, pred_labels_filtered, average="macro")
    f1 = f1_score(true_labels_filtered, pred_labels_filtered, average="macro")

    return precision, recall, f1


# 加载数据集
train_file = "data/cluener/train_new_1.json"
dev_file = "data/cluener/dev_new_1.json"
train_data = load_data(train_file)
dev_data = load_data(dev_file)

# 设置标签
# labels = [
#     "address",
#     "book",
#     "company",
#     "game",
#     "government",
#     "movie",
#     "name",
#     "organization",
#     "position",
#     "scene",
# ]

# 设置标签
labels = [
    "address",
    "book",
    "company",
    "game",
    "government",
    "movie",
    "name",
    "organization",
    "position",
    "scene",
]

label_map = {label: i for i, label in enumerate(labels)}

# 设置 few-shot 示例数量
num_shots = 10

# 随机选择 few-shot 示例
random.shuffle(train_data)
few_shot_examples = train_data[:num_shots]

# 创建 few-shot 提示
few_shot_prompt = ""
for example in few_shot_examples:
    text = example["text"]
    label_entities = example["label"]
    labeled_text = text
    for label_type in label_entities:  # 这里是 'address', 'name' 等
        for entity, positions in label_entities[label_type].items():
            for position in positions:
                start, end = position
                entity_text = text[start : end + 1]  # 修正为正确的字符串截取
                labeled_text = labeled_text.replace(
                    entity_text, f"[{label_type}]{entity_text}[/{label_type}]"
                )
    few_shot_prompt += f"文本:{text}\n标注:{labeled_text}\n\n"

# 对测试集进行预测
true_labels = []
pred_labels = []

for example in tqdm(dev_data, desc="Evaluating"):
    text = example["text"]
    true_entities = example["label"]

    prompt = few_shot_prompt + create_prompt(example, labels)
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt="",
        temperature=1,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    pred_entity_text = response.choices[0].text.strip()
    pred_entities = extract_entities(pred_entity_text, labels)

    true_labels.extend([entity[2] for entity in true_entities])
    pred_labels.extend([entity[2] for entity in pred_entities])

# 评估模型性能
precision, recall, f1 = evaluate(true_labels, pred_labels, label_map)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

Evaluating: 100%|██████████| 1/1 [00:04<00:00,  4.84s/it]

Precision: nan, Recall: nan, F1: nan





In [7]:
few_shot_examples = [
    {
        "text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，",
        "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}},
    },
    {"text": "生生不息CSOL生化狂潮让你填弹狂扫", "label": {"game": {"CSOL": [[4, 7]]}}},
]
# [
#     {
#         "text": "彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户，",
#         "label": {"address": {"台湾": [[15, 16]]}, "name": {"彭小军": [[0, 2]]}},
#     }
# ]


few_shot_prompt = ""

for example in few_shot_examples:
    text = example["text"]
    label_entities = example["label"]
    labeled_text = text
    for label_type in label_entities:  # 这里是 'address', 'name' 等
        for entity, positions in label_entities[label_type].items():
            for position in positions:
                start, end = position
                entity_text = text[start : end + 1]  # 修正为正确的字符串截取
                labeled_text = labeled_text.replace(
                    entity_text, f"[{label_type}]{entity_text}[/{label_type}]"
                )
    few_shot_prompt += f"文本:{text}\n标注:{labeled_text}\n\n"

print(few_shot_prompt)

文本:浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，
标注:[company]浙商银行[/company]企业信贷部[name]叶老桂[/name]博士则从另一个角度对五道门槛进行了解读。[name]叶老桂[/name]认为，对目前国内商业银行而言，

文本:生生不息CSOL生化狂潮让你填弹狂扫
标注:生生不息[game]CSOL[/game]生化狂潮让你填弹狂扫




In [12]:
from openai import OpenAI

client = OpenAI()

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt="列出人工智能领域的 2 个最新研究方向、研究内容、重要概念、重要学者、重要论文、重要出版物和重要研究机构。用列表形式显示出来。",
    temperature=1,
    max_tokens=1024,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

data = response.choices[0].text.strip()

print(data)

1. 语音识别（Speech Recognition）
- 研究方向：自然语言处理、深度学习、语音信号处理
- 研究内容：使用计算机识别和理解语音信号，将语音信号转换为文本形式。
- 重要概念：人工神经网络、声学模型、语言模型、语音识别系统
- 重要学者：Geoffrey Hinton、Yoshua Bengio、Andrew Ng、Li Deng
- 重要论文："Deep Speech: Scaling up end-to-end speech recognition" (Hinton et al., 2012)、"Speech Recognition with Deep Recurrent Neural Networks" (Graves et al., 2013)
- 重要出版物：IEEE Signal Processing Magazine、IEEE Transactions on Audio, Speech, and Language Processing
- 重要研究机构：Google Brain、Microsoft Research、Facebook AI Research

2. 人机协作（Human-Robot Collaboration）
- 研究方向：机器学习、人机交互、机器人技术
- 研究内容：研究如何让机器人与人类进行有效的协作，实现任务分工和共同工作。
- 重要概念：强化学习、协同控制、人机协作系统、反应式规划
- 重要学者：Stuart Russell、Pieter Abbeel、Maja Mataric、Cynthia Breazeal
- 重要论文："Cooperative multi-agent learning: the state of the art" (Lauer et al., 2000)、"Robot Learning from Demonstration: A Review" (Argall et al., 2009)
- 重要出版物：Frontiers in Robotics and AI、IEEE Transactions on Robotics
- 重要研究机构：OpenAI、Stanford Artificial Intelligence Laboratory、Massachusetts Institute 