In [None]:
!pip install openai rdflib spacy pyvis datasets scikit-learn matplotlib tqdm pandas

In [None]:
# Import necessary libraries
import os
import re
import json
from collections import Counter
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import pandas as pd
import time

# NLP and KG libraries
import spacy
from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, XSD, SKOS # 添加SKOS用于altLabel

# OpenAI client for LLM
from openai import OpenAI

# Visualization
from pyvis.network import Network

# Hugging Face datasets library
from datasets import load_dataset

# For embedding similarity
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pinning a specific dataset version helps keep results consistent between runs
cnn_dm_dataset=load_dataset("cnn_dailymail", "3.0.0")

In [None]:
# Add up the number of records across the three dataset splits
total_records = sum(
    len(cnn_dm_dataset[split_name])
    for split_name in ("train", "validation", "test")
)

# Report the total, then show one raw record from the training split
print(f"Total number of records in the dataset: {total_records}\n")
print("Sample record from the training dataset:")
print(cnn_dm_dataset["train"][0])


# #### OUTPUT ####
# Total number of records in the dataset: 311971

# Sample record from the training dataset:
# {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel ...'} 

In [None]:
# Keywords related to technology-company acquisitions
ACQUISITION_KEYWORDS= ["acquire", "acquisition", "merger", "buyout", "purchased by", "acquired by", "takeover"]
# NOTE(review): TECH_KEYWORDS is not referenced by any code visible in this notebook —
# confirm whether a technology filter was meant to be applied as well.
TECH_KEYWORDS= ["technology", "software", "startup", "app", "platform", "digital", "AI", "cloud"]

In [None]:
# Work with the training split only
cnn_dm_dataset_train = cnn_dm_dataset['train']

# Lower-case the keywords once instead of once per article
acquisition_keywords_lower = [keyword.lower() for keyword in ACQUISITION_KEYWORDS]

# Keep every article whose text mentions at least one acquisition keyword.
# The match is a case-insensitive substring test, so e.g. "acquire" also
# matches "acquired".
filtered_articles = []
for record in cnn_dm_dataset_train:
    article_lower = record['article'].lower()
    # any() stops at the first keyword that matches
    if any(keyword in article_lower for keyword in acquisition_keywords_lower):
        filtered_articles.append(record)

In [None]:
# Print the total number of filtered articles
print(f"Total number of filtered articles: {len(filtered_articles)}")  
# Print one sample filtered article (index 0 assumes the filter kept at least one article)
print("\nSample of a filtered article:")  
print(filtered_articles[0]['article'])  
### OUTPUT #### 

# Sample of a filtered article:
# SAN DIEGO, California (CNN) -- You must know whats really driving the 
# immigration debate ...

In [None]:
# Cleaned copy of every filtered article: id, normalised text and summary
cleaned_articles = []

for record in filtered_articles:
    text = record['article']

    # Basic regex-based cleanup
    text = re.sub(r'^\(CNN\)\s*(--)?\s*', '', text)  # drop a leading "(CNN) --" prefix
    # NOTE(review): the next two patterns end in `.*` combined with re.S, so they
    # delete everything from the byline / timestamp marker to the END of the
    # article text — confirm this is intended before relying on the cleaned text.
    text = re.sub(r'By .*? for Dailymail\.com.*?Updated:.*', '', text, flags=re.I | re.S)  # drop byline
    text = re.sub(r'PUBLISHED:.*?UPDATED:.*', '', text, flags=re.I | re.S)  # drop published/updated stamps
    text = re.sub(r'Last updated at.*on.*', '', text, flags=re.I)  # drop "last updated" notes
    text = re.sub(r'https?://\S+|www\.\S+', '[URL]', text)  # replace URLs with a placeholder
    text = re.sub(r'<.*?>', '', text)  # strip HTML tags
    # The leading \b is required: without the backslash the pattern only matched
    # addresses that happened to be preceded by a literal letter "b".
    text = re.sub(r'\b[\w.-]+@[\w.-]+\.\w+\b', '[EMAIL]', text)  # replace e-mail addresses
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace

    # Store the cleaned result
    cleaned_articles.append({
        "id": record['id'],
        "cleaned_text": text,
        "summary": record.get('highlights', '')
    })

### Step 1

In [None]:
# Load spaCy's small English model; download it only when it is not installed
# yet, so re-running the cell does not trigger a fresh download every time.
SPACY_MODEL_NAME = "en_core_web_sm"
try:
    nlp = spacy.load(SPACY_MODEL_NAME)
except OSError:
    spacy.cli.download(SPACY_MODEL_NAME)
    nlp = spacy.load(SPACY_MODEL_NAME)

# Counter of entity labels (e.g. PERSON, ORG, DATE) across all cleaned articles
entity_counts = Counter()

# Run spaCy's named entity recognition over every cleaned article
for article in cleaned_articles:
    text = article['cleaned_text']  # cleaned article text
    doc = nlp(text)  # process the text with spaCy

    # Tally the label of every entity found in this article
    for ent in doc.ents:
        entity_counts[ent.label_] += 1

In [None]:
# Split the counter into parallel label / count sequences, most frequent first.
# Iterating the Counter directly would yield only the label strings, so the
# (label, count) pairs have to be requested explicitly.
entity_label_counts = entity_counts.most_common()
if entity_label_counts:
    labels, counts = zip(*entity_label_counts)
else:
    # No entities were counted: draw an empty chart instead of failing to unpack
    labels, counts = (), ()

# Draw the bar chart
plt.figure(figsize=(12, 7))  # figure size
plt.bar(labels, counts, color='skyblue')  # one bar per entity label
plt.title("Top Entity Type Distribution (via spaCy)")  # chart title
plt.ylabel("Frequency")  # Y-axis label
plt.xlabel("Entity Label")  # X-axis label
plt.xticks(rotation=45, ha="right")  # rotate X tick labels for readability
plt.tight_layout()  # adjust the layout so everything fits
plt.show()  # render the chart

### Step 2

In [None]:
# Initialise the OpenAI-compatible client.
# Credentials are read from the environment so that real keys never have to be
# written into the notebook; the original placeholder strings remain as the
# fallback when the variables are not set.
client = OpenAI(
    base_url=os.environ.get("OPENAI_BASE_URL", "YOUR LLM API Provider link"),
    api_key=os.environ.get("OPENAI_API_KEY", "LLM API KEY"),
)

In [None]:
def call_llm(system_prompt, user_prompt, model_name):
    """
    Send one chat-completion request to the LLM and return its text reply.

    Uses the module-level ``client`` object.

    Args:
        system_prompt (str): Instructions / context for the LLM (how it should behave).
        user_prompt (str): The user input, e.g. the text to extract from.
        model_name (str): Identifier of the model to use (e.g. "gpt-4").

    Returns:
        str | None: The content of the first choice with surrounding whitespace
        removed, or None when the response carries no text content.

    Any exception raised by the client call itself is not caught here and
    propagates to the caller.
    """
    # Build and send the chat-completion request
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},  # system-level instructions
            {"role": "user", "content": user_prompt},      # user-supplied input
        ],
    )

    # The content can be None (no text in the reply); calling .strip() on it
    # would raise AttributeError, so return None and let callers skip parsing.
    content = response.choices[0].message.content
    if content is None:
        return None
    return content.strip()


In [None]:
# 按频率获取前N个实体类型  
# Number of most frequent entity types handed to the LLM.
# NOTE(review): this constant was referenced but never defined in the notebook;
# 10 is an assumed default — adjust as needed.
TOP_N_ENTITY_TYPES = 10

# The N most frequent entity labels found by spaCy, as a comma-separated string
relevant_entity_labels_for_llm = [label for label, count in entity_counts.most_common(TOP_N_ENTITY_TYPES)]
entity_types_string_for_prompt = ", ".join(relevant_entity_labels_for_llm)

# System prompt for LLM-based NER.
# It instructs the model to return a JSON object with an "entities" key
# whose value is a list of entity objects.
llm_ner_system_prompt = (
    f"You are an expert Named Entity Recognition system. "
    f"From the provided news article text, identify and extract entities. "
    f"The entity types to focus on are: {entity_types_string_for_prompt}. "
    f"For each identified entity, provide its exact text span from the article and its type (use one of the provided types). "
    f"Output ONLY a valid JSON object with a single key 'entities'. The value of 'entities' MUST be a list of JSON objects, "
    f"where each object has 'text' and 'type' keys. "
    f"Example: {{\"entities\": [{{\"text\": \"United Nations\", \"type\": \"ORG\"}}, {{\"text\": \"Barack Obama\", \"type\": \"PERSON\"}}]}} "
    f"If no entities of the specified types are found, the 'entities' list should be empty: {{\"entities\": []}}."
)

In [None]:
def parse_llm_entity_json_output(llm_output_str):
    """
    Parse the LLM's JSON reply and return the list of entities.

    Expected format: {"entities": [{"text": "...", "type": "..."}]}
    The reply may be wrapped in a markdown code fence (```json ... ``` or
    plain ``` ... ```); the fence is removed before parsing.

    Args:
        llm_output_str (str): JSON string returned by the LLM.

    Returns:
        list: The extracted entities, or an empty list when the input is
        empty, is not valid JSON, is not a JSON object, or has no list under
        the "entities" key.
    """
    if not llm_output_str:
        return []  # nothing to parse

    # Trim first so a trailing newline after the closing fence does not keep
    # the backticks from being removed.
    payload = llm_output_str.strip()
    if payload.startswith("```"):
        payload = payload[3:]
        if payload[:4].lower() == "json":
            payload = payload[4:]  # drop the language tag of the fence
        payload = payload.rstrip("`").strip()

    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        return []  # malformed JSON

    # A valid JSON reply can still be a list/number/string, which has no .get()
    if not isinstance(data, dict):
        return []

    entities = data.get("entities", [])
    return entities if isinstance(entities, list) else []

In [None]:
# Model used for LLM-based entity extraction
TEXT_GEN_MODEL_NAME = "microsoft/phi-4"

# Result list; it must exist before the loop appends to it
articles_with_llm_entities = []

# Run LLM entity extraction over every cleaned article.
# This issues one LLM request per article, so it can take a long time.
for article_data in tqdm(cleaned_articles, desc="LLM entity extraction"):
    article_id = article_data['id']
    article_text = article_data['cleaned_text']

    # Ask the LLM to extract entities from the article text
    llm_response_content = call_llm(
        llm_ner_system_prompt,
        article_text,
        TEXT_GEN_MODEL_NAME
    )

    # Parse the reply into a list of entities (stays empty when there is no reply)
    extracted_llm_entities = []
    if llm_response_content:
        extracted_llm_entities = parse_llm_entity_json_output(llm_response_content)

    # Store the entities alongside the article they came from
    articles_with_llm_entities.append({
        "id": article_id,
        "cleaned_text": article_text,
        "summary": article_data['summary'],
        "llm_extracted_entities": extracted_llm_entities
    })

In [None]:
# Show the entities extracted for one sample article.
# NOTE(review): 4212 is a hard-coded position; this raises IndexError when
# fewer articles than that were processed.
print(articles_with_llm_entities[4212]['llm_extracted_entities'])  



# ### OUTPUT ###  
# Extracted 20 entities for article ID 4cf51ce937a.  
#   Sample entities: [  
#   {  
#     "text": "United Nations",  
#     "type": "ORG"
#   },  
#   {  
#     "text": "Algiers",  
#     "type": "GPE"
#   },  
#   {  
#     "text": "CNN",  
#     "type": "ORG"
#   }  

#    ...

### Step 3

In [None]:
# System prompt for relationship extraction.
# It asks the model for a JSON object with a "relationships" key.
# NOTE(review): the prompt says the predicate must be "one of the types listed
# above", but no list of predicate types appears anywhere in the prompt text —
# confirm whether a relationship-type list was meant to be included.
llm_re_system_prompt = (
    "You are an expert system for extracting relationships between entities from text, "
    "specifically focusing on **technology company acquisitions**. "
    "Given an article text and a list of pre-extracted named entities (each with 'text' and 'type'), "
    "your task is to identify and extract relationships. "
    "The 'subject_text' and 'object_text' in your output MUST be exact text spans of entities found in the provided 'Extracted Entities' list. "
    "The 'subject_type' and 'object_type' MUST correspond to the types of those entities from the provided list. "
    "Output ONLY a valid JSON object with a single key 'relationships'. The value of 'relationships' MUST be a list of JSON objects. "
    "Each relationship object must have these keys: 'subject_text', 'subject_type', 'predicate' (one of the types listed above), 'object_text', 'object_type'. "
    "Example: {\"relationships\": [{\"subject_text\": \"Innovatech Ltd.\", \"subject_type\": \"ORG\", \"predicate\": \"ACQUIRED\", \"object_text\": \"Global Solutions Inc.\", \"object_type\": \"ORG\"}]} "
    "If no relevant relationships of the specified types are found between the provided entities, the 'relationships' list should be empty: {\"relationships\": []}."
)

In [None]:

# {
#   "relationships": [
#     {
#       "subject_text": "Innovatech Ltd.",
#       "subject_type": "ORG",
#       "predicate": "ACQUIRED",
#       "object_text": "Global Solutions Inc.",
#       "object_type": "ORG"
#     }
#   ]
# }

In [None]:
def parse_llm_relationship_json_output(llm_output_str_rels):
    """
    Parse the LLM's JSON reply and return the list of relationships.

    Expected format:
    {"relationships": [{"subject_text": ..., "predicate": ..., "object_text": ...}]}
    The reply may be wrapped in a markdown code fence (```json ... ``` or
    plain ``` ... ```); the fence is removed before parsing.

    Args:
        llm_output_str_rels (str): JSON string returned by the LLM.

    Returns:
        list: The extracted relationships, or an empty list when the input is
        empty, is not valid JSON, is not a JSON object, or has no list under
        the "relationships" key.
    """
    if not llm_output_str_rels:
        return []  # nothing to parse

    # Trim first so a trailing newline after the closing fence does not keep
    # the backticks from being removed.
    payload = llm_output_str_rels.strip()
    if payload.startswith("```"):
        payload = payload[3:]
        if payload[:4].lower() == "json":
            payload = payload[4:]  # drop the language tag of the fence
        payload = payload.rstrip("`").strip()

    try:
        data = json.loads(payload)
    except json.JSONDecodeError:
        return []  # malformed JSON

    # A valid JSON reply can still be a list/number/string, which has no .get()
    if not isinstance(data, dict):
        return []

    relationships = data.get("relationships", [])
    return relationships if isinstance(relationships, list) else []

In [None]:
# Result list; it must exist before the loop appends to it
articles_with_llm_relations = []

# Run relationship extraction for every article that went through entity extraction
for article_entity_data in articles_with_llm_entities:
    # Pull the cleaned text and the previously extracted entities
    article_text_rels = article_entity_data['cleaned_text']
    current_entities = article_entity_data['llm_extracted_entities']

    # Serialise the entity list to JSON so it can be embedded in the prompt
    entities_json_for_prompt = json.dumps(current_entities)

    # Build the user prompt: article text plus the entities the model must use
    user_prompt_for_re = (
        f"Article Text:\n```\n{article_text_rels}\n```\n\n"
        f"Extracted Entities (use these exact texts for subjects/objects of relationships):\n```json\n{entities_json_for_prompt}\n```\n\n"
        "Identify and extract relationships between these entities based on the system instructions."
    )

    # Ask the LLM for relationships, using the same call_llm helper as entity extraction
    llm_response_rels_content = call_llm(llm_re_system_prompt, user_prompt_for_re, TEXT_GEN_MODEL_NAME)

    # Parse the reply into a list of relationships (stays empty when there is no reply)
    extracted_llm_rels = []
    if llm_response_rels_content:
        extracted_llm_rels = parse_llm_relationship_json_output(llm_response_rels_content)

    # Keep the original article data and attach the extracted relationships
    articles_with_llm_relations.append({
        **article_entity_data,  # id, cleaned_text, summary, llm_extracted_entities
        "llm_extracted_relationships": extracted_llm_rels
    })
处理完成后，我们可以查看一篇文章中提取的关系样本：

# Print the relationships extracted for one sample article.
# The relationships live in articles_with_llm_relations; the entries of
# articles_with_llm_entities have no 'llm_extracted_relationships' key.
print(articles_with_llm_relations[1234]['llm_extracted_relationships'])

# ### OUTPUT ###
# Extracted 3 relationships using LLM.
#   Sample LLM relationships: [
#   {
#     "subject_text": "Microsoft Corp.",
#     "subject_type": "ORG",
#     "predicate": "ACQUIRED",
#     "object_text": "Nuance Communications Inc.",
#     "object_type": "ORG"
#   },
#   {
#     "subject_text": "Nuance Communications Inc.",
#     "subject_type": "ORG",
#     "predicate": "HAS_PRICE",
#     "object_text": "$19.7 billion",
#     "object_type": "MONEY"
#   }
# ]
# ... (similar output for other articles) ...
# 至此，我们已成功从文章数据集中提取了实体（节点）和关系（边），完成了构建知识图谱所需的基本元素。


In [None]:
#!/usr/bin/env python3
"""
Bulk-import keywords into RediSearch autocomplete, then query them.
"""

from pathlib import Path
from redis import Redis
from redis.commands.search.suggestion import Suggestion

##############################################################################
# 1 ▸ connection
##############################################################################

# Client for a local Redis server; decode_responses=True is passed so replies come back as str
r = Redis(host="localhost", port=6379, decode_responses=True)
# Search helper obtained from the client; suggest() below calls sugget() on it
ac  = r.ft()                     # we only need the helper object, no schema

##############################################################################
# 2 ▸ importer
##############################################################################

def add_keywords_from_file(
    filepath: str | Path,
    key: str,
    batch: int = 10_000,
) -> None:
    """
    Read KEYWORD <TAB|SPACE|COMMA> COUNT from *filepath* and insert into
    the RediSearch autocomplete dictionary stored under *key*.
    """
    filepath = Path(filepath)

    with filepath.open(encoding="utf-8") as fh:
        pipe   = r.pipeline(transaction=False)
        total  = 0

        for i, line in enumerate(fh, 1):
            # -------------------- parse "keyword  count"
            parts = [p.strip() for p in line.strip().replace(",", " ").split()]
            if len(parts) < 2 or not parts[-1].isdigit():
                continue                          # skip malformed lines
            *token_parts, count_str = parts
            token  = " ".join(token_parts)
            score  = float(count_str)             # RediSearch wants a float

            # -------------------- stage FT.SUGADD
            pipe.ft().sugadd(
                key,
                Suggestion(token, score=score),
                # INCR=True makes repeated terms accumulate counts
                increment=True,
            )
            total += 1

            if i % batch == 0:
                pipe.execute()
                pipe = r.pipeline(transaction=False)

        pipe.execute()

    print(f"Inserted/updated {total:,} keywords into '{key}'")


##############################################################################
# 3 ▸ query helper
##############################################################################

def suggest(
    key: str,
    prefix: str,
    *,
    max_results: int = 10,
    fuzzy: bool = True,
):
    """
    Return up to *max_results* suggestions for *prefix*.

    Queries the suggestion dictionary *key* through the module-level search
    helper ``ac``.

    Args:
        key: Name of the suggestion dictionary to query.
        prefix: Text to complete.
        max_results: Upper bound on the number of suggestions requested.
        fuzzy: Passed through as the ``fuzzy`` option of the lookup.

    Returns:
        list[tuple]: One ``(string, score)`` pair per returned suggestion.
    """
    results = ac.sugget(
        key,
        prefix,
        fuzzy=fuzzy,
        # redis-py names the result limit `num`; passing `max=` raises TypeError
        num=max_results,
        with_scores=True,
        with_payloads=False,
    )
    # Each result exposes the suggested text as .string and its score as .score
    return [(s.string, s.score) for s in results]


##############################################################################
# 4 ▸ demo
##############################################################################

if __name__ == "__main__":
    DICT_KEY = "keywords_ac"

    # Fill the suggestion dictionary from the keyword/count file first
    add_keywords_from_file("keywords_counts.txt", DICT_KEY)

    # Then look up a few sample prefixes and print what comes back
    for p in ("stra", "strawb", "jam"):
        print(f"\nSuggestions for '{p}':")
        matches = suggest(DICT_KEY, p)
        for term, score in matches:
            print(f"  {term:<30s} {score:,.0f}")



import nltk
from nltk.data import find

# List all corpora you may need across your projects
# Resource paths in "<category>/<name>" form. ensure_nltk_resources() looks up
# the full path and, when it is missing, downloads the part after the first "/".
NLTK_RESOURCES = [
    "corpora/stopwords",
    "corpora/wordnet",
    "tokenizers/punkt",
    "taggers/averaged_perceptron_tagger",
    # Add more if needed
]

def ensure_nltk_resources():
    """Make sure every entry of NLTK_RESOURCES is available, downloading the missing ones.

    Each path is looked up with ``find``; when that raises LookupError, the
    part after the first "/" is passed to ``nltk.download`` in quiet mode.
    """
    for resource_path in NLTK_RESOURCES:
        try:
            find(resource_path)
            continue  # already available, nothing to download
        except LookupError:
            pass

        # "corpora/stopwords" -> "stopwords": the download id is what follows the category
        path_pieces = resource_path.split("/", 1)
        nltk.download(path_pieces[1], quiet=True)

# Call once during module load or app startup
ensure_nltk_resources()