In [140]:
from elasticsearch import Elasticsearch
import streamlit as st
import tqdm
from elasticsearch.helpers import scan

In [143]:
es = Elasticsearch(st.secrets["elastic_uri"])

ret_doc = []
# 指定索引名称
index_name = "wikipedia_title_20240201"

results = scan(
    client=es,
    query={"query": {"match_all": {}}},
    index=index_name,
    size=5_000,
    scroll='5m'
)

# 遍历结果
for doc in tqdm.tqdm(results):
    # _source 包含了文档的原始数据
    

    my_doc = {
        '_id': doc['_id'],
        'title': [doc['_source']['title']]
    }
    if doc['_source']['redirect']:
        my_doc['title'].extend(doc['_source']['redirect'])
    ret_doc.append(my_doc)

len(ret_doc)

5616951it [00:54, 103269.30it/s]


5616951

In [150]:
# # 更新 en_page 表的 title 
en_page_title = {}
en_page_redirect = {}
index_name = "en_page"

results = scan(
    client=es,
    query={"_source": ["title", "redirect", "id"],"query": {"match_all": {}}},
    index=index_name,
    # size=10,
    size=50_00,
    scroll='5m'
    # scroll='1s'
)

for doc in tqdm.tqdm(results):
    # _source 包含了文档的原始数据
    doc = doc['_source']
    if 'redirect' in doc:
        en_page_redirect[doc['title'].lower()] = doc['redirect'].lower()
    else:
        en_page_title[doc['title'].lower()] = doc['id']

print(len(en_page_redirect))
print(len(en_page_title))

17836728it [06:56, 42867.66it/s]

10533310
6826307





In [7]:
# 测试 wikititle 的 char 类型
c_set  = set()
for doc in ret_doc:
    for title in doc['title'][:1]:
        for c in title.lower():
            c_set.add(c)
len(c_set)

589

In [8]:
print(c_set)

{'ƌ', '利', 'ử', 'ẫ', 'ǵ', 'ḑ', '̄', '英', '溫', '̣', 'ա', 'ộ', 'k', '淳', 'ᵵ', 'ơ', 'σ', 'η', '⋒', 'ʼ', '黎', 'ǝ', 'û', 'ĸ', ',', 'ĺ', '唯', 'æ', 'ȳ', '%', 'ằ', '⅓', 'ṯ', 'ú', '許', 'e', '†', '逯', '‒', '姞', 'u', 'ǽ', '松', 'ġ', '理', '栗', '瑞', '∴', 'ǹ', 'ʈ', 'ḱ', 'ƴ', 'о', 'θ', 'и', '̨', 'ȼ', '⁄', 'ǥ', 'с', '⋶', 'ặ', '͟', '吉', 'ƙ', '¥', '̩', '®', '严', 'b', 'y', '⋐', 'ũ', 'ỽ', 'ẋ', 'ợ', '우', '÷', '豫', 'ǔ', 'o', '@', 'φ', '⊷', '–', 'ớ', '̌', 'ɠ', 'ọ', 'ŝ', 'ʰ', '麟', '그', 'ḿ', '斯', 'ở', '阝', 'ȟ', '﹟', 'ݨ', '余', 'ḹ', 'ɗ', 'ấ', '嵇', '£', '^', 'ǃ', 'ș', 'я', 'ē', 'ṕ', 'ģ', 'g', '符', 'ĵ', 'ω', 'ṟ', 'ª', 'к', 'ḵ', '武', '諴', 'ƃ', 'ɪ', 'º', 'h', 'ç', '路', 'ų', 'ế', 'ẃ', 'ǧ', '•', '̃', '«', '¾', 'ề', '♭', '兰', 'ă', 'ƹ', '이', '9', '儀', 'c', '曲', '3', 'ǣ', 'ĉ', 'ↀ', '́', '李', 'ḩ', 'ō', '려', 'ǎ', '粟', '͎', '̂', '⟡', '计', '己', 'ᴋ', 'ố', 'ɛ', '·', 'a', 'ů', 'ğ', 'ñ', ':', 'ä', 'ņ', 'ь', '→', 'ǩ', 'ȥ', 'ݭ', '⊐', '∂', '낙', 'ứ', '禮', 'ṃ', '₂', 'ễ', '紅', 'ħ', 'ą', '上', 'ô', '帝', 'ŋ', '\\', 'ʻ', 'ꞥ', 'î', '盧', '⋓'

# 使用 字典树的方案失败了,太占内存

In [2]:
class TrieNode:
    __slots__ = ['children', 'ID']
    def __init__(self):
        self.children = {}  # 子节点，使用字典存储
        self.ID = 0  # 标记是否为单词的结尾

class WikipediaTokenizer:
    def __init__(self):
        self.root = TrieNode()
        
    def insert(self, token_list, ID):
        node = self.root
        for token in token_list:
            # 如果字符不在当前节点的子节点中，添加一个新节点
            if token not in node.children:
                node.children[token] = TrieNode()
            # 移动到子节点
            node = node.children[token]
        # 设置当前节点为单词的结尾
        node.ID = ID

    def search(self, token_list):
        """
        -1 表示不存在该单词
        0 表示存在该单词，但是不是单词的结尾
        """
        node = self.root
        for token in token_list:
            # 如果字符不在当前节点的子节点中，表示单词不存在
            if token not in node.children:
                return -1
            node = node.children[token]
        # 检查当前节点是否为单词的结尾
        return node.ID


In [None]:


# WT = WikipediaTokenizer()

# for doc in tqdm.tqdm(ret_doc):
#     ID = doc['_id']
#     for title in doc['title'][:1]:
#         WT.insert(title.lower(), ID)

In [5]:
# import pickle

# with open('../model/WT.pkl', 'wb') as f:
#     train_data = pickle.dump(WT,f)

In [6]:
# import pickle

# with open('../model/WT.pkl', 'rb') as f:
#     WT = pickle.load(f)

# 使用 dict 方案

In [171]:

WTD = {}
for doc in tqdm.tqdm(ret_doc):
    ID = int(doc['_id'])
    for title in doc['title'][:]:
        WTD[title.lower()] = ID
        

# print(len(en_page_title))
print(len(WTD))

for title,ID in tqdm.tqdm(en_page_title.items()):
    WTD[title] = int(ID)
    
print(len(WTD))
for title,true_title in tqdm.tqdm(en_page_redirect.items()):
    if true_title in WTD:
         WTD[title] = WTD[true_title]
         

print(len(WTD))

100%|██████████| 5616951/5616951 [00:09<00:00, 570984.37it/s]


14621981


100%|██████████| 6826307/6826307 [00:04<00:00, 1619875.38it/s]


15848253


100%|██████████| 10533310/10533310 [00:10<00:00, 1051817.91it/s]

16878172





In [154]:

text = """
The Facebookmoord ("Facebook murder") is a term Harvard University coined by Dutch media for the 2012 murder of Joyce (Winsie) Hau, by the then 14–year-old Jinhua K. from Capelle aan den IJssel, in the Netherlands. Hau's father was also injured during the attack. The case was given its name because the motive lay in a disagreement on Facebook.
"""

In [172]:
import pickle, gzip

with gzip.open('../model/WTD.pkl.gz', 'wb') as f:
    train_data = pickle.dump(WTD,f)


In [41]:
import pickle, gzip

with gzip.open('../model/WTD.pkl.gz', 'rb') as f:
    WTD = pickle.load(f)


In [155]:
def format_text(text):
    return text.lower().split(" ")


def get_token(text):
    token_set = set()
    annotated_text = []
    
    token_list = format_text(text)
    token_range = 5
    start_token_index = 0
    end_token_index = token_range
    last_token_index = 0
    
    while start_token_index < len(token_list):
        
        sub_token_list = token_list[start_token_index:end_token_index]
        sub_text = " ".join(sub_token_list).strip()
        ID = WTD.get(sub_text)
        
        if ID:
            # 前置空白区
            last_string = " ".join(token_list[last_token_index:start_token_index])
            if last_string:
                annotated_text.append(last_string)
                
            # 当前 token 区
            annotated_text.append((sub_text, ID))
            last_token_index = end_token_index
            start_token_index = end_token_index
            end_token_index = start_token_index + token_range
            
            token_set.add((ID, sub_text))
        else:
            if end_token_index - start_token_index > 1: 
                end_token_index -= 1
            else:
                start_token_index += 1
                end_token_index = start_token_index + token_range
            
        
    return token_set, annotated_text


get_token(text)

({(290, 'a'),
  (20171, 'murder'),
  (21673, 'name'),
  (47374, '2012'),
  (83427, 'father'),
  (180184, 'term'),
  (216834, 'disagreement'),
  (224192, 'in'),
  (294441, 'is a'),
  (299083, 'the motive'),
  (310121, 'its'),
  (360668, 'by'),
  (436179, 'of'),
  (788093, 'injured'),
  (904370, 'because'),
  (920255, 'aan'),
  (1715111, 'was'),
  (2017600, 'den'),
  (2102103, 'on'),
  (2558477, 'given'),
  (2853114, 'jinhua'),
  (3475505, 'for'),
  (3835872, 'coined'),
  (4140811, 'lay'),
  (6133489, 'k.'),
  (7529378, 'facebook.'),
  (8282451, 'dutch media'),
  (9716090, 'the'),
  (10144925, 'capelle'),
  (16633798, 'the case'),
  (18426501, 'harvard university'),
  (18534749, 'also'),
  (20965144, 'joyce'),
  (23139855, 'from'),
  (27399125, 'the then'),
  (76215005, 'during')},
 [('the', 9716090),
  'facebookmoord ("facebook murder")',
  ('is a', 294441),
  ('term', 180184),
  ('harvard university', 18426501),
  ('coined', 3835872),
  ('by', 360668),
  ('dutch media', 8282451),
  ('f

# model

In [16]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.3.1%2Bcpu-cp312-cp312-linux_x86_64.whl (190.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.4/190.4 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting torchvision
  Downloading https://download.pytorch.org/whl/cpu/torchvision-0.18.1%2Bcpu-cp312-cp312-linux_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting torchaudio
  Downloading https://download.pytorch.org/whl/cpu/torchaudio-2.3.1%2Bcpu-cp312-cp312-linux_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
Installing collected packages: torch, torchvision, torchaudio
Successfully installed torch-2.3.1+cpu torchaudio-2.3.1+cpu

In [8]:
!pip install modelscope addict simplejson sortedcontainers transformers sentence_transformers



In [6]:
# !pip install flash-attn --no-build-isolation

In [1]:
import modelscope
import os
# os.environ['http_proxy'] = 'http://192.168.1.227:23333'
# os.environ['https_proxy'] = 'http://192.168.1.227:23333'
os.environ['http_proxy'] = ''
os.environ['https_proxy'] = ''

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
# from modelscope.models import Model
from transformers import AutoModel
from numpy.linalg import norm

cache_dir="/home/ider/.cache/modelscope/hub/jinaai/jina-embeddings-v2-small-en"
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
# model = Model.from_pretrained('jinaai/jina-embeddings-v2-small-en', trust_remote_code=True) # trust_remote_code is needed to use the encode method
model = AutoModel.from_pretrained(cache_dir, trust_remote_code=True)
embeddings = model.encode(['How is the weather today?', 'What is the current weather like today?'])
print(cos_sim(embeddings[0], embeddings[1]))

embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
print(cos_sim(embeddings[0], embeddings[1]))


0.9399813
0.7471952


In [39]:
from modelscope import AutoModel
from numpy.linalg import norm

cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-zh', trust_remote_code=True) # trust_remote_code is needed to use the encode method
embeddings = model.encode(['How is the weather today?', '今天天气怎么样?'])
print(cos_sim(embeddings[0], embeddings[1]))


embeddings = model.encode(['How is the weather today?', 'What is the current weather like today?'])
print(cos_sim(embeddings[0], embeddings[1]))

0.7860607
0.8722507


In [48]:
embeddings = model.encode(['How is the weather today?', '今天出发?'])
print(cos_sim(embeddings[0], embeddings[1]))

embeddings = model.encode(['How is the weather today?', 'nice! fake news lets go to school'])
print(cos_sim(embeddings[0], embeddings[1]))


embeddings = model.encode(['How is the weather today?', 'weather is nice '])
print(cos_sim(embeddings[0], embeddings[1]))

0.39379686
0.125564
0.42542523


# 找 1000 个例子测试
我们就用3级学科进行测试

In [83]:
category_list = ['Computer engineering', 'Computer science', 'Psychology', 'Mathematics', 'Environmental engineering', 'Chemical engineering', 'Theoretical computer science', 'Sociology', 'Economics', 'Biological engineering', 'Deep learning', 'Anthropology', 'Medicine', 'Philosophy', 'Blockchains', 'Electrical engineering', 'Engineering disciplines', 'Genetic engineering', 'Materials science', 'Physics', 'Logic', 'Cognitive science', 'Mechanical engineering', 'Geography', 'Chemistry', 'Industrial engineering', 'Theoretical physics', 'Linguistics', 'Machine learning', 'Biology', 'Environmental science', 'Civil engineering', 'Artificial intelligence', 'Genome editing', 'Political science', 'Quantum computing', 'Neuroscience', 'Geology', 'Literature', 'History']
len(set(category_list))

40

In [2]:

import neo4j

def query_title(title):
    driver = neo4j.GraphDatabase.driver(
            "bolt://192.168.1.227:17688",
            auth=("neo4j", "neo4j-test"),
        )
    record_list = []
    with driver.session(database="enwiki") as session:
        result = session.run(
            "MATCH (start:page {f_title: $source})-[r:page]->(end:page) "
            "RETURN start.pageId as SID, end.f_title as title, end.pageId as EID "
            ,
            source=title,
        )
        for record in result:
            record_list.append(record)

    return record_list

title_set = set()
for title in category_list:
    for record in query_title(title):
        title_set.add((title, int(record["SID"])))
        title_set.add((record["title"], int(record["EID"])))
len(title_set)

3968

In [3]:
import sys
import tqdm

# 使用相对路径添加父目录到搜索路径
sys.path.append("../gpc_demo") 
sys.path.append("..") 

from utils import calculate_cartesian_product_distances, get_token, search_wikipedia, get_plaintext, calculate_similarity,query_partner_distancles


  from .autonotebook import tqdm as notebook_tqdm
2024-07-09 20:02:52.525 
  command:

    streamlit run /home/ider/miniconda3/envs/py312/lib/python3.12/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-07-09 20:03:14.578 No runtime found, using MemoryCacheStorageManager
2024-07-09 20:03:21.165 No runtime found, using MemoryCacheStorageManager
2024-07-09 20:03:21.166 No runtime found, using MemoryCacheStorageManager
2024-07-09 20:03:21.167 No runtime found, using MemoryCacheStorageManager
2024-07-09 20:03:21.168 No runtime found, using MemoryCacheStorageManager


In [14]:
import random
result_list = []

title_data_list = list(title_set)

tmp_list = []
for title_b, ID_b in title_data_list:
    if title_b in category_list:
        tmp_list.append((title_b, ID_b))
tmp_list.extend(title_data_list[:100])
random.shuffle(tmp_list)
print(len(tmp_list))

for title_a, ID_a in tqdm.tqdm(tmp_list):
    for title_b, ID_b in tmp_list:
        if title_a == title_b:
            continue
        
        plaintext_a = get_plaintext(ID_a)
        plaintext_b = get_plaintext(ID_b)
        
        if not plaintext_a or not plaintext_b:
            continue
        
        token_set_1, _ = get_token(plaintext_a)
        token_set_2, _ = get_token(plaintext_b)
        
        df, _ = calculate_cartesian_product_distances(
                    tuple(token_set_1 - token_set_2), 
                    tuple(token_set_2 - token_set_1),
                    tuple(token_set_2 & token_set_1),
                )
        if len(df):
            result_list.append({
                "source": title_a,
                "dest": title_b,
                "title distance": query_partner_distancles(ID_a, ID_b),
                "tokens distance": df['distance'].mean().round(4),
                "STS": calculate_similarity(plaintext_a, plaintext_b)
            })
                        

138


100%|██████████| 138/138 [5:17:04<00:00, 137.86s/it]  


In [16]:
import pandas as pd
df_ok = pd.DataFrame(result_list)
df_ok.to_csv("./tmp.csv")

In [32]:
df_ok[df_ok['STS'] > 0.5]

Unnamed: 0,source,dest,title distance,tokens distance,STS
0,Crosslinguistic influence,Thin film,,0.5112,-0.018466
1,Crosslinguistic influence,Neoevolutionism,,0.4714,0.109186
2,Crosslinguistic influence,Biological motion,,0.4410,0.082935
3,Crosslinguistic influence,Field (mineral deposit),,0.6099,-0.017856
4,Crosslinguistic influence,Stereochemistry,,0.4835,0.112445
...,...,...,...,...,...
18727,River linking,Dry well,,0.5708,0.180327
18728,River linking,Chemical stability,,0.5863,0.086699
18729,River linking,Biology,,0.6240,0.036248
18730,River linking,Digital empathy,,0.6314,0.009882


In [20]:
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import (
    paired_cosine_distances,
    paired_euclidean_distances,
    paired_manhattan_distances,
)

In [97]:
except_row = ['Field (mineral deposit)',
 'Electromaterials',
 'Inverter-based resource',
 'Transparent heating film',
 'Epistemic cognition',
 'Bullwheel',
 'Policy monitoring',
 'Transparent wood composite',
 'Ferroelasticity',
 'Direct laser interference patterning',
 'COMOS',
 'Photomechanical effect',
 'Cerebral rubicon',
 'Testing of advanced thermoplastic composite welds',
 'Ore dock',
 'Bioresilience',
 'The Fable of Oscar',
 'Survey camp',
 'Reverse roll coating',
 'Mental timeline',
 'River linking']
df_any = df_ok.fillna(1, inplace=False)
df_any = df_any.rename(columns={'title distance': 'google similarity', 
                                'tokens distance': 'token similarity',
                                'STS': 'STS similarity',
                                })
# df_any = df_any[~df_any['source'].isin(except_row)]
df_any['google similarity'] = 1 - df_any['google similarity']
df_any['token similarity'] = 1 - df_any['token similarity']
df_any.to_csv("../model/score_subject.csv", index=False)
df_any

Unnamed: 0,source,dest,google similarity,token similarity,STS similarity
0,Crosslinguistic influence,Thin film,0.0,0.4888,-0.018466
1,Crosslinguistic influence,Neoevolutionism,0.0,0.5286,0.109186
2,Crosslinguistic influence,Biological motion,0.0,0.5590,0.082935
3,Crosslinguistic influence,Field (mineral deposit),0.0,0.3901,-0.017856
4,Crosslinguistic influence,Stereochemistry,0.0,0.5165,0.112445
...,...,...,...,...,...
18727,River linking,Dry well,0.0,0.4292,0.180327
18728,River linking,Chemical stability,0.0,0.4137,0.086699
18729,River linking,Biology,0.0,0.3760,0.036248
18730,River linking,Digital empathy,0.0,0.3686,0.009882


In [80]:
# a = [1,2,3]
# b = [4,-1,8]
# print(pearsonr(a,b))
# print(spearmanr(a,b))
len(df_any)

15873

In [66]:

# 		STS
# df_ok[df_ok['STS'] > 0.5]
a = df_any['STS']
b = df_any['tokens distance']
# print(pearsonr(a,b))
print(spearmanr(a,b))

a = df_any['title distance']
b = df_any['tokens distance']
# print(pearsonr(a,b))
print(spearmanr(a,b))


a = df_any['STS']
b = df_any['title distance']
# print(pearsonr(a,b))
print(spearmanr(a,b))

SignificanceResult(statistic=0.37640778945453646, pvalue=0.0)
SignificanceResult(statistic=0.3153197108388149, pvalue=0.0)
SignificanceResult(statistic=0.462048788927142, pvalue=0.0)


In [67]:
# df_any[df_any['source'] == "Ferroelasticity"]

In [69]:
import numpy as np
except_title = []
for key in df_any['source'].drop_duplicates():
    # print(key)
    df_tmp = df_any[df_any['source'] == key]
    a = df_tmp['STS']
    b = df_tmp['tokens distance']
    # print(pearsonr(a,b))
    v,_ = spearmanr(a,b) 
    print(v)

    a = df_tmp['title distance']
    b = df_tmp['tokens distance']
    # print(pearsonr(a,b))
    v,_ = spearmanr(a,b) 
    print(v)

    a = df_tmp['STS']
    b = df_tmp['title distance']
    # print(pearsonr(a,b))
    v,_ = spearmanr(a,b) 
    # if np.isnan(v):
    #     except_title.append(key)
        # print(key)
    print(v)
    # break
# except_title

0.25608762919702605
0.271908759676496
0.21887605805293883
0.18574890528335267
0.23369066817574438
0.25623069393453884
0.43343848957457887
0.3090280699762393
0.23636565663660966
0.48319851915246287
0.2772394168617563
0.20556582063818993
0.34491288502009
0.2121037478804271
0.48080836659021353
0.5145502043685085
0.45563694662478144
0.5573218755711219
0.4509163800354335
0.4632086806306368
0.7090190844030354
0.40989279730876194
0.20317020185395815
0.18468094142212796
0.6292255709175253
0.620223432976125
0.7047140594641933
0.4049339366786404
0.14783457167642364
-0.021725520967776347
0.13334637353717774
-0.09755739784181279
0.049326362366699
0.4783948960874043
0.38348024517355994
0.3084022308382367
0.5228213831123807
0.19692480223823558
0.20462944841432473
0.3913308792345814
0.3273263442070751
0.40960022793393236
0.40274035671325864
0.32998473735951617
0.7034808764702121
0.5429667856413554
0.47186457231304996
0.7284986097305473
0.316456375596099
0.013009675442184346
-0.056374865588637135
0.21

In [78]:
import requests
def get_llm_article(title):

    url = "https://api.siliconflow.cn/v1/chat/completions"

    payload = {
        "model": "Qwen/Qwen2-7B-Instruct",
        "messages": [
            {
                "role": "user",
                "content":  "# You are an encyclopedia writer. \n"
                            f"Please write an article for *{title}* that is easy to understand and should be around 150 words long in English"
            }
        ]
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": "Bearer sk-hxndleevlrdiygjlrfsnfgltpwwopommwquqnzdixrwrifxa"
    }
    try:
        response = requests.post(url, json=payload, headers=headers)
        if response.status_code == 200:
            print(response.json())
            return response.json()['choices'][0]['message']['content']
        else:
            print(response.text)
    except Exception as e:
        print(e)
        
    return "sorry! something went wrong"

    
    
get_llm_article("earth")

{'id': '01909ad2321613e2c455f004929a57c8', 'object': 'chat.completion', 'created': 1720584385, 'model': 'Qwen/Qwen2-7B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'Earth, officially known as Terra, is an incredible planet and the third planet in our solar system, residing in a distinct orbit around the sun. Uniquely, Earth is among the few celestial bodies in our solar system believed to harbor life, specifically because of the perfect climate conditions that support it. Earth is a few light-minutes away from the heart of the solar system, our star, making its temperature just right not to be scorched nor frozen, which is an indispensable ingredient for supporting life.\n\nThe flagship of Earth is its large, liquid water ocean, which envelops 71 percent of the planet\'s surface. This water has had a deterring effect on solar heating and has served as a powerful promoter of Earth’s climate and ecological balance. The remaining part of our planet is ei

'Earth, officially known as Terra, is an incredible planet and the third planet in our solar system, residing in a distinct orbit around the sun. Uniquely, Earth is among the few celestial bodies in our solar system believed to harbor life, specifically because of the perfect climate conditions that support it. Earth is a few light-minutes away from the heart of the solar system, our star, making its temperature just right not to be scorched nor frozen, which is an indispensable ingredient for supporting life.\n\nThe flagship of Earth is its large, liquid water ocean, which envelops 71 percent of the planet\'s surface. This water has had a deterring effect on solar heating and has served as a powerful promoter of Earth’s climate and ecological balance. The remaining part of our planet is either land, accommodating a wide diversity of terrestrial ecosystems, or undersea plains. The land is divided into continents and islands, supporting a plethora of life forms, and entirely covered or 

# 再找一组例子测试
Presidents_of_the_United_States

In [92]:
category_list = set(["John Adams","John Quincy Adams","Chester A. Arthur","Joe Biden","James Buchanan","William Henry Harrison", "George H. W. Bush","George W. Bush","Jimmy Carter","Grover Cleveland","Bill Clinton","Calvin Coolidge","Dwight D. Eisenhower","Millard Fillmore","Gerald Ford","James A. Garfield","Ulysses S. Grant","Warren G. Harding","Benjamin Harrison","Rutherford B. Hayes","Herbert Hoover","Andrew Jackson","Thomas Jefferson","Andrew Johnson","Lyndon B. Johnson","John F. Kennedy","Abraham Lincoln","James Madison","William McKinley","James Monroe","Richard Nixon","Barack Obama","Franklin Pierce","James K. Polk","Ronald Reagan","Franklin D. Roosevelt","Theodore Roosevelt","William Howard Taft","Zachary Taylor","Harry S. Truman","Donald Trump","John Tyler","Martin Van Buren","George Washington","Woodrow Wilson",])
len(category_list)

45

In [90]:
def query_title(title):
    driver = neo4j.GraphDatabase.driver(
            "bolt://192.168.1.227:17688",
            auth=("neo4j", "neo4j-test"),
        )
    record_list = []
    with driver.session(database="enwiki") as session:
        result = session.run(
            "MATCH (start:page {f_title: $source})-[r:page]->(end:page) "
            "RETURN start.pageId as SID, end.f_title as title, end.pageId as EID "
            ,
            source=title,
        )
        for record in result:
            record_list.append(record)

    return record_list

title_set = set()
for title in category_list:
    for record in query_title(title):
        title_set.add((title, int(record["SID"])))
        # title_set.add((record["title"], int(record["EID"])))
len(title_set)

45

In [94]:
import random
result_list_p2 = []

title_data_list = list(title_set)


tmp_list = title_set
print(len(tmp_list))

for title_a, ID_a in tqdm.tqdm(tmp_list):
    for title_b, ID_b in tmp_list:
        if title_a == title_b:
            continue
        
        plaintext_a = get_plaintext(ID_a)
        plaintext_b = get_plaintext(ID_b)
        
        if not plaintext_a or not plaintext_b:
            continue
        
        token_set_1, _ = get_token(plaintext_a)
        token_set_2, _ = get_token(plaintext_b)
        
        df, _ = calculate_cartesian_product_distances(
                    tuple(token_set_1 - token_set_2), 
                    tuple(token_set_2 - token_set_1),
                    tuple(token_set_2 & token_set_1),
                )
        if len(df):
            result_list_p2.append({
                "source": title_a,
                "dest": title_b,
                "title distance": query_partner_distancles(ID_a, ID_b),
                "tokens distance": df['distance'].mean().round(4),
                "STS": calculate_similarity(plaintext_a, plaintext_b)
            })
                        

45


100%|██████████| 45/45 [1:59:12<00:00, 158.94s/it]  


In [100]:

df_ok_p2 = pd.DataFrame(result_list_p2)
# df_ok_p2.to_csv("./tmp.csv")

df_any = df_ok_p2.fillna(1, inplace=False)
df_any = df_any.rename(columns={'title distance': 'google similarity', 
                                'tokens distance': 'token similarity',
                                'STS': 'STS similarity',
                                })

# df_any = df_any[~df_any['source'].isin(except_row)]
df_any['google similarity'] = 1 - df_any['google similarity']
df_any['token similarity'] = 1 - df_any['token similarity']
df_any.to_csv("../model/score_president.csv", index=False)
df_any[df_any['google similarity'] > 0.5]

Unnamed: 0,source,dest,google similarity,token similarity,STS similarity
0,James A. Garfield,Theodore Roosevelt,0.636485,0.5889,0.347201
1,James A. Garfield,John Tyler,0.737854,0.5961,0.437931
3,James A. Garfield,James Monroe,0.725755,0.5866,0.409478
4,James A. Garfield,Richard Nixon,0.575057,0.5362,0.339013
5,James A. Garfield,Warren G. Harding,0.762793,0.6210,0.568400
...,...,...,...,...,...
1975,George Washington,John Quincy Adams,0.629134,0.5750,0.447842
1976,George Washington,Franklin D. Roosevelt,0.573534,0.5478,0.350501
1977,George Washington,William McKinley,0.566138,0.5137,0.420271
1978,George Washington,Zachary Taylor,0.600656,0.5596,0.392882


In [102]:

from modelscope import AutoModel
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True) # trust_remote_code is needed to use the encode method
    

In [125]:
from scipy.spatial.distance import (
    cosine,
    euclidean,
    minkowski,
    cityblock,
    chebyshev,
)

from numpy.linalg import norm
cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))

a = [1,2,3,4,5,6]
b = [11,22,33,44,55,66]
a,b = model.encode(["今天天气不错", "今天天气绝对不是非常不错,"])
# a,b = calculate_similarity(plaintext_a, plaintext_b)
for func in [cosine, euclidean, minkowski,cityblock,chebyshev]:
    
    print(func.__name__, func(a, b))
# cos_sim(a,b)

cosine 0.47387356350132315
euclidean 3.5670745372772217
minkowski 3.5670745372772217
cityblock 77.864235
chebyshev 0.38387686


In [22]:
import redis
connection_pool = redis.ConnectionPool(host='192.168.1.227', port=6379, db=0)

# 创建 Redis 客户端实例，使用上面创建的连接池
r = redis.Redis(connection_pool=connection_pool,decode_responses=True)
# 定义一批键
keys = ['a', 'b', 'c', 'd']

# 使用 mget 命令批量查询
values = r.mget(keys)

# 打印结果
for key, value in zip(keys, values):
    print(f"Key: {key}, Value: {float(value) if value  else 'None'}")

Key: a, Value: 1.23
Key: b, Value: 0.414145
Key: c, Value: -0.51413526
Key: d, Value: None


In [19]:
data = {
    'a': 1.23,
    'b': 0.414145,
    'c': -.51413526
}
r.mset(data)

True

In [95]:
stream_name = "test_stream"
consumer_name = 'my_consumer'
group_name = 'my_group'
for i in range(1,3):
    r.xadd(stream_name, {"a": i, "b": i})
    

In [131]:

for stream, messages in r.xread({stream_name:"0-0"}, count=50,block=5000):
    for message_id, message in messages:
        print(f'Received message {message_id} with data: {message}')
        # print(message[b"a"])
        # r.xack(stream_name, "", message_id)
        # r.xdel(stream_name, message_id)

Received message b'1721094328313-0' with data: {b'_id': b'1040533-609303', b'SID': b'1040533', b'Stitle': b'Martin_McDonagh', b'EID': b'609303', b'Etitle': b'Nicholas_Hytner', b'weight': b'0.34160003850600884'}
Received message b'1721094328314-0' with data: {b'_id': b'893454-4287791', b'SID': b'893454', b'Stitle': b'Frank_Langella', b'EID': b'4287791', b'Etitle': b'Kathryn_Hahn', b'weight': b'0.34160003850600884'}
Received message b'1721094328314-1' with data: {b'_id': b'21643854-40186398', b'SID': b'21643854', b'Stitle': b'Far_East_Movement', b'EID': b'40186398', b'Etitle': b'Roar_(song)', b'weight': b'0.34160003850600884'}
Received message b'1721094328314-2' with data: {b'_id': b'394815-3601630', b'SID': b'394815', b'Stitle': b'Introduced_species', b'EID': b'3601630', b'Etitle': b"Townsend's_solitaire", b'weight': b'0.34160006720832875'}
Received message b'1721094328314-3' with data: {b'_id': b'62266-129585', b'SID': b'62266', b'Stitle': b'William_Wyler', b'EID': b'129585', b'Etitle'

In [138]:
group_name = 'my_group'
try:
    # r.xgroup_create(stream_name, group_name, id='0', mkstream=True)
    r.xgroup_create(stream_name, group_name, mkstream=True)
except redis.exceptions.ResponseError as e:
    # 如果组已经存在，则忽略错误
    if "BUSYGROUP Consumer Group name already exists" not in str(e):
        raise

In [132]:
consumer_name = 'myconsumer'
while True:
    try:
        print(1)
        messages = r.xreadgroup(group_name, consumer_name, {stream_name: '>'}, count=100, block=1000)
        for stream, msgs in messages:
            for msg_id, msg_data in msgs:
                print(f"Message ID: {msg_id}, Message Data: {msg_data}")
                # 在处理完消息后，可以确认消息已被处理
                r.xack(stream_name, group_name, msg_id)
    except Exception as e:
        print(f"Error reading messages: {e}")
        break

1
Error reading messages: NOGROUP No such key 'task_0' or consumer group 'my_group_1' in XREADGROUP with GROUP option


In [111]:
# collection = DATABASE.token_mathch_v20240712

import functools
import time

def tt(v):
    time.sleep(1)
    print(v)
    return v

tt = functools.lru_cache(maxsize=5)(tt)

tt(3)

3


3

In [139]:
consumer_name = f"consumer-1"
group_name = 'my_group'
stream_name = 'task_0'
# r.xgroup_create(stream_name, group_name, mkstream=True)

while True:
    try:
        messages = r.xreadgroup(group_name, consumer_name, {stream_name: '0-0'}, count=1, block=1000)
        if not messages:
            print("No new messages")
            break
        for stream, msgs in messages:
            for msg_id, msg_data in msgs:
                # yield msg_id, msg_data
                print(f"Message ID: {msg_id}, Message Data: {msg_data}")
                # time.sleep(1)
                # 在处理完消息后，可以确认消息已被处理
                # r.xack(stream_name, group_name, msg_id)
    except Exception as e:
        print(f"Error reading messages: {e}")
        break

KeyboardInterrupt: 

In [156]:
# r.xpending(stream_name, group_name)

In [162]:
import sys
import time

sys.path.append("../gpc_demo") 
sys.path.append("..") 
from utils import (
    query_distance
)


In [170]:

query_distance((123131,51235214,27298083,36645032,18963910 ),(18963910 ,123131,51235214,27298083,36645032))

[(27298083, 36645032, 0.5266786),
 (18963910, 27298083, 0.65231186),
 (18963910, 36645032, 0.7101743),
 (18963910, 27298083, 0.65231186),
 (18963910, 36645032, 0.7101743),
 (27298083, 36645032, 0.5266786)]