# Test

## Paper Collection

In [None]:
import os
import json

from paper_collect import PaperCollector

In [None]:
# The chat model and the embedding model read their key from the same
# environment variable.
llm_api_key = embed_api_key = os.getenv('GEMINI_API_KEY_3')
llm_model_name = "gemini-2.0-flash"
embed_model_name = "models/text-embedding-004"

# Topic plus the seed papers (by DOI and by title) handed to the collector.
research_topic = "llm literature review"
seed_dois = [
    # AutoSurvey: Large Language Models Can Automatically Write Surveys
    '10.48550/arXiv.2406.10252',
    # Generative Adversarial Reviews: When LLMs Become the Critic
    '10.48550/arXiv.2412.10415',
    # A Literature Review of Literature Reviews in Pattern Analysis and Machine Intelligence
    '10.48550/arXiv.2402.12928',
]
seed_titles = [
    'PaperRobot: Incremental Draft Generation of Scientific Ideas',
    'From Hypothesis to Publication: A Comprehensive Survey of AI-Driven Research Support Systems',
]

In [None]:
# Create the collector from the topic, seed papers and model settings defined
# in the configuration cell, with the from_dt/to_dt bounds and the
# fields_of_study filter given inline.
ps = PaperCollector(
    research_topic=research_topic,
    seed_paper_titles=seed_titles,
    seed_paper_dois=seed_dois,
    llm_api_key=llm_api_key,
    llm_model_name=llm_model_name,
    embed_api_key=embed_api_key,
    embed_model_name=embed_model_name,
    from_dt='2020-01-01',
    to_dt='2025-04-30',
    fields_of_study=['Computer Science'],
)

In [None]:
# Build the paper graph on the collector. The top-level `await` relies on this
# being a notebook cell (an event loop is already running there).
# NOTE(review): the meaning of each flag is defined by PaperCollector, which is
# not visible here — confirm against its documentation before changing values.
await ps.construct_paper_graph(
    search_citation = 'both',  # value used for this run; other accepted values not shown here
    search_author = True,
    find_recommend = True,
    if_related_topic = True,
    if_expanded_citations  = 'reference',  # value used for this run; other accepted values not shown here
    if_expanded_authors = True,
    if_add_similarity = True,
    similarity_threshold = 0.7,
    expanded_k_papers = 10,
    expanded_l_authors = 50,
)

## Basic Stats

In [None]:
# Number of entries in the collector's nodes_json.
len(ps.nodes_json)

In [None]:
# Number of entries in the collector's edges_json.
len(ps.edges_json)

In [None]:
# Short alias for the graph object held by the collector; the cells below
# query it through g.nodes / g.edges.
g = ps.pg.graph

In [None]:
# Node count of the graph itself (compare with len(ps.nodes_json)).
len(g.nodes)

In [None]:
# Edge count of the graph itself (compare with len(ps.edges_json)).
len(g.edges)

In [None]:
# Distinct nodeType values in the graph; None shows up when a node has no
# nodeType attribute.
{g.nodes[nid].get('nodeType') for nid in g.nodes}

In [None]:
# How many nodes there are of each nodeType.
from collections import Counter

node_types = [g.nodes[nid].get('nodeType') for nid in g.nodes]
counts = Counter(node_types)

# most_common() with no argument returns every (type, count) pair,
# ordered by count in descending order.
sorted_counts = counts.most_common()
sorted_counts

In [None]:
# How many edges there are of each relationshipType.
from collections import Counter

edge_types = [attrs.get('relationshipType') for _src, _dst, attrs in g.edges(data=True)]
print(set(edge_types))

counts = Counter(edge_types)

# most_common() with no argument returns every (type, count) pair,
# ordered by count in descending order.
sorted_counts = counts.most_common()
print(sorted_counts)

## Key Nodes

In [None]:
# Seed papers: Paper nodes whose from_seed attribute compares equal to True.
# The `== True` comparison is kept as-is so that other truthy values are
# still excluded exactly as before.
seed_paper_dois = [
    nid
    for nid in g.nodes
    if g.nodes[nid].get('from_seed') == True
    and g.nodes[nid].get('nodeType') == 'Paper'
]
# Attribute dicts of the same nodes, in the same order as the IDs.
seed_paper_nodes = [g.nodes[nid] for nid in seed_paper_dois]

In [None]:
# Show the node IDs selected as seed papers.
print(seed_paper_dois)

In [None]:
# Non-seed Paper nodes that have at least one outgoing CITES edge.
paper_w_ref_dois = []
paper_w_ref_nodes = []

for nid in g.nodes:
    node = g.nodes[nid]
    # Skip anything that is not a paper, and skip the seed papers themselves.
    if node.get('nodeType') != 'Paper' or nid in seed_paper_dois:
        continue
    has_reference = any(
        attrs.get('relationshipType') == 'CITES'
        for _src, _dst, attrs in g.out_edges(nid, data=True)
    )
    if has_reference:
        paper_w_ref_dois.append(nid)
        paper_w_ref_nodes.append(node)

print(paper_w_ref_dois)

In [None]:
# Keep only the papers-with-references that are not seed papers.
expanded_paper_w_ref_dois = [
    doi
    for doi in paper_w_ref_dois
    if doi not in seed_paper_dois
]

In [None]:
# Print the outgoing-CITES count of every expanded paper and keep those with
# more than two such edges.
filtered_dois = []
for doi in expanded_paper_w_ref_dois:
    ref_cnt = sum(
        attrs.get('relationshipType') == 'CITES'
        for _src, _dst, attrs in g.out_edges(doi, data=True)
    )
    print(doi, ref_cnt)
    if ref_cnt > 2:
        filtered_dois.append(doi)

In [None]:
# Spot check: all outgoing edges (with their attributes) of one specific paper.
g.out_edges('10.48550/arXiv.2408.16498', data=True)

In [None]:
# Print the attributes of every node this paper points to through a CITES edge.
paper_id = '10.48550/arXiv.2408.16498'
for target in g.successors(paper_id):
    if g[paper_id][target].get('relationshipType') == 'CITES':
        print(g.nodes[target])

In [None]:
# How many expanded papers passed the "more than two CITES edges" filter.
len(filtered_dois)

Check cross refs  
- most referred to
- predecessors of seed DOIs

In [None]:
# For every Paper node, record (node id, incoming CITES count,
# incoming SIMILAR_TO count).
paper_stat = []
for paper_id in g.nodes:
    if g.nodes[paper_id].get('nodeType') != 'Paper':
        continue
    cite_cnt = sum(
        attrs.get('relationshipType') == 'CITES'
        for _src, _dst, attrs in g.in_edges(paper_id, data=True)
    )
    sim_cnt = sum(
        attrs.get('relationshipType') == 'SIMILAR_TO'
        for _src, _dst, attrs in g.in_edges(paper_id, data=True)
    )
    paper_stat.append((paper_id, cite_cnt, sim_cnt))

In [None]:
import numpy as np

# Scratch check for averaging a list that may contain None.
# np.average cannot sum None with numbers (it raises TypeError), so the
# missing entries are filtered out before averaging.
sample_values = [1, None, 2]
sample_mean = np.average([v for v in sample_values if v is not None])
sample_mean

In [None]:
# Rank papers by their incoming CITES count inside this graph (second field of
# each paper_stat tuple) and report the top 20.
sorted_by_cite = sorted(paper_stat, key=lambda item: item[1], reverse=True)
print(sorted_by_cite[0:20])

for n, cite_cnt, _sim_cnt in sorted_by_cite[0:20]:
    # Paper-level attributes stored on the node.
    paper = g.nodes[n]
    title = paper.get('title')
    overall_cite_cnt = paper.get('citationCount')
    influential_cite_cnt = paper.get('influentialCitationCount')

    # Author info: h-index of every Author node that has an edge into this
    # paper. Only missing values are skipped; an h-index of 0 is a real value
    # and stays in the average.
    hindex_lst = []
    for u in g.predecessors(n):
        author = g.nodes[u]
        if author.get('nodeType') != 'Author':
            continue
        h_idx = author.get('hIndex')
        if h_idx is not None:
            hindex_lst.append(h_idx)
    # Averaging an empty list would give nan plus a RuntimeWarning, so report
    # None when no author h-index is available.
    h_index = np.average(hindex_lst) if hindex_lst else None

    paper_info = {"doi":n, "title":title,
                  "local_refs":cite_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

In [None]:
# Rank papers by their incoming SIMILAR_TO count inside this graph (third field
# of each paper_stat tuple) and report the top 20.
sorted_by_sim = sorted(paper_stat, key=lambda item: item[2], reverse=True)
print(sorted_by_sim[0:20])

for n, _cite_cnt, sim_cnt in sorted_by_sim[0:20]:
    # Paper-level attributes stored on the node.
    paper = g.nodes[n]
    title = paper.get('title')
    overall_cite_cnt = paper.get('citationCount')
    # Read per paper: without this line the report would reuse whatever value
    # an earlier cell left in influential_cite_cnt (or fail with NameError).
    influential_cite_cnt = paper.get('influentialCitationCount')

    # Author info: h-index of every Author node that has an edge into this
    # paper. Only missing values are skipped; an h-index of 0 is a real value
    # and stays in the average.
    hindex_lst = []
    for u in g.predecessors(n):
        author = g.nodes[u]
        if author.get('nodeType') != 'Author':
            continue
        h_idx = author.get('hIndex')
        if h_idx is not None:
            hindex_lst.append(h_idx)
    # Averaging an empty list would give nan plus a RuntimeWarning, so report
    # None when no author h-index is available.
    h_index = np.average(hindex_lst) if hindex_lst else None

    paper_info = {"doi":n, "title":title,
                  "local_sims":sim_cnt, "global_refs":overall_cite_cnt, "inf_cite_cnt":influential_cite_cnt,
                  "h_index": h_index}
    print(paper_info)

In [None]:
# Print every node that has no nodeType attribute.
# The loop variable is node_id rather than id so the builtin id() is not
# shadowed for the rest of the session.
# NOTE(review): `graph` is not assigned in any cell shown here — confirm it is
# still in scope, or whether ps.pg was meant.
for node_id in graph[0].graph.nodes:
    item = graph[0].graph.nodes[node_id]
    if item.get('nodeType') is None:
        print(node_id, item)

In [None]:
# Distinct node types. The list built by the node-type statistics cell is
# called node_types; the former spelling `nodes_types` is not defined anywhere
# in the visible cells.
set(node_types)

In [None]:
# Distinct relationshipType values; the direct key access raises KeyError if
# any edge lacks the attribute.
{attrs['relationshipType'] for _src, _dst, attrs in graph[0].graph.edges(data=True)}

In [None]:
# Node types and edge types that survive the filtering cells below.
filtered_node_labels = [
    "Paper",
    "Topic",
    "Author",
]
filtered_edges_labels = [
    "CITES",
    "DISCUSS",
    "WRITES",
]

In [None]:
# Alias for the graph that the filtering cells below modify in place.
# NOTE(review): `graph` is not assigned in any cell shown here — confirm it is
# still in scope.
G = graph[0].graph

In [None]:
# Drop every node whose nodeType is not in filtered_node_labels. Nodes without
# a nodeType are dropped too, because .get() then yields None.
# The IDs are collected first so the graph is never mutated while it is being
# iterated, and the name node_id avoids shadowing the builtin id().
nodes_to_drop = [
    node_id
    for node_id in G.nodes
    if G.nodes[node_id].get('nodeType') not in filtered_node_labels
]
G.remove_nodes_from(nodes_to_drop)  # modifies G in place

In [None]:
# Remove every edge whose relationshipType is not in filtered_edges_labels.
# Edges without the attribute are removed too, because .get() yields None.
# The (u, v) pairs are gathered before anything is removed, so iteration never
# runs over a changing graph; remove_edges_from silently skips a pair whose
# edge is already gone.
# NOTE(review): if G allows parallel edges, (u, v) alone does not say which of
# them gets removed — confirm G's graph type.
edges_to_drop = [
    (src, dst)
    for src, dst, attrs in G.edges(data=True)
    if attrs.get('relationshipType') not in filtered_edges_labels
]
G.remove_edges_from(edges_to_drop)  # modifies G in place

In [None]:

# Scratch removal of the edge (1, 3). Guarded so the cell is a no-op, rather
# than an error, when G has no such edge.
if G.has_edge(1, 3):
    G.remove_edge(1, 3)

In [None]:
# Node types still present in the graph; the direct key access raises KeyError
# if any node lacks nodeType.
{graph[0].graph.nodes[node]['nodeType'] for node in graph[0].graph.nodes}

In [None]:
# Scratch check: with a falsy flag the print below is skipped.
a = False
if a:
    print(111)

In [None]:
import networkx as nx
# Rebinds G to a new, empty graph used by the attribute experiments in the
# following cells; the filtered graph is no longer reachable through G.
G = nx.Graph()

In [None]:
# Add nodes 4 and 5, each with a "color" attribute.
G.add_nodes_from([(4, {"color": "red"}), (5, {"color": "green"})])

In [None]:
# Show node 4's attributes after the initial insert.
G.nodes[4]

In [None]:
# Add node 4 again with a different "color" to see how an existing attribute
# is treated.
G.add_nodes_from([(4, {"color": "blue"})])

In [None]:
# Show node 4's attributes after re-adding it with a new color.
G.nodes[4]

In [None]:
# Add node 4 once more, this time with only a "name" attribute, to see what
# happens to the attributes it already has.
G.add_nodes_from([(4, {"name": "No.4"})])

In [None]:
# Show node 4's attributes after re-adding it with a name.
G.nodes[4]