In [66]:
import pandas as pd
from elasticsearch import helpers
from elasticsearch import Elasticsearch
es = Elasticsearch(
    "http://133.11.72.200:9200/",
    # ca_certs="./ti06.crt",
    timeout=500,
    basic_auth=('oa_reader', 'five@godtruth')
)

def build_es_query_titleabstract_or(keywords,gte=1800,lte=2030):
    """Build Elasticsearch query for a specific keyword."""

    list_keywords = []
    for key in keywords:
        list_keywords = list_keywords + [  {
                        "match_phrase": {
                          "abstract": key
                        }
                      },
                                      {
                        "match_phrase": {
                          "title": key
                        }
                      }]
    
    return {
          "query": {
            "bool": {
              "must": [
                {
                  "range": {
                    "publication_year": {
                      "gte": gte,   
                      "lte": lte   
                    }
                  }
                },
                { "range" : { "cited_by_count" :{
                        "gte" : 0,
                    } }},   
                {
                  "bool": {
                    "should": list_keywords
                }
                }
              ]
            }
          }
        }
    

def build_es_query_topic(x):
    """Build Elasticsearch query for a topic."""
    topics = pd.read_pickle('/disks/qnap3/shared/openalex-24/data/master/topics.pickle')
    if x in topics.index:
        keyword = topics.loc[x].display_name
    else:
        keyword = topics.loc["https://openalex.org/"+str(x)].display_name
        
    return {
        "query": {
            "bool": {
            "must" : [
                { "match" : { "language" : "en" }},
                { "term" : { "topics": {"value":keyword} }}, 
                { "range" : { "cited_by_count" :{"gte" : 0,} }},       
                { "range" : { "publication_year" :{"gte" : 0,"lte":2025} }}
            ]

            }
        }
    }
def build_es_query_by_ids(id_list):
    return {
        "query": {
            "bool": {
                "must": [
                    {
                        "terms": {
                            "id": id_list
                        }
                    }
                ]
            }
        }
    }




def get_result(es_client, es_index, es_query, scroll_time="2m", batch_size=1000):
    all_hits = []
    response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)
    scroll_id = response['_scroll_id']
    
    while response['hits']['hits']:
        all_hits.extend(hit['_source'] for hit in response['hits']['hits'])
        response = es_client.scroll(scroll_id=scroll_id, scroll=scroll_time)
    
    es_client.clear_scroll(scroll_id=scroll_id)
    result = pd.DataFrame(all_hits)
    
    if 'cited_by_count' in result.columns:
        result = result.sort_values('cited_by_count')
    if 'id' in result.columns:
        result.index = result['id']
    
    journal = pd.read_pickle('/disks/qnap3/shared/openalex-24/data/journal/journal.pickle')
    result['journal'] = result['primary_location'].map(journal.get('display_name', {}))
    result['journal_if'] = result['primary_location'].map(journal.get('2yr_mean_citedness', {}))
    
    print(len(result))
    return result

  es = Elasticsearch(


# TOPICの全論文を取得

In [67]:
#例えば、https://openalex.org/topics/T10020の全論文を取得するとき　
papers = get_result(es,'oa_papers',build_es_query_topic("T10020") )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


KeyboardInterrupt: 

# 特定のキーワードを含む論文を取得

In [20]:
#例えば、"large language models",'llms','llm'のいずれかの単語を含む、2024年から2024年までの全論文を取得するとき　
papers = get_result(es,'oa_papers',build_es_query_titleabstract_or(["large language models",'llms','llm'],2024,2024) )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


31281
31281


Unnamed: 0_level_0,doi,fields,affliations,primary_topic,authorships,cited_by_count,concepts,subfields,publication_date,primary_location,...,abstract,language,text_embedding_256,topic_sub_clustering,citedby,reference,cited_by_year,new_score,journal,journal_if
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4403116255,,"[Computer Science, Engineering]","[[], [], []]",Visual Question Answering in Images and Videos,"[5036727505, 5101776086, 5109890544]",0,"[Benchmark (surveying), Computer science, Arti...","[Computer Vision and Pattern Recognition, Cont...",20241004,https://openalex.org/S4306400194,...,Generalizing language-conditioned robotic poli...,en,"[0.011329010128974915, 0.008168527856469154, 0...","[llm_10002, T10653_40001, T11714_60001]",,,,,arXiv (Cornell University),0.632485
4402111602,https://doi.org/10.21437/interspeech.2024-1700,"[Computer Science, Computer Science, Psychology]","[[], [], [], [], []]",Speech Recognition Technology,"[5092418416, 5113361537, 5091143209, 501238656...",0,"[Duration (music), Computer science, Speech re...","[Artificial Intelligence, Signal Processing, E...",20240901,,...,Audio-visual alignment after dubbing is a chal...,en,"[-0.024551065638661385, 0.05350632965564728, 0...",,,,,,,
4403964218,https://doi.org/10.48550/arxiv.2410.04834,[Engineering],"[[], [], [], [], [], []]",Iterative Learning Control in Engineering Prac...,"[5102006980, 5101808283, 5000871321, 510044172...",0,"[Simple (philosophy), Computer science, Contro...",[Control and Systems Engineering],20241007,https://openalex.org/S4306400194,...,Direct Preference Optimization (DPO) has emerg...,en,"[0.0264631025493145, 0.009874476119875908, 0.1...",,,,,,arXiv (Cornell University),0.632485
4396821954,https://doi.org/10.48550/arxiv.2405.05329,"[Computer Science, Computer Science, Decision ...","[[], [], []]",Natural Language Processing,"[5080091586, 5056246621, 5113880378]",0,"[Cache, Computer science, Key (lock), Scalabil...","[Artificial Intelligence, Artificial Intellige...",20240508,https://openalex.org/S4306400194,...,Large Language Model or LLM inference has two ...,en,"[0.014008685015141964, -0.025452211499214172, ...",,,,,,arXiv (Cornell University),0.632485
4399205174,https://doi.org/10.1609/icwsm.v18i1.31446,[Computer Science],"[[138847295], [], [], [], [138847295], [138847...",Automated Detection of Hate Speech and Offensi...,"[5001896240, 5027705731, 5094057195, 509895682...",0,"[Context (archaeology), Hessian matrix, Comput...",[Artificial Intelligence],20240528,https://openalex.org/S4387284482,...,This study describes a dataset that allows to ...,en,"[-0.00195204874034971, 0.041102878749370575, 0...","[llm_50003, T12262_20001]",,"[2099865247, 2302501749, 2912924812, 294757159...",,,Proceedings of the International AAAI Conferen...,2.546843


In [68]:
#例えば、"large language models",'llms','llm'のいずれかの単語を含む、2024年から2024年までの全論文を取得するとき　
papers = get_result(es,'oa_papers',build_es_query_titleabstract_or(["sports analytics",'sports data science','sports performance analysis','machine learning in sports','sports statistics'],2024,2024) )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


182
182


Unnamed: 0_level_0,abstract,title,concepts,id,fields,primary_location,is_oa,cited_by_count,topics,publication_date,...,subfields,domains,affliations,text_embedding_256,reference,cited_by_year,citedby,topic_sub_clustering,journal,journal_if
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4394566173,To explore the prospects and challenges of app...,THE PROGRESS IN THE RESEARCH OF MACHINE LEARNI...,"[Sports medicine, Computer science, Psychology...",4394566173,"[Health Professions, Medicine]",https://openalex.org/S4210172178,True,0,"[Machine Learning in Healthcare and Medicine, ...",20240408,...,"[Health Information Management, Health Informa...","[Health Sciences, Health Sciences]",[[4210095228]],"[-0.013616752810776234, 0.030714480206370354, ...",,,,,EPRA International Journal of Multidisciplinar...,0.454545
4396610510,Abstract: Due to the lack of review papers pub...,A Review of Sports Analytics,"[Analytics, Computer science, Data science]",4396610510,"[Business, Management and Accounting]",https://openalex.org/S2764566388,True,0,[Impact of Big Data Analytics on Business Perf...,20240430,...,[Management Information Systems],[Social Sciences],"[[226983648], [], [], [], [], [226983648]]","[-0.08760611712932587, 0.059393979609012604, 0...","[1964573888, 1986060983, 1987422996, 199627308...",,,,International Journal for Research in Applied ...,0.262504
4392144307,This MLSA 2023 proceedings deals with machine ...,Machine Learning and Data Mining for Sports An...,"[Basketball, Analytics, Football, Racket, Comp...",4392144307,"[Business, Management and Accounting, Computer...",https://openalex.org/S2764900261,True,0,[Impact of Big Data Analytics on Business Perf...,20240101,...,"[Management Information Systems, Computer Netw...","[Social Sciences, Physical Sciences, Physical ...","[[113456305], [99464096], [99464096], [4210139...","[-0.05282923951745033, 0.06061029061675072, 0....",,,,,Communications in computer and information sci...,0.673717
4391092065,More and more sports teams are looking to util...,IMPLEMENTATION OF THE MASTER'S PROGRAM IN SPOR...,"[Analytics, Computer science, Field (mathemati...",4391092065,"[Health Professions, Business, Management and ...",,True,0,"[Physical Education and Sports Science, Impact...",20240122,...,"[Physical Therapy, Sports Therapy and Rehabili...","[Health Sciences, Social Sciences]","[[4210163059], []]","[-0.030882054939866066, 0.11646179854869843, 0...",,,,,,
4391618811,More and more sports teams are looking to util...,IMPLEMENTATION OF THE MASTER'S PROGRAM IN SPOR...,"[Analytics, Computer science, Field (mathemati...",4391618811,"[Health Professions, Business, Management and ...",,True,0,"[Physical Education and Sports Science, Impact...",20240207,...,"[Physical Therapy, Sports Therapy and Rehabili...","[Health Sciences, Social Sciences]","[[4210163059], []]","[-0.030882054939866066, 0.11646179854869843, 0...",,,,,,


In [69]:
#例えば、"large language models",'llms','llm'のいずれかの単語を含む、2024年から2024年までの全論文を取得するとき　
papers = get_result(es,'oa_papers',build_es_query_titleabstract_or(["sports analytics",'sports data science','sports performance analysis','machine learning in sports','sports statistics'],2012,2024) )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


1004
1004


Unnamed: 0_level_0,fields,affliations,publication_year,domains,publication_date,subfields,affliations_list,title,id,language,...,concepts,topics,doi,text_embedding_256,citedby,reference,cited_by_year,topic_sub_clustering,journal,journal_if
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4394624495,"[Neuroscience, Medicine, Medicine]","[[141596103], [141596103], [80810150], [141596...",2024,"[Life Sciences, Health Sciences, Health Sciences]",20240301,"[Cognitive Neuroscience, Orthopedics and Sport...","[80810150, 141596103]",Is Task Analysis a Contributing Factor to Perf...,4394624495,en,...,"[Public health, Factor (programming language),...",[Computational Principles of Motor Control and...,https://doi.org/10.2478/pjst-2024-0001,"[0.06605333834886551, 0.09307071566581726, 0.0...",,"[1485705743, 1605965041, 1843174427, 197405661...",,,Polish Journal of Sport and Tourism,1.061224
4402247897,"[Computer Science, Business, Management and Ac...",[[]],2024,"[Physical Sciences, Social Sciences]",20240830,"[Computer Networks and Communications, Managem...",[],Employing Artificial Intelligence Applications...,4402247897,en,...,"[Computer science, Artificial intelligence, En...",[Log Analysis and System Performance Diagnosis...,https://doi.org/10.37575/h/edu/240036,"[-2.5489234758424573e-05, 0.012813206762075424...",,,,,Scientific Journal of King Faisal University H...,0.104478
3047428183,"[Medicine, Health Professions, Health Professi...",[[4210163048]],2020,"[Health Sciences, Health Sciences, Health Scie...",20200101,"[Physiology, Physical Therapy, Sports Therapy ...",[4210163048],Study on the Effect of Tennis on the Physical ...,3047428183,en,...,"[China, Socioeconomics, Environmental health, ...",[Effects of Physical Activity on Health Outcom...,https://doi.org/10.2991/assehr.k.200727.053,"[0.03593587130308151, 0.06506069749593735, 0.0...",,,,,Proceedings of the 2019 4th International Conf...,0.0
4285544226,"[Health Professions, Psychology, Psychology]","[[], []]",2021,"[Health Sciences, Social Sciences, Social Scie...",20210101,"[Physical Therapy, Sports Therapy and Rehabili...",[],THE PSYCHOLOGICAL CHARACTERISTICS OF YOUNG USE...,4285544226,en,...,"[CLARITY, Activity tracker, BitTorrent tracker...",[Physical Education and Sports Science Researc...,https://doi.org/10.17721/upj.2021.1(15).8,"[0.025173939764499664, 0.1276238113641739, -0....",,"[1971209549, 1979183008, 1995204909, 201142629...",,,Ukrainian Psychological Journal,0.139535
2971823064,"[Computer Science, Computer Science, Computer ...","[[98358874], [98358874], [98358874]]",2019,"[Physical Sciences, Physical Sciences, Physica...",20190906,"[Computer Vision and Pattern Recognition, Comp...",[98358874],Running Event Visualization using Videos from ...,2971823064,en,...,"[Computer science, Computer vision, Event (par...","[Automatic Video Summarization and Analysis, H...",,"[0.07035969942808151, 0.09912274777889252, 0.0...",,"[1530232915, 1565050238, 1976382093, 199804286...",,,arXiv (Cornell University),0.632485


In [71]:
reference_pairs = papers.dropna(subset=["reference"]).explode("reference")[["id", "reference"]]
reference_pairs.columns = ["citing_paper_id", "cited_paper_id"]

# 2. in-degree(cited_paper_id ごとに引用された回数）をカウント
in_degree_counts = reference_pairs["cited_paper_id"].value_counts().reset_index()
in_degree_counts.columns = ["referenced_paper_id", "in_degree"]

print(in_degree_counts.head())

   referenced_paper_id  in_degree
0           2194775991         30
1           1527788842         24
2           2064675550         20
3           2784056902         18
4           2911964244         17


In [72]:
reference_pairs = papers.dropna(subset=["reference"]).explode("reference")[["id", "reference"]]
reference_pairs.columns = ["citing_paper_id", "referenced_paper_id"]

# 各引用先に対して、引用元のIDをカンマ区切りにして結合
reference_summary = (
    reference_pairs
    .groupby("referenced_paper_id")["citing_paper_id"]
    .apply(lambda ids: ",".join(map(str, sorted(ids.unique())))) 
    .reset_index()
)
reference_summary.columns = ["referenced_paper_id", "in_degree_ids"]

# referenced_paper_id リストからメタデータを取得
ids_to_query = reference_summary["referenced_paper_id"].tolist()
query = build_es_query_by_ids(ids_to_query)
ref_metadata = get_result(es, "oa_papers", query)

ref_metadata = ref_metadata.reset_index(drop=True)

#merge
ref_with_in_degree = ref_metadata.merge(
    reference_summary,
    left_on="id",
    right_on="referenced_paper_id",
    how="left"
)

display(ref_with_in_degree[["id", "title", "cited_by_count", "journal", "in_degree_ids"]].head())

#ref_with_in_degree.to_csv("ref_with_in_degree.csv", index=False, encoding="utf-8")

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


12823


Unnamed: 0,id,title,cited_by_count,journal,in_degree_ids
0,4285719527,Deleted Work,0,,"2080234233,2187648190,2493431606,2751600782,29..."
1,2962324746,Game-to-Game Prediction of NBA Players’ Points...,0,,4404651385
2,4389749175,A multilayer network framework for soccer anal...,0,Chaos Solitons & Fractals,4404651385
3,4390484013,OmniScorer: Real-Time Shot Spot Analysis for C...,1,,4400352169
4,4393870032,"Enhancing Skills, Mood, and Performance in Ove...",1,Children,4401811174


In [81]:
def compute_in_degrees_by_generation(papers, es, max_generation=3):
    """
    被引用関係（reference）を n 世代まで再帰的にたどり、
    generation ごとの referenced_paper_id と in_degree_ids を返す。
    """
    from IPython.display import display

    all_generations = []
    visited_ids = set()

    # generation 0 の初期引用ペアを取得
    reference_pairs = papers.dropna(subset=["reference"]).explode("reference")[["id", "reference"]]
    reference_pairs.columns = ["citing_paper_id", "referenced_paper_id"]
    reference_pairs["generation"] = 0

    all_generations.append(reference_pairs.copy())

    current_ids = reference_pairs["referenced_paper_id"].unique().tolist()
    visited_ids.update(current_ids)

    for gen in range(1, max_generation + 1):
        print(f"\n== Generation {gen} ==")

        # 次世代に進む引用先のメタデータ取得
        query = build_es_query_by_ids(current_ids)
        ref_metadata = get_result(es, "oa_papers", query, include_fields=["id", "reference"]).reset_index(drop=True)

        # reference 展開（次の引用先へ）
        next_pairs = ref_metadata.dropna(subset=["reference"]).explode("reference")[["id", "reference"]]
        next_pairs.columns = ["citing_paper_id", "referenced_paper_id"]
        next_pairs = next_pairs[~next_pairs["referenced_paper_id"].isin(visited_ids)]
        next_pairs["generation"] = gen

        if next_pairs.empty:
            print("▶ No further references found.")
            break

        display(next_pairs.head())

        all_generations.append(next_pairs.copy())
        current_ids = next_pairs["referenced_paper_id"].unique().tolist()
        visited_ids.update(current_ids)

    # 各世代の引用ペアから、generation ごとの in_degree 情報を生成
    all_reference_summaries = []

    for df in all_generations:
        reference_summary = (
            df.groupby(["referenced_paper_id", "generation"])["citing_paper_id"]
            .apply(lambda ids: ",".join(map(str, sorted(ids.unique()))))
            .reset_index()
        )
        reference_summary.columns = ["referenced_paper_id", "generation", "in_degree_ids"]
        all_reference_summaries.append(reference_summary)

    return pd.concat(all_reference_summaries, ignore_index=True)

# 関数呼び出し（n世代分）
ref_summary_all = compute_in_degrees_by_generation(papers, es, max_generation=3)

# 結果の表示
from IPython.display import display
display(ref_summary_all.head(10))  # 必要に応じて .head(10) を増やせます



== Generation 1 ==


TypeError: get_result() got an unexpected keyword argument 'include_fields'

In [84]:
def get_lightweight_result(es_client, es_index, es_query, scroll_time="2m", batch_size=1000, include_fields=None):
    """
    Get results from Elasticsearch with optional field restriction.
    Lightweight version to avoid transferring unnecessary fields.
    """
    all_hits = []

    # 軽量化：必要なフィールドだけに制限
    if include_fields:
        es_query["_source"] = include_fields

    # 推奨される形で size を body に埋め込む
    es_query["size"] = batch_size

    response = es_client.search(index=es_index, body=es_query, scroll=scroll_time)
    scroll_id = response['_scroll_id']

    while response['hits']['hits']:
        all_hits.extend(hit['_source'] for hit in response['hits']['hits'])
        response = es_client.scroll(scroll_id=scroll_id, scroll=scroll_time)

    es_client.clear_scroll(scroll_id=scroll_id)
    return pd.DataFrame(all_hits)


In [85]:
def compute_in_degrees_by_generation(papers, es, max_generation=3):
    """
    被引用関係（reference）を n 世代まで再帰的にたどり、
    generation ごとの referenced_paper_id と in_degree_ids を返す。
    """
    from IPython.display import display

    all_generations = []
    visited_ids = set()

    # generation 0 の初期引用ペアを取得
    reference_pairs = papers.dropna(subset=["reference"]).explode("reference")[["id", "reference"]]
    reference_pairs.columns = ["citing_paper_id", "referenced_paper_id"]
    reference_pairs["generation"] = 0

    all_generations.append(reference_pairs.copy())

    current_ids = reference_pairs["referenced_paper_id"].unique().tolist()
    visited_ids.update(current_ids)

    for gen in range(1, max_generation + 1):
        print(f"\n== Generation {gen} ==")

        # 次世代に進む引用先のメタデータ取得（軽量化版）
        query = build_es_query_by_ids(current_ids)
        ref_metadata = get_lightweight_result(es, "oa_papers", query, include_fields=["id", "reference"]).reset_index(drop=True)

        # reference 展開（次の引用先へ）
        next_pairs = ref_metadata.dropna(subset=["reference"]).explode("reference")[["id", "reference"]]
        next_pairs.columns = ["citing_paper_id", "referenced_paper_id"]
        next_pairs = next_pairs[~next_pairs["referenced_paper_id"].isin(visited_ids)]
        next_pairs["generation"] = gen

        if next_pairs.empty:
            print("No further references found.")
            break

        display(next_pairs.head())

        all_generations.append(next_pairs.copy())
        current_ids = next_pairs["referenced_paper_id"].unique().tolist()
        visited_ids.update(current_ids)

    # 各世代の引用ペアから、generation ごとの in_degree 情報を生成
    all_reference_summaries = []

    for df in all_generations:
        reference_summary = (
            df.groupby(["referenced_paper_id", "generation"])["citing_paper_id"]
            .apply(lambda ids: ",".join(map(str, sorted(ids.unique()))))
            .reset_index()
        )
        reference_summary.columns = ["referenced_paper_id", "generation", "in_degree_ids"]
        all_reference_summaries.append(reference_summary)

    return pd.concat(all_reference_summaries, ignore_index=True)


# 実行：3世代まで引用関係をたどる
ref_summary_all = compute_in_degrees_by_generation(papers, es, max_generation=3)

# 結果を表示
from IPython.display import display
display(ref_summary_all.head(10))



== Generation 1 ==


Unnamed: 0,citing_paper_id,referenced_paper_id,generation
0,2897736987,1965637139,1
0,2897736987,2024605342,1
0,2897736987,2084583116,1
0,2897736987,2135058044,1
0,2897736987,2580107250,1



== Generation 2 ==


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'failed to create query: The number of terms [300737] used in the Terms Query request has exceeded the allowed maximum of [65536]. This maximum can be set by changing the [index.max_terms_count] index level setting.')