In [None]:
import os

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch import helpers
# Elasticsearch client shared by every cell below.
# The host and credentials can be overridden through the ES_HOST / ES_USER /
# ES_PASSWORD environment variables so the notebook can be re-targeted without
# editing code.
# NOTE(review): the literal fallbacks are kept only so existing runs behave
# exactly as before; secrets should not live in the notebook — rotate this
# password and drop the fallbacks once the environment variables are set.
es = Elasticsearch(
    os.environ.get("ES_HOST", "http://133.11.72.200:9200/"),
    # ca_certs="./ti06.crt",
    timeout=500,
    basic_auth=(
        os.environ.get("ES_USER", 'oa_reader'),
        os.environ.get("ES_PASSWORD", 'five@godtruth'),
    )
)

def build_es_query_titleabstract_or(keywords,gte=1800,lte=2030):
    """Build an Elasticsearch query matching any phrase in the title or abstract.

    Args:
        keywords: Iterable of phrases. A single ``str`` is treated as one
            phrase instead of being iterated character by character.
        gte: Inclusive lower bound applied to ``publication_year``.
        lte: Inclusive upper bound applied to ``publication_year``.

    Returns:
        dict: Query body whose ``bool.must`` holds, in order, the
        ``publication_year`` range, a ``cited_by_count >= 0`` range, and a
        nested ``bool.should`` with one ``match_phrase`` clause on
        ``abstract`` and one on ``title`` for every phrase.

    Note:
        An empty ``keywords`` iterable yields an empty ``should`` list.
        NOTE(review): presumably Elasticsearch then applies no keyword
        restriction at all — confirm before relying on it.
    """
    # A bare string would otherwise produce one clause per character.
    if isinstance(keywords, str):
        keywords = [keywords]

    # For each phrase: the abstract clause first, then the title clause.
    list_keywords = [
        {"match_phrase": {field: key}}
        for key in keywords
        for field in ("abstract", "title")
    ]

    return {
        "query": {
            "bool": {
                "must": [
                    {
                        "range": {
                            "publication_year": {
                                "gte": gte,
                                "lte": lte
                            }
                        }
                    },
                    {"range": {"cited_by_count": {"gte": 0}}},
                    {
                        "bool": {
                            "should": list_keywords
                        }
                    }
                ]
            }
        }
    }
    

def build_es_query_topic(x, gte=0, lte=2025,
                         topics_path='/disks/qnap3/shared/openalex-24/data/master/topics.pickle'):
    """Build an Elasticsearch query selecting English papers of one topic.

    The topic's ``display_name`` is looked up in the pickled topics table and
    used as the value of a ``term`` filter on the ``topics`` field.

    Args:
        x: Topic key. It is used as-is when present in the table's index,
            otherwise it is retried with the ``https://openalex.org/`` prefix
            (so both a full URL and a short id such as ``"T10020"`` work).
        gte: Inclusive lower bound applied to ``publication_year``.
        lte: Inclusive upper bound applied to ``publication_year``.
        topics_path: Location of the pickled topics table; it must be indexed
            by topic key and expose a ``display_name`` column.

    Returns:
        dict: Query body with a ``bool.must`` list of four clauses: language
        ``en``, the topic term, ``cited_by_count >= 0`` and the
        ``publication_year`` range.

    Raises:
        KeyError: If neither ``x`` nor its prefixed form is in the index.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point
    # ``topics_path`` at files from a trusted source.
    topics = pd.read_pickle(topics_path)
    if x in topics.index:
        keyword = topics.loc[x].display_name
    else:
        keyword = topics.loc["https://openalex.org/"+str(x)].display_name

    return {
        "query": {
            "bool": {
                "must": [
                    {"match": {"language": "en"}},
                    {"term": {"topics": {"value": keyword}}},
                    {"range": {"cited_by_count": {"gte": 0}}},
                    {"range": {"publication_year": {"gte": gte, "lte": lte}}}
                ]
            }
        }
    }



def get_result(es_client, es_index, es_query, scroll_time="2m", batch_size=1000,
               journal_path='/disks/qnap3/shared/openalex-24/data/journal/journal.pickle'):
    """Run a query through the scroll API and return every hit as a DataFrame.

    Args:
        es_client: Client exposing ``search``, ``scroll`` and ``clear_scroll``.
        es_index: Name of the index to search.
        es_query: Query body, e.g. the output of one of the ``build_es_query_*``
            helpers.
        scroll_time: Keep-alive passed with every search/scroll request.
        batch_size: Number of hits requested per page.
        journal_path: Location of the pickled journal table used to add the
            ``journal`` and ``journal_if`` columns.

    Returns:
        pandas.DataFrame: One row per hit ``_source``. Rows are sorted
        ascending by ``cited_by_count`` and indexed by ``id`` when those
        columns exist. ``journal`` / ``journal_if`` are mapped from
        ``primary_location``; they are NaN columns when the hits carry no
        ``primary_location`` (including the empty-result case).

    Side effects:
        Prints the number of rows.
    """
    all_hits = []
    # NOTE(review): the saved cell outputs show a warning pointing at this
    # call — possibly a deprecation of ``body=``; confirm against the installed
    # client version before changing it.
    response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)
    scroll_id = response['_scroll_id']

    try:
        while response['hits']['hits']:
            all_hits.extend(hit['_source'] for hit in response['hits']['hits'])
            response = es_client.scroll(scroll_id=scroll_id, scroll=scroll_time)
            # The id may change between pages; always continue with the
            # most recently returned one.
            if '_scroll_id' in response:
                scroll_id = response['_scroll_id']
    finally:
        # Release the server-side scroll context even if paging failed.
        es_client.clear_scroll(scroll_id=scroll_id)

    result = pd.DataFrame(all_hits)

    if 'cited_by_count' in result.columns:
        result = result.sort_values('cited_by_count')
    if 'id' in result.columns:
        result.index = result['id']

    if 'primary_location' in result.columns:
        # NOTE(review): unpickling can execute arbitrary code; only point
        # ``journal_path`` at files from a trusted source.
        journal = pd.read_pickle(journal_path)
        result['journal'] = result['primary_location'].map(journal.get('display_name', {}))
        result['journal_if'] = result['primary_location'].map(journal.get('2yr_mean_citedness', {}))
    else:
        # No hits (or no location field): keep the output schema stable
        # instead of failing on the missing column.
        result['journal'] = float('nan')
        result['journal_if'] = float('nan')

    print(len(result))
    return result

# TOPICの全論文を取得

In [2]:
# Example: fetch every paper belonging to the topic https://openalex.org/topics/T10020
papers = get_result(es,'oa_papers',build_es_query_topic("T10020") )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


226168
226168


Unnamed: 0_level_0,doi,primary_topic,topics,fields,publication_year,subfields,language,id,publication_date,affliations_list,...,is_oa,title,authorships,citedby,reference,cited_by_year,text_embedding_256,topic_sub_clustering,journal,journal_if
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2356823873,,Quantum Information and Computation,"[Quantum Information and Computation, Quantum ...","[Computer Science, Computer Science, Physics a...",2006,"[Artificial Intelligence, Artificial Intellige...",en,2356823873,20060101,[4210126638],...,False,Secure direct communication via quantum entang...,[5101433810],,,,"[0.05899946391582489, -0.004938934929668903, -...",,,
2352630923,,Quantum Information and Computation,"[Quantum Information and Computation, Quantum ...","[Computer Science, Computer Science, Physics a...",2007,"[Artificial Intelligence, Artificial Intellige...",en,2352630923,20070101,[1793135],...,False,Simple Scheme for Generating n-qubit W State V...,[5014996251],,,,"[-0.018001431599259377, -0.01619630865752697, ...",,Journal of Huaiyin Teachers College,0.0
2352716238,,Quantum Information and Computation,"[Quantum Information and Computation, Semicond...","[Computer Science, Physics and Astronomy, Comp...",2006,"[Artificial Intelligence, Atomic and Molecular...",en,2352716238,20060101,[143868143],...,False,Scheme for teleportation of an unknown tripart...,[5017507932],,,,"[0.04118983820080757, -0.016310667619109154, -...",,Journal of Anhui University,0.0
2352737489,,Foundations of Quantum Mechanics and Interpret...,[Foundations of Quantum Mechanics and Interpre...,"[Physics and Astronomy, Physics and Astronomy,...",2003,"[Atomic and Molecular Physics, and Optics, Ato...",en,2352737489,20030101,[],...,False,Two Photon Width of η_c,[5103010721],,,,"[0.01876791939139366, 0.03016684018075466, -0....",,High Energy Physics and Nuclear Physics,0.0
2352845061,,Slow Light Propagation and Quantum Memory,"[Slow Light Propagation and Quantum Memory, Qu...","[Physics and Astronomy, Computer Science, Engi...",2005,"[Atomic and Molecular Physics, and Optics, Art...",en,2352845061,20050101,[],...,False,Scheme for Implementing Quantum State Transfer...,"[5005615854, 5051458868]",,,,"[-0.04326479136943817, 0.1029047891497612, -0....",,Journal of Huaihua University,0.0


# 特定のキーワードを含む論文を取得

In [20]:
# Example: fetch every paper published from 2024 through 2024 whose title or abstract
# contains any of the phrases "large language models", 'llms', 'llm'
papers = get_result(es,'oa_papers',build_es_query_titleabstract_or(["large language models",'llms','llm'],2024,2024) )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


31281
31281


Unnamed: 0_level_0,doi,fields,affliations,primary_topic,authorships,cited_by_count,concepts,subfields,publication_date,primary_location,...,abstract,language,text_embedding_256,topic_sub_clustering,citedby,reference,cited_by_year,new_score,journal,journal_if
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4403116255,,"[Computer Science, Engineering]","[[], [], []]",Visual Question Answering in Images and Videos,"[5036727505, 5101776086, 5109890544]",0,"[Benchmark (surveying), Computer science, Arti...","[Computer Vision and Pattern Recognition, Cont...",20241004,https://openalex.org/S4306400194,...,Generalizing language-conditioned robotic poli...,en,"[0.011329010128974915, 0.008168527856469154, 0...","[llm_10002, T10653_40001, T11714_60001]",,,,,arXiv (Cornell University),0.632485
4402111602,https://doi.org/10.21437/interspeech.2024-1700,"[Computer Science, Computer Science, Psychology]","[[], [], [], [], []]",Speech Recognition Technology,"[5092418416, 5113361537, 5091143209, 501238656...",0,"[Duration (music), Computer science, Speech re...","[Artificial Intelligence, Signal Processing, E...",20240901,,...,Audio-visual alignment after dubbing is a chal...,en,"[-0.024551065638661385, 0.05350632965564728, 0...",,,,,,,
4403964218,https://doi.org/10.48550/arxiv.2410.04834,[Engineering],"[[], [], [], [], [], []]",Iterative Learning Control in Engineering Prac...,"[5102006980, 5101808283, 5000871321, 510044172...",0,"[Simple (philosophy), Computer science, Contro...",[Control and Systems Engineering],20241007,https://openalex.org/S4306400194,...,Direct Preference Optimization (DPO) has emerg...,en,"[0.0264631025493145, 0.009874476119875908, 0.1...",,,,,,arXiv (Cornell University),0.632485
4396821954,https://doi.org/10.48550/arxiv.2405.05329,"[Computer Science, Computer Science, Decision ...","[[], [], []]",Natural Language Processing,"[5080091586, 5056246621, 5113880378]",0,"[Cache, Computer science, Key (lock), Scalabil...","[Artificial Intelligence, Artificial Intellige...",20240508,https://openalex.org/S4306400194,...,Large Language Model or LLM inference has two ...,en,"[0.014008685015141964, -0.025452211499214172, ...",,,,,,arXiv (Cornell University),0.632485
4399205174,https://doi.org/10.1609/icwsm.v18i1.31446,[Computer Science],"[[138847295], [], [], [], [138847295], [138847...",Automated Detection of Hate Speech and Offensi...,"[5001896240, 5027705731, 5094057195, 509895682...",0,"[Context (archaeology), Hessian matrix, Comput...",[Artificial Intelligence],20240528,https://openalex.org/S4387284482,...,This study describes a dataset that allows to ...,en,"[-0.00195204874034971, 0.041102878749370575, 0...","[llm_50003, T12262_20001]",,"[2099865247, 2302501749, 2912924812, 294757159...",,,Proceedings of the International AAAI Conferen...,2.546843


In [19]:
# Example: fetch every paper published from 2024 through 2024 whose title or abstract
# contains any of the sports-analytics phrases listed below
papers = get_result(es,'oa_papers',build_es_query_titleabstract_or(["sports analytics",'sports data science','sports performance analysis','machine learning in sports','sports statistics'],2024,2024) )
print(len(papers))
papers.head()

  response = es_client.search(index=es_index, body=es_query, scroll=scroll_time, size=batch_size)


182
182


Unnamed: 0_level_0,abstract,title,concepts,id,fields,primary_location,is_oa,cited_by_count,topics,publication_date,...,subfields,domains,affliations,text_embedding_256,reference,cited_by_year,citedby,topic_sub_clustering,journal,journal_if
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4394566173,To explore the prospects and challenges of app...,THE PROGRESS IN THE RESEARCH OF MACHINE LEARNI...,"[Sports medicine, Computer science, Psychology...",4394566173,"[Health Professions, Medicine]",https://openalex.org/S4210172178,True,0,"[Machine Learning in Healthcare and Medicine, ...",20240408,...,"[Health Information Management, Health Informa...","[Health Sciences, Health Sciences]",[[4210095228]],"[-0.013616752810776234, 0.030714480206370354, ...",,,,,EPRA International Journal of Multidisciplinar...,0.454545
4396610510,Abstract: Due to the lack of review papers pub...,A Review of Sports Analytics,"[Analytics, Computer science, Data science]",4396610510,"[Business, Management and Accounting]",https://openalex.org/S2764566388,True,0,[Impact of Big Data Analytics on Business Perf...,20240430,...,[Management Information Systems],[Social Sciences],"[[226983648], [], [], [], [], [226983648]]","[-0.08760611712932587, 0.059393979609012604, 0...","[1964573888, 1986060983, 1987422996, 199627308...",,,,International Journal for Research in Applied ...,0.262504
4392144307,This MLSA 2023 proceedings deals with machine ...,Machine Learning and Data Mining for Sports An...,"[Basketball, Analytics, Football, Racket, Comp...",4392144307,"[Business, Management and Accounting, Computer...",https://openalex.org/S2764900261,True,0,[Impact of Big Data Analytics on Business Perf...,20240101,...,"[Management Information Systems, Computer Netw...","[Social Sciences, Physical Sciences, Physical ...","[[113456305], [99464096], [99464096], [4210139...","[-0.05282923951745033, 0.06061029061675072, 0....",,,,,Communications in computer and information sci...,0.673717
4391092065,More and more sports teams are looking to util...,IMPLEMENTATION OF THE MASTER'S PROGRAM IN SPOR...,"[Analytics, Computer science, Field (mathemati...",4391092065,"[Health Professions, Business, Management and ...",,True,0,"[Physical Education and Sports Science, Impact...",20240122,...,"[Physical Therapy, Sports Therapy and Rehabili...","[Health Sciences, Social Sciences]","[[4210163059], []]","[-0.030882054939866066, 0.11646179854869843, 0...",,,,,,
4391618811,More and more sports teams are looking to util...,IMPLEMENTATION OF THE MASTER'S PROGRAM IN SPOR...,"[Analytics, Computer science, Field (mathemati...",4391618811,"[Health Professions, Business, Management and ...",,True,0,"[Physical Education and Sports Science, Impact...",20240207,...,"[Physical Therapy, Sports Therapy and Rehabili...","[Health Sciences, Social Sciences]","[[4210163059], []]","[-0.030882054939866066, 0.11646179854869843, 0...",,,,,,


In [None]:
# Example: fetch every paper published from 2012 through 2024 whose title or abstract
# contains any of the sports-analytics phrases listed below
papers = get_result(es,'oa_papers',build_es_query_titleabstract_or(["sports analytics",'sports data science','sports performance analysis','machine learning in sports','sports statistics'],2012,2024) )
print(len(papers))
papers.head()