In [1]:
import pandas as pd
import numpy as np
import json
import os

## Load journal and conferences lists

Set of journal and conferences from [gov.pl](https://www.gov.pl/web/edukacja-i-nauka/nowy-rozszerzony-wykaz-czasopism-naukowych-i-recenzowanych-materialow-z-konferencji-miedzynarodowych) with impact metrics

In [2]:
# load https://www.gov.pl/web/edukacja-i-nauka/nowy-rozszerzony-wykaz-czasopism-naukowych-i-recenzowanych-materialow-z-konferencji-miedzynarodowych
# sheet 1: 'journals'
gov_articles_data = pd.read_excel("..//data//external//Wykaz_dyscyplin_do_czasopism_i_materiałów_konferencyjnych.xlsx", 0, header=None)

gov_articles_data.columns = np.where(gov_articles_data.iloc[0].notna(), 
                                     gov_articles_data.iloc[0].astype(str) + ' - ' + gov_articles_data.iloc[1].astype(str), 
                                     gov_articles_data.iloc[1])
gov_articles_data = gov_articles_data[2:]
gov_articles_data = gov_articles_data.reset_index(drop=True)

gov_articles_data = gov_articles_data.drop('Lp.', axis=1)

display(gov_articles_data)

Unnamed: 0,Unikatowy Identyfikator Czasopisma,Tytuł 1,issn,e-issn,Tytuł 2,issn.1,e-issn.1,Punkty,archeologia - 101,filozofia - 102,...,prawo kanoniczne - 510,psychologia - 511,astronomia - 601,informatyka - 602,matematyka - 603,nauki biologiczne - 604,nauki chemiczne - 605,nauki fizyczne - 606,nauki o Ziemi i środowisku - 607,nauki teologiczne - 701
0,1,2D Materials,2053-1583,2053-1583,2D Materials,,2053-1583,140,,,...,,,,,,,x,x,,
1,2,3 Biotech,2190-572X,2190-5738,3 Biotech,2190-572X,2190-5738,70,,,...,,,,,,x,,,,
2,3,3C Empresa,2254-3376,2254-3376,,,,20,,,...,,,,,,,,,,
3,4,3c Tecnologia,2254-4143,2254-4143,,,,20,,,...,,,,,,,,,,
4,5,3C Tic,2254-6529,2254-6529,,,,20,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31428,499029,Open Political Science,,2543-8042,,,,20,,,...,,,,,,,,,,
31429,499067,Językoznawstwo,1897-0389,2391-5137,Linguistics,,,20,,,...,,,,,,,,,,
31430,499108,Człowiek i Społeczeństwo,0239-3271,,Man and Society,,,40,,,...,,,,,,,,,,
31431,499116,Com.press,,2545-2320,,,,20,,,...,,,,,,,,,,


In [3]:
# load https://www.gov.pl/web/edukacja-i-nauka/nowy-rozszerzony-wykaz-czasopism-naukowych-i-recenzowanych-materialow-z-konferencji-miedzynarodowych
# sheet 1: 'conferences'
gov_conferences_data = pd.read_excel("..//data//external//Wykaz_dyscyplin_do_czasopism_i_materiałów_konferencyjnych.xlsx", 1)
gov_conferences_data = gov_conferences_data.drop('LP.', axis=1)

gov_conferences_data['Przypisane dyscypliny naukowe'] = gov_conferences_data['Przypisane dyscypliny naukowe'].replace('\n', ' ', regex=True)

display(gov_conferences_data)

Unnamed: 0,Nazwa konferencji,Liczba punktów,Przypisane dyscypliny naukowe
0,3-D Digital Imaging and Modelling [3DIM],20,informatyka techniczna i telekomunikacja; info...
1,A Satellite workshop on Formal Approaches to T...,20,informatyka techniczna i telekomunikacja; info...
2,Accounting and Finance Association of Australi...,20,informatyka techniczna i telekomunikacja; info...
3,ACIS Conference on Software Engineering Resear...,20,informatyka techniczna i telekomunikacja; info...
4,ACM Annual Computer Science Conference [CSC],20,informatyka techniczna i telekomunikacja; info...
...,...,...,...
1633,International Conference: Sciences of Electron...,20,informatyka techniczna i telekomunikacja; info...
1634,The Symposium of Combinatorial Search [SoCS],20,informatyka techniczna i telekomunikacja; info...
1635,The International Conference on Verification a...,20,informatyka techniczna i telekomunikacja; info...
1636,International Teletraffic Congress [ITC],20,informatyka techniczna i telekomunikacja; info...


## Load collection of computer science publications

A collection of data on publications in the field of computer science from [dblp](https://paperswithcode.com/dataset/dblp)

In [4]:
# load https://www.aminer.org/citation v10
directory_path = "../data/external/dblp-ref-10"  # Zastąp to ścieżką do katalogu, który chcesz przeszukać

try:
    file_names = os.listdir(directory_path)

    # Wyświetl nazwy plików
    for file_name in file_names:
        # print(file_name)
        with open("..//data//external//dblp-ref-10//dblp-ref-0.json") as input_file:
            # head = [next(input_file) for _ in range(2)]
            head = "[" + ", ".join(next(input_file) for _ in range(2)) + "]"
            data = json.loads(head)
            df = pd.json_normalize(data)
            display(df)
        
except FileNotFoundError:
    print(f"Katalog '{directory_path}' nie istnieje.")

except PermissionError:
    print(f"Brak uprawnień do odczytu katalogu '{directory_path}'.")

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013,00127ee2-cb05-48ce-bc49-9de556b93346
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",50,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013,00127ee2-cb05-48ce-bc49-9de556b93346
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",50,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013,00127ee2-cb05-48ce-bc49-9de556b93346
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",50,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,The purpose of this study is to develop a lear...,"[Makoto Satoh, Ryo Muramatsu, Mizue Kayama, Ka...",0,"[51c7e02e-f5ed-431a-8cf5-f761f266d4be, 69b625b...",Preliminary Design of a Network Protocol Learn...,international conference on human-computer int...,2013,00127ee2-cb05-48ce-bc49-9de556b93346
1,This paper describes the design and implementa...,"[Gareth Beale, Graeme Earl]",50,"[10482dd3-4642-4193-842f-85f3b70fcf65, 3133714...",A methodology for the physically accurate visu...,visual analytics science and technology,2011,001c58d3-26ad-46b3-ab3a-c1e557d16821


## Concatenate datasets

In [None]:
cross_df = pd.DataFrame({'id': [], 'title': [], 'year': [], 'references': [], 'authors': [], 'n_citation': [], 'gov_score': []})

# prepare all titles list
articles_data = gov_articles_data[['Tytuł 2', 'Punkty']].dropna().values
conferences_data = gov_conferences_data[['Nazwa konferencji', 'Liczba punktów']].dropna().values

articles_and_conferences_list = np.concatenate((articles_data, conferences_data))

directory_path = "../data/external/dblp-ref-10"  # Zastąp to ścieżką do katalogu, który chcesz przeszukać

try:
    file_names = os.listdir(directory_path)

    for file_name in file_names:
        print(file_name)

        with open("..//data//external//dblp-ref-10//" + file_name) as input_file:
            # head = [next(input_file) for _ in range(2)]
            for line in input_file:
                head = "[" + line + "]"
                tmp_df = pd.json_normalize(json.loads(head))
                
                row_indices = np.where(articles_and_conferences_list[:, 0] == tmp_df.iloc[0]["title"])[0]
                if len(row_indices) > 0:
                    display(tmp_df)
                    cross_df = pd.concat([cross_df,
                                        pd.DataFrame({
                                            'id': [tmp_df.iloc[0]["id"]],
                                            'title': [tmp_df.iloc[0]["title"]],
                                            'year': [tmp_df.iloc[0]["year"]],
                                            'references': [tmp_df.iloc[0]["references"] if "references" in tmp_df.columns else np.nan],
                                            'authors': [tmp_df.iloc[0]["authors"] if "authors" in tmp_df.columns else np.nan],
                                            'n_citation': [tmp_df.iloc[0]["n_citation"] if "n_citation" in tmp_df.columns else np.nan],
                                            'gov_score': [articles_and_conferences_list[row_indices[0], 1]]
                                        })
                                        ],axis=1)

except FileNotFoundError:
    print(f"Katalog '{directory_path}' nie istnieje.")
 
except PermissionError:
    print(f"Brak uprawnień do odczytu katalogu '{directory_path}'.")

display(cross_df)

dblp-ref-0.json


Unnamed: 0,authors,n_citation,title,venue,year,id
0,"[Alain Bensoussan, Charles S. Tapiero]",50,Risk and Decision Analysis,,2009,aa1ea2bb-05b7-4064-9bd9-936bc9db2dfe


Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,Before delving into the role of information th...,[Carlos Gershenson],50,"[1327c5f7-afff-42d9-b2a5-dd63b5fefb50, 297ae24...",Information and Computation,arXiv: Information Theory,2013,b001e752-b4e4-4101-b38a-40e857741b96


Unnamed: 0,authors,n_citation,title,venue,year,id
0,"[Montek Singh, Steven M. Nowick]",50,ACM Journal on Emerging Technologies in Comput...,ACM Transactions on Design Automation of Elect...,2010,116c1176-aa13-4716-a122-a8b1d2fc16f2


Unnamed: 0,authors,n_citation,title,venue,year,id
0,"[S. Sitharama Iyengar, Richard R. Brooks, Günt...",175,International Journal of Distributed Sensor Ne...,International Journal of Distributed Sensor Ne...,2008,3a86606e-0f1e-4ed8-a174-df3fc8e0807f


dblp-ref-1.json


Unnamed: 0,abstract,authors,n_citation,title,venue,year,id
0,The editors of BMC Medical Informatics and Dec...,[Irene Pala],112,BMC Medical Informatics and Decision Making,BMC Medical Informatics and Decision Making,2014,4e1dace0-b5df-44b7-bf55-5362f95b822d


Unnamed: 0,authors,n_citation,title,venue,year,id
0,[William L. Jorgensen],109,Journal of Chemical Information and Modeling,Journal of Chemical Information and Modeling,2005,55e4a5b9-3248-462d-abe2-731f456433e0


Unnamed: 0,authors,n_citation,title,venue,year,id
0,[Raymond Lister],0,Computer Science Education,Computer Science Education,2008,ad3922db-ba22-465c-a5ed-da0eb8b77511


## Save data

Save joined set of atricles and conferences list with scores

In [None]:
cross_df.to_csv(os.path.join("../data/interim", "articles_with_score_df.csv"))