In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import sys

## Load journal and conference lists

Set of journals and conferences from [gov.pl](https://www.gov.pl/web/edukacja-i-nauka/nowy-rozszerzony-wykaz-czasopism-naukowych-i-recenzowanych-materialow-z-konferencji-miedzynarodowych) with impact metrics.

Get journal titles list

In [None]:
# load https://www.gov.pl/web/edukacja-i-nauka/nowy-rozszerzony-wykaz-czasopism-naukowych-i-recenzowanych-materialow-z-konferencji-miedzynarodowych

# load https://www.gov.pl/web/edukacja-i-nauka/nowy-rozszerzony-wykaz-czasopism-naukowych-i-recenzowanych-materialow-z-konferencji-miedzynarodowych
# sheet 2: 'conferences'

# https://www.aminer.org/citation v10

A collection of data on publications in the field of computer science from [dblp](https://paperswithcode.com/dataset/dblp)

**Note**
When we filter by *venue* against both lists, the program matches the vast majority of the records that are matched when we use *title* for publications and *venue* for conferences separately (the difference is about 18 rows).

### 1. Prepare paths and working directory

Ensure the project root is the working directory so that relative paths
and imports from the `src` package behave consistently with .py scripts.

In [None]:
# When the kernel starts inside the `notebooks` folder, step up one level so
# that relative data paths and `src` imports resolve from the project root.
# Compare the exact directory name: a plain suffix check would also fire for
# unrelated folders such as "old_notebooks" and move the kernel to the wrong place.
if os.path.basename(os.getcwd()) == 'notebooks':
    os.chdir('..')
    print(f"Zmieniono katalog roboczy na: {os.getcwd()}")

# Make the working directory importable so `from src... import ...` works.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

In [11]:
# Relative path of the ministerial journal/conference spreadsheet; passed to
# load_gov_data() and main() below.
INPUT_EXCEL = "data/external/Wykaz_dyscyplin_do_czasopism_i_materiałów_konferencyjnych.xlsx"
# Relative path of the directory holding the DBLP dump files (dblp-ref-*.json);
# passed to load_dblp_raw_data() and main() below.
DBLP_DIRECTORY = "data/external/dblp-ref-10"

### 2. Unit tests

Data loading tests

In [None]:
from src.data.make_dataset import load_gov_data, load_dblp_raw_data

In [None]:
# Smoke test: load both ministerial lookup dictionaries from the spreadsheet.
journal_lookup, conference_lookup = load_gov_data(INPUT_EXCEL)

# Report how many entries each lookup holds.
journal_count = len(journal_lookup)
conference_count = len(conference_lookup)
print(f"Liczba załadowanych czasopism: {journal_count}")
print(f"Liczba załadowanych konferencji: {conference_count}")

2025-12-28 08:44:56,741 - INFO - Loading ministerial journal and conference lists...


Liczba załadowanych czasopism: 26793
Liczba załadowanych konferencji: 1638
Przykładowe czasopismo: 2D Materials -> Punkty: {'Punkty': 140}


In [None]:
# Sanity check: print one entry to confirm the lookup maps a journal title to
# its score record.
# Fail with a clear message instead of an opaque IndexError when nothing loaded.
assert len(journal_lookup) > 0, "journal_lookup is empty - check INPUT_EXCEL"
# next(iter(...)) yields the first key without copying every key into a list.
example_journal = next(iter(journal_lookup.keys()))
print(f"Przykładowe czasopismo: {example_journal} -> Punkty: {journal_lookup[example_journal]}")

Przykładowe czasopismo: 2D Materials -> Punkty: {'Punkty': 140}


In [None]:
# Smoke test: load the raw DBLP publication records from DBLP_DIRECTORY.
raw_pubs = load_dblp_raw_data(DBLP_DIRECTORY)

# Report the number of loaded records.
dblp_count = len(raw_pubs)
print(f"Liczba załadowanych DBLP: {dblp_count}")

2025-12-28 08:46:44,345 - INFO - Loading DBLP publications from: ../data/external/dblp-ref-10
2025-12-28 08:46:44,346 - INFO - Processing file: dblp-ref-0.json
Loading dblp-ref-0.json: 1000000it [00:07, 142506.62it/s]
2025-12-28 08:46:51,422 - INFO - Processing file: dblp-ref-1.json
Loading dblp-ref-1.json: 1000000it [00:10, 97638.64it/s]
2025-12-28 08:47:01,678 - INFO - Processing file: dblp-ref-2.json
Loading dblp-ref-2.json: 1000000it [00:13, 71786.28it/s]
2025-12-28 08:47:15,756 - INFO - Processing file: dblp-ref-3.json
Loading dblp-ref-3.json: 79007it [00:00, 90002.74it/s] 

Liczba załadowanych DBLP: 3079007





In [None]:

###
# Merge simulation on a small sample of publications
###

# Number of leading DBLP records used for the dry run.
SAMPLE_SIZE = 100
test_sample = raw_pubs[:SAMPLE_SIZE]

test_results = []
for pub in test_sample:
    venue = pub.get('venue', '')
    score = None

    # The conference list is checked first, then the journal list; the two
    # lookups are read with different score keys.
    if venue in conference_lookup:
        score = conference_lookup[venue]['Liczba punktów']
    elif venue in journal_lookup:
        score = journal_lookup[venue]['Punkty']

    # Keep only publications whose venue produced a score.
    if score is not None:
        test_results.append({"title": pub.get("title"), "venue": venue, "gov_score": score})

# Explicit columns keep the frame's shape stable even when nothing matched.
test_df = pd.DataFrame(test_results, columns=["title", "venue", "gov_score"])
# Report against the real sample size: the slice may hold fewer than SAMPLE_SIZE records.
print(f"Znaleziono dopasowania dla {len(test_df)} z {len(test_sample)} artykułów.")
display(test_df.head())

Znaleziono dopasowania dla 4 z 100 artykułów.


Unnamed: 0,title,venue,gov_score
0,Software Evolution through Transformations.,Electronic Notes in Theoretical Computer Science,40
1,Context Dependent Automatic Textile Image Anno...,Journal of Advanced Computational Intelligence...,20
2,On the signed total chromatic number of a graph.,Ars Combinatoria,40
3,Traveling wave solutions of the n-dimensional ...,Applied Mathematics Letters,100


### 3. Run main step

Run main step - ETL step that loads two source datasets, merges them, and writes
the unified articles dataset to an intermediate CSV file.

In [None]:
from src.data.make_dataset import main

# Destination of the merged articles dataset (relative to the working directory).
OUTPUT_CSV = "data/interim/articles_with_score_df.csv"

# main() is defined outside this notebook; create the target folder up front so
# the multi-minute run cannot fail at the final save step on a missing directory.
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

# Full ETL run: loads both sources, merges them and writes OUTPUT_CSV.
main(INPUT_EXCEL, DBLP_DIRECTORY, OUTPUT_CSV)


2025-12-28 09:50:38,526 - INFO - Loading ministerial journal and conference lists...
2025-12-28 09:50:44,339 - INFO - Loading DBLP publications from: data/external/dblp-ref-10
2025-12-28 09:50:44,339 - INFO - Processing file: dblp-ref-0.json
Loading dblp-ref-0.json: 1000000it [00:06, 166088.49it/s]
2025-12-28 09:50:50,363 - INFO - Processing file: dblp-ref-1.json
Loading dblp-ref-1.json: 1000000it [00:10, 93859.23it/s]
2025-12-28 09:51:01,019 - INFO - Processing file: dblp-ref-2.json
Loading dblp-ref-2.json: 1000000it [00:41, 23894.57it/s]
2025-12-28 09:51:42,874 - INFO - Processing file: dblp-ref-3.json
Loading dblp-ref-3.json: 79007it [00:00, 114629.72it/s]
2025-12-28 09:51:43,570 - INFO - Merging DBLP data with ministerial scores...
Merging: 100%|██████████| 3079007/3079007 [00:51<00:00, 59468.25it/s] 
2025-12-28 09:52:39,881 - INFO - Saving 850406 records to data/interim/articles_with_score_df.csv
2025-12-28 09:52:50,872 - INFO - Dataset construction completed successfully.
