Download of the OpenAIRE Graph Dataset:

1. Download the files `dataset_*.tar` from the version of your choice [here](https://doi.org/10.5281/zenodo.3516917).
2. Unpack in the following steps:
    ```
    unar dataset_1.tar
    cd dataset
    cat part-00*.json.gz > dataset.json.gz
    unar dataset.json.gz
    ```

The code cells are meant to be executed one after another, from top to bottom.

### 0. Preparation

In [None]:
import json
import pickle
import random
from pathlib import Path

from helper_openaire_graph_dataset import get_identifier, sort_by_provider
from threaded_metadata_harvester import metadata_harvester
from helper_convert_tinydb_to_sqlite3 import convert_tinydb_to_sqlite3
from helper_quantile import calculate_stats
from helper_geopackage import count_bboxes, create_geopackage
from helper_convert_sqlite3_to_csv import create_csv

Definitions
- `openaire_graph_datasets` contains the OpenAIRE Graph datasets from which the identifiers are extracted.
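- `working_dir` is the directory in which the statistics files are saved.
- `content_provider` lists the content providers by which the dataset is sorted and for which identifiers are extracted.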

In [None]:
# Machine-specific configuration shared by the later steps.

# OpenAIRE Graph Dataset dumps to process, each mapped to its total number of
# lines. Uncomment an entry to include that version. From step 2 onwards only
# the last processed entry is used, so keep the newest version at the bottom.
openaire_graph_datasets = {
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 1.0/dataset.json": 13_754_884,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 2.0/dataset.json": 13_754_884,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 3.0/dataset.json": 14_228_362,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 4.0/dataset.json": 15_272_516,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 4.1/dataset.json": 15_606_225,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 5.0.0/dataset.json": 18_958_506,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 6.0.0/dataset.json": 58_576_399,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 7.0.0/dataset.json": 59_998_862,
    #"/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 8.0.0/dataset.json": 61_346_274,
    "/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 9.0.1/dataset.json": 73_443_566,
}

# Directory that receives the statistics files (created in step 1).
working_dir = Path("statistics")

# Content providers the dataset is sorted by; the order is significant because
# step 2 addresses the sorted outputs by position in this list.
content_provider = ["dryad", "figshare", "zenodo"]

### 1. Sort OpenAIRE Graph Dataset by content provider

**Attention:** The statistics output file is overwritten on every run. If a new run is carried out with fewer input files, the statistics of the omitted files are lost.

In [None]:
# Step 1: sort every configured dataset by content provider and persist the
# per-dataset statistics as JSON in `working_dir`.
results1 = []  # one [sorted output paths, statistics] entry per dataset
statistics1 = {}  # dict contains all providers for the files and which providers have been assigned to the searched content providers
statistics1_path = working_dir.joinpath("1_statistics.json")
working_dir.mkdir(parents=True, exist_ok=True)

# Read the statistics of the previous run ONCE, before the file is rewritten
# below. Re-reading the file inside the loop would only see the datasets
# already handled in this run, so a later cached dataset would fail with a
# KeyError although its statistics had been stored by an earlier run.
try:
    with open(statistics1_path, "r") as file:
        statistics1_previous = json.load(file)
except FileNotFoundError:
    statistics1_previous = {}

for dataset_path, number_of_lines in openaire_graph_datasets.items():
    output_input_sorted, statistics = sort_by_provider(
        dataset_path, number_of_lines, content_provider,
    )
    if not statistics:
        # The helper returned no fresh statistics (presumably because the
        # sorted output already existed -- confirm in
        # helper_openaire_graph_dataset); fall back to the stored ones.
        if dataset_path not in statistics1_previous:
            raise KeyError(
                f"No statistics were returned for {dataset_path} and none "
                f"are stored in {statistics1_path}"
            )
        statistics = statistics1_previous[dataset_path]

    results1.append([output_input_sorted, statistics])
    statistics1[dataset_path] = statistics

    # Save after every dataset so that finished work is not lost if a later
    # dataset fails. Only datasets of the current run end up in the file.
    with open(statistics1_path, "w") as file:
        json.dump(statistics1, file, indent=4)
    print("Statistics saved in", statistics1_path.absolute())

### 2. Extract identifier from sorted OpenAIRE Graph Dataset for content provider

From now on, only the latest version of the OpenAIRE Graph Dataset will be used (last entry from `results1`).

In [None]:
# Step 2: gather the identifiers of every content provider from the most
# recently sorted dataset. Each provider gets a pickle file next to its sorted
# dataset, which is reused when it already exists.
results2 = []  # one [identifier pickle path, identifiers] entry per provider

for index, name in enumerate(content_provider):
    # The sorted outputs of the last dataset are addressed by provider position.
    extract_dataset_path = results1[-1][0][index]
    extract_identifier_path = extract_dataset_path + "_identifiers.pickle"

    if Path(extract_identifier_path).is_file():
        # Reuse the identifiers extracted by an earlier run.
        # NOTE: pickle is only safe here because the file is produced locally.
        print(f"Output file {extract_identifier_path} already exist.")
        with open(extract_identifier_path, "rb") as f:
            results2.append([extract_identifier_path, pickle.load(f)])
        continue

    identifiers = get_identifier(name, extract_dataset_path)
    results2.append([extract_identifier_path, identifiers])

    with open(extract_identifier_path, "wb") as f:
        pickle.dump(identifiers, f)

### 3. Shuffle identifier

In [None]:
# Step 3: shuffle each provider's identifiers reproducibly and store the
# result next to the unshuffled pickle. Existing output files are kept.
for extract_identifier_path, identifiers in results2:
    extract_identifier_shuffle_path = extract_identifier_path + "_shuffled.pickle"

    if Path(extract_identifier_shuffle_path).is_file():
        print(f"Output file {extract_identifier_shuffle_path} already exist.")
        continue

    # Re-seed before every shuffle so the resulting order does not depend on
    # how many providers were shuffled before this one.
    random.seed(74292449775793935952472534943397)
    random.shuffle(identifiers)  # in place: the list held in results2 changes too

    with open(extract_identifier_shuffle_path, "wb") as output_file:
        pickle.dump(identifiers, output_file)

    print("Shuffled identifier saved in", extract_identifier_shuffle_path)

### 4. Metadata harvesting

Threshold value must be set in `threaded_metadata_harvester.py` in line 79.

API keys can optionally be provided per content provider in `access_token`.

In [None]:
# Inputs and outputs of the metadata harvester (machine-specific paths).

# Files written by the harvester: its checkpoint and the TinyDB database.
checkpoint_path = "/home/lars/FINAL_checkpoint.pickle"
tinydb_path = "/home/lars/FINAL_metadata_db.json"

# Optional access tokens per content provider; uncomment and fill in to use.
access_token = {
    # "dryad": "",
    # "figshare": "",
    # "zenodo": "",
}

# Shuffled identifier pickles per content provider. The file names carry the
# suffixes appended in steps 2 and 3.
files = {
    "dryad": "/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 9.0.1/dataset.json_auszug_dryad.json_identifiers.pickle_shuffled.pickle",
    "figshare": "/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 9.0.1/dataset.json_auszug_figshare.json_identifiers.pickle_shuffled.pickle",
    "zenodo": "/run/media/lars/522aa7ef-7821-427b-9ded-fd117f17f316/bachelor-thesis/OpenAIRE Graph Dataset 9.0.1/dataset.json_auszug_zenodo.json_identifiers.pickle_shuffled.pickle",
}

In [None]:
# Start the harvester with the shuffled identifier files, the checkpoint
# path, the TinyDB path and the (possibly empty) access tokens.
# NOTE(review): presumably resumes from the checkpoint file if one exists --
# confirm in threaded_metadata_harvester.py.
metadata_harvester(files, checkpoint_path, tinydb_path, access_token)

### 5. Convert TinyDB to SQLite3

In [None]:
# Step 5: convert the harvested TinyDB file(s) into a single SQLite3 database.
# Further TinyDB files can be appended to the list.
tinydb_paths = [tinydb_path]

# The SQLite3 file is named after the first TinyDB file, with the suffix
# swapped for ".sqlite3"; later steps expect the path as a plain string.
sqlite_path = Path(tinydb_paths[0]).with_suffix(".sqlite3")
sqlite_path = str(sqlite_path)

convert_tinydb_to_sqlite3(tinydb_paths, sqlite_path)

### 6. Calculate statistics

In [None]:
# Run the statistics helper on the SQLite3 database created in step 5.
# NOTE(review): what is calculated and where results go is defined in
# helper_quantile -- not visible here.
calculate_stats(sqlite_path)

### 7. Download and Analysis

The analysis program is executed in a shell here, because the argument `-u` must be passed to Python to disable buffering of the standard output.

Hardcoded variables in `threaded_dataset_analysis.py` (line number: variable):
- 695: SQLite3 path
- 628: Dataset size limit
- 399: Threshold time
- 50: Temp dir

In [None]:
# IPython shell escape: run the analysis script with unbuffered output (-u).
# Everything after the "#" on the command line is disabled; moving the
# redirection in front of the "#" would additionally write the output to the
# given log file via tee.
!python -u "threaded_dataset_analysis.py"  # 2>&1 | tee '/home/lars/FINAL_metadata_db.sqlite3.log'

### 8. Convert to Geopackage

Creates geopackage file in new subdir `gpkg` of SQLite3.

In [None]:
# Export in "bbox" mode; the returned value is kept because step 9 uses it as
# the GeoPackage that contains the bounding boxes.
gpkg_path = create_geopackage(sqlite_path, "bbox")
# Optional "bbox" export with a filter value of 0.02.
# NOTE(review): the meaning of `filter` is defined in helper_geopackage -- confirm.
# create_geopackage(sqlite_path, "bbox", filter=0.02)
# Second export in "center" mode; its return value is not needed afterwards.
create_geopackage(sqlite_path, "center")

### 9. Count BBoxes

In [None]:
# Step 9: count bounding boxes against reference layers.

# Reference GeoPackages mapped to a name each (grids of 30/15/10 degrees and
# the Natural Earth countries).
# NOTE(review): the values look like layer names -- confirm in helper_geopackage.
geopackage_to_count = {
    "/home/lars/Dokumente/Studium2/Bachelorarbeit_/visualization/grid_30_deg.gpkg": "grid_30_deg",
    "/home/lars/Dokumente/Studium2/Bachelorarbeit_/visualization/grid_15_deg.gpkg": "grid_15_deg",
    "/home/lars/Dokumente/Studium2/Bachelorarbeit_/visualization/grid_10_deg.gpkg": "grid_10_deg",
    "/home/lars/Dokumente/Studium2/Bachelorarbeit_/visualization/natural earth/ne_10m_admin_0_countries.gpkg": "ne_10m_admin_0_countries",
}

# GeoPackage produced by the "bbox" export in step 8.
geopackage_with_bbox = {gpkg_path: "bounding_box"}

count_bboxes(geopackage_with_bbox, geopackage_to_count)

### 10. Convert to CSV

In [None]:
# Export the SQLite3 database created in step 5 to CSV.
# NOTE(review): output location and layout are defined in
# helper_convert_sqlite3_to_csv -- not visible here.
create_csv(sqlite_path)