In [19]:
import pandas as pd
from tqdm import tqdm
from glob import glob
import logging
import os
import re
from sklearn.model_selection import train_test_split

In [20]:
def get_files_df(INPUT_GRAPHS_FOLDER, INFERENCE_GRAPHS_FOLDER):
    logging.info("Creating dataframe for OWL2DL-1 input/inference pairs")
    rdf_files = []
    for input_graph_path in tqdm(sorted(glob(INPUT_GRAPHS_FOLDER + "*"))):
        input_graph_file = os.path.basename(input_graph_path)
        inference_path = INFERENCE_GRAPHS_FOLDER + input_graph_file
        graph_type = input_graph_path.split('\\')[-1]
        rdf_pair = {"input_graph_file": input_graph_path, "inference_file": inference_path, "graph_type": graph_type}
        rdf_files.append(rdf_pair)
    files_df = pd.DataFrame.from_dict(rdf_files)
    return files_df

In [21]:
files_df = get_files_df('MyJenaProject/input/', 'MyJenaProject/output/')

100%|██████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 2960.69it/s]


In [25]:
files_df

Unnamed: 0,input_graph_file,inference_file,graph_type
0,MyJenaProject/input\P0.owl,MyJenaProject/output/P0.owl,P0.owl
1,MyJenaProject/input\P1.owl,MyJenaProject/output/P1.owl,P1.owl
2,MyJenaProject/input\P2.owl,MyJenaProject/output/P2.owl,P2.owl


In [22]:
def train_validate_test_split(df, train_percent=0.6, validate_percent=0.2, stratify=None, seed=1):
    val_test_percent = 1 - train_percent
    test_percent = (1 - (train_percent + validate_percent))
    test_percent = test_percent / (test_percent + validate_percent)
    if stratify:
        df_train, df_val_test = train_test_split(df, test_size=val_test_percent, random_state=seed,
                                                 stratify=df[stratify])
        df_val, df_test = train_test_split(df_val_test, test_size=test_percent, random_state=seed,
                                           stratify=df_val_test[stratify])
    else:
        df_train, df_val_test = train_test_split(df, test_size=val_test_percent, random_state=seed)
        df_val, df_test = train_test_split(df_val_test, test_size=test_percent, random_state=seed)
    return df_train, df_val, df_test

In [23]:
rdf_data_train, rdf_data_val, rdf_data_test = train_validate_test_split(files_df,
                                                                        train_percent=0.6,
                                                                        validate_percent=0.2,
                                                                        stratify="graph_type",
                                                                        seed=1)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [24]:
def write_nt_file(data, output_file):
    nt_files_orig = data['input_graph_file']
    nt_files_inferred = data['inference_file']
    nt_files = pd.concat([nt_files_orig, nt_files_inferred])

    with open(output_file, 'w') as outfile:
        for nt_file in nt_files:
            with open(nt_file, 'r') as infile:
                outfile.write(infile.read())
                outfile.write("\n")

In [16]:
write_nt_file(rdf_data_train, 'bin/OWL2DL-1_train.owl')
write_nt_file(rdf_data_val, 'bin/OWL2DL-1_val.owl')
write_nt_file(rdf_data_test, 'bin/OWL2DL-1_test.owl')