<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/schemaOrg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
import pandas as pd

## Link to CSV
src = "https://schema.org/version/latest/schemaorg-current-https-types.csv"

# Create the DataFrame from all the retrieved records
df = pd.read_csv(src)

# Get unique values from the 'subTypeOf' column and remove blanks
t_df = df['subTypeOf'].unique()
t_df = t_df[~pd.isnull(t_df)]
t_df = t_df[t_df != '']

# Create a new DataFrame with t_df as the first column
new_df = pd.DataFrame({'subTypeOf': t_df})

# Perform a left join, keeping only the "id" column from df
merged_df = pd.merge(new_df, df[['subTypeOf', 'id', 'label']], on='subTypeOf', how='left')

print(merged_df)

# Perform a left join, keeping only the "id" column from df
final_df = pd.merge(merged_df, df[['id', 'label']], left_on ="subTypeOf",right_on='id', how='left')
final_df= final_df.drop(columns=['id_y'])
final_df = final_df.rename(columns={'subTypeOf':'sourcetype','label_x': 'class', 'label_y':'type','id_x':'sourceclass'})

print(final_df)

final_df['class'] = final_df['class'].apply(lambda x: f"\"[[{x}]]\"" if pd.notna(x) else x)
# Group by 'sourcetype' and join 'class' with a comma
grouped_df = final_df.groupby('type')['class'].apply(lambda x: ', '.join(x.astype(str))).reset_index()

grouped_df["class"] = grouped_df["class"].apply(lambda x: f"[{x}]" if pd.notna(x) else x)

# Print the final grouped DataFrame
print(grouped_df)


                                             subTypeOf  \
0                       https://schema.org/MediaObject   
1                       https://schema.org/MediaObject   
2                       https://schema.org/MediaObject   
3                       https://schema.org/MediaObject   
4                       https://schema.org/MediaObject   
..                                                 ...   
918                    https://schema.org/ChooseAction   
919                       https://schema.org/UseAction   
920            https://schema.org/SizeGroupEnumeration   
921           https://schema.org/SizeSystemEnumeration   
922  https://schema.org/EducationalOccupationalProgram   

                                                   id  \
0                          https://schema.org/3DModel   
1                      https://schema.org/AudioObject   
2                     https://schema.org/DataDownload   
3                      https://schema.org/ImageObject   
4                 

In [81]:
import os
# Create a new directory for Markdown files with a unique name
dir_name = "markdown_files"
i = 1
while os.path.exists(dir_name):
    dir_name = f"markdown_files_{i}"
    i += 1
os.makedirs(dir_name)

alias_start = 2025010312471  # Initial alias value

# Function to create markdown content with bullet points
def create_markdown_content(row, alias_start): # Add alias_start as parameter
    markdown_content = f"---\ntitle: {row['type']}\ntags:\n- schemaorg\naliases: {alias_start}\n"
    for column in grouped_df.columns:
        if column not in ['type']:
            value = row[column]
            # Check if value is a Series and handle it appropriately
            if isinstance(value, pd.Series):
                # Use any() to check if any value in the Series is not NA and not empty string
                if value.notna().any() and value.astype(str).str.strip().ne('').any():
                    markdown_content += f"{column}: {','.join(value.astype(str).tolist())}\n"
            # If not a Series, use pd.notna as before
            elif pd.notna(value) and value != '':
                markdown_content += f"{column}: {value}\n"
    markdown_content += "---"
    return markdown_content # Return only the markdown content


def create_markdown_file(row, alias_start): # Add alias_start as parameter
    file_name = os.path.join(dir_name, f"{row['type']}.md")
    with open(file_name, "w") as f:
        markdown_content = create_markdown_content(row, alias_start) # Call with alias_start
        f.write(markdown_content)
    return alias_start + 1  # Return the incremented alias_start for the next row

# Apply with alias_start and accumulate the updated alias_start
alias_start = grouped_df.apply(lambda row: create_markdown_file(row, alias_start), axis=1).iloc[-1]


print(f"Markdown files created successfully in directory: {dir_name}")

Markdown files created successfully in directory: markdown_files_3


In [83]:
import zipfile
# Zip the output folder
zip_file_name = f"{dir_name}.zip"
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for root, _, files in os.walk(dir_name):  # change ufolder_name to dir_name
        for file in files:
            zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), dir_name))  # change ufolder_name to dir_name

print(f"Output files zipped to: {zip_file_name}")

Output files zipped to: markdown_files_3.zip
