<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/GCThesaurus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import pandas as pd
import os
import zipfile
import yaml
import re
import unicodedata
import hashlib
from pickle import DEFAULT_PROTOCOL

def to_lower_camel_case(text):
    """Convert a whitespace-separated phrase to lowerCamelCase.

    The first word is lower-cased, every following word is title-cased, and
    the words are concatenated without separators, e.g. "Broader Term"
    becomes "broaderTerm".

    Args:
        text: Phrase whose words are separated by whitespace.

    Returns:
        The lowerCamelCase form of ``text``; an empty string when ``text``
        contains no words.
    """
    # split() with no argument splits on any run of whitespace and ignores
    # leading/trailing whitespace, so a stray leading space can no longer
    # produce an empty first word (which left the result capitalised).
    words = text.split()
    if not words:
        return ''
    first_word, *other_words = words
    return first_word.lower() + ''.join(word.title() for word in other_words)

In [24]:
## Link to CSV

src = "https://canada.multites.net/cst/EAEAD1E6-7DD2-4997-BE7F-40BFB1CBE8A2/CST20240911.csv"

# Load the three-column CSV and shape it in a single chain:
#   * name the columns (the third one, the "object", is used as longTitle),
#   * add the row position as a string 'identifier',
#   * sort by longTitle in descending order.
# NOTE(review): read_csv treats the first CSV row as a header by default and
# that header is then replaced by the names below -- confirm the file really
# starts with a header row, otherwise the first record is dropped.
df = (
    pd.read_csv(src)
    .set_axis(['subject', 'predicate', 'longTitle'], axis=1)
    .assign(identifier=lambda frame: frame.index.astype(str))
    .sort_values(by='longTitle', ascending=False)
)

# Predicates become lowerCamelCase (they are later used as column names)
df['predicate'] = df['predicate'].apply(to_lower_camel_case)

# Preview the first rows and the non-null count per column
print(df.head())
print(df.count())

               subject predicate           longTitle identifier
1475         Art works    french         Œuvre d'art       1475
10161     Human beings    french         Être humain      10161
19251    Sports events    french   Événement sportif      19251
4635   Cultural events    french  Événement culturel       4635
19712    Tax avoidance    french    Évitement fiscal      19712
subject       22146
predicate     22146
longTitle     22146
identifier    22146
dtype: int64


In [25]:
# Reduce a string to a short ASCII-only title
def clean_title(text):
    """Return an ASCII-only version of ``text``, at most 50 characters long.

    Steps, in order:
      1. NFKD-normalise and drop every non-ASCII code point, so accented
         letters keep their base letter ("É" -> "E") while characters with no
         ASCII decomposition (e.g. "Œ") disappear.
      2. Remove everything except ASCII letters, digits, spaces and dashes.
      3. Keep only the first 50 characters.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, possibly truncated string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    allowed_only = re.sub(r"[^a-zA-Z0-9 -]", "", ascii_only)
    return allowed_only[:50]

# Wrapper placed around every subject so it renders as a quoted wiki-link,
# e.g. "[[Art works]]"
prefix = "\"[["  # Prefix without identifier
suffix = "]]\""


def _format_subject_links(subjects):
    """Turn one group of subjects into quoted wiki-links, one bullet item each.

    Every subject is cleaned and wrapped on its own. Cleaning the already
    joined string (as was done before) stripped the commas and truncated the
    whole list to 50 characters, merging all subjects into one broken link.

    Args:
        subjects: pandas Series with the subjects of one (longTitle, predicate) group.

    Returns:
        The links joined with "\\n- ", so that a consumer writing "- " in
        front of the value gets one bullet per subject.
    """
    links = [
        f"{prefix}{clean_title(subject)}{suffix}"
        for subject in subjects.dropna().astype(str)
    ]
    return "\n- ".join(links)


# One row per (longTitle, predicate) pair holding the formatted subject links
grouped_df = (
    df.groupby(['longTitle', 'predicate'])['subject']
    .apply(_format_subject_links)
    .reset_index()
)

# Spread the predicates into columns. After the groupby above every
# (longTitle, predicate) cell holds exactly one value, so 'first' just picks it.
pivoted_df = grouped_df.pivot_table(
    index='longTitle',
    columns='predicate',
    values='subject',
    aggfunc='first',
)

# Reset index to make 'longTitle' a column again
pivoted_df = pivoted_df.reset_index()

# 'title' is the cleaned form of longTitle; later cells use it as a file name.
# NOTE(review): clean_title truncates to 50 characters, so distinct longTitle
# values can end up with the same title -- confirm that is acceptable.
pivoted_df['title'] = pivoted_df['longTitle'].apply(clean_title)

# Wrap the original long title in single quotes.
# NOTE(review): apostrophes inside the title are not escaped -- confirm the
# consumer of this column tolerates values such as 'Œuvre d'art'.
pivoted_df['longTitle'] = "'" + pivoted_df['longTitle'] + "'"

print(pivoted_df)

predicate                                          longTitle broaderTerm  \
0          '"Bicycle paths" replaces "Cycling trails" as ...         NaN   
1          '"Biochemicals" replaces "Biochemical products...         NaN   
2          '"Coasts" replaces "Coastlands" as preferred t...         NaN   
3          '"Cultural groups" replaces "Cultural minoriti...         NaN   
4          '"Demining" replaces "Mine clearing" as prefer...         NaN   
...                                                      ...         ...   
7583                                      'Évitement fiscal'         NaN   
7584                                    'Événement culturel'         NaN   
7585                                     'Événement sportif'         NaN   
7586                                           'Être humain'         NaN   
7587                                           'Œuvre d'art'         NaN   

predicate                 french                                historyNote  \
0       

In [26]:
# Pick the first directory name that is not taken yet:
# csv_files, csv_files_1, csv_files_2, ...
output_dir = "csv_files"
i = 1
while os.path.exists(output_dir):
    output_dir, i = f"csv_files_{i}", i + 1
os.makedirs(output_dir)

# Write one CSV per distinct title; rows sharing a title land in the same file
for csv_title, title_rows in pivoted_df.groupby('title'):
    title_rows.to_csv(os.path.join(output_dir, f"{csv_title}.csv"), index=False)

print(f"CSV files exported to: {output_dir}")

CSV files exported to: csv_files_1


In [27]:
# Create a fresh directory for the Markdown files. The first free name out of
# markdown_files, markdown_files_1, markdown_files_2, ... is used, so an
# existing directory is never reused.
dir_name = "markdown_files"
i = 1
while True:
    if not os.path.exists(dir_name):
        break
    dir_name = f"markdown_files_{i}"
    i += 1
os.makedirs(dir_name)

# Function to create markdown content with bullet points
def create_markdown_content(row):
    """Build the front-matter text for one row.

    The result starts with a "---" line, the row's title and a fixed
    "gccommon" tag, followed by one "<column>:" section with a "- <value>"
    line for every other non-empty field, and ends with a closing "---".

    The fields are read from the row itself (not from a global DataFrame), so
    the function works for any row-like mapping that has a 'title' entry.

    Args:
        row: pandas Series (or other mapping with ``.items()``) holding
            'title' plus any number of additional fields. 'longTitle' is
            never written out.

    Returns:
        The text as a single string without a trailing newline.
    """
    lines = ["---", f"title: {row['title']}", "tags:", "- gccommon"]
    for column, value in row.items():
        if column in ('title', 'longTitle'):
            continue
        # Skip fields that have no value for this row
        if pd.isna(value) or value == '':
            continue
        lines.append(f"{column}:")  # Heading for the section
        lines.append(f"- {value}")  # Add the bulleted content
    lines.append("---")
    return "\n".join(lines)

# Create the markdown files
def create_markdown_file(row):  # Define a function to create individual files
    """Write the Markdown file for one row into ``dir_name``.

    The file is named "<title>.md" and contains the text returned by
    create_markdown_content(row). An existing file with the same name is
    overwritten.

    Args:
        row: Row holding at least a 'title' entry.
    """
    file_name = os.path.join(dir_name, f"{row['title']}.md")
    # Explicit UTF-8 so the result does not depend on the platform's default encoding
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(create_markdown_content(row))

# Write one Markdown file per row of pivoted_df
for _row_label, markdown_row in pivoted_df.iterrows():
    create_markdown_file(markdown_row)

print(f"Markdown files created successfully in directory: {dir_name}")

Markdown files created successfully in directory: markdown_files_1


In [28]:
# Archive a whole directory tree into a single zip file
def zip_directory(directory_path, zip_file_path):
    """Write every file found under ``directory_path`` into a new zip archive.

    Files are stored under their path relative to ``directory_path``, so the
    directory itself does not appear as a prefix inside the archive. The
    archive is opened in 'w' mode, replacing any existing file at
    ``zip_file_path``.

    Args:
        directory_path: Directory whose files are archived (walked recursively).
        zip_file_path: Path of the zip file to create.
    """
    with zipfile.ZipFile(zip_file_path, 'w') as archive:
        source_paths = (
            os.path.join(folder, filename)
            for folder, _subdirs, filenames in os.walk(directory_path)
            for filename in filenames
        )
        for source_path in source_paths:
            archive.write(
                source_path,
                arcname=os.path.relpath(source_path, directory_path),
            )

# Zip the Markdown directory. The archive name is kept in one variable so the
# call and the message below cannot drift apart.
zip_path = 'markdown_files.zip'
zip_directory(dir_name, zip_path)

print(f"Markdown files zipped to: {zip_path}")

Markdown files zipped to: markdown_files.zip
