<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/GCThesaurus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
import zipfile
import yaml
import re
import unicodedata
import hashlib

In [None]:
## Location of the thesaurus CSV export

src = "https://canada.multites.net/cst/EAEAD1E6-7DD2-4997-BE7F-40BFB1CBE8A2/CST20240911.csv"

# Load the records and label the three columns as a subject/predicate/object triple.
# NOTE(review): read_csv treats the first line of the file as a header row, and
# that header is then replaced by the names below. If the export has no header
# line, its first record is being dropped here -- TODO confirm against the file.
df = pd.read_csv(src).set_axis(['subject', 'predicate', 'object'], axis=1)

# Preview the first rows
print(df.head())

                          subject predicate                object
0               2-spirited people       Use     Two-spirit people
1               2019-nCoV disease       Use  Coronavirus diseases
2  2019 novel coronavirus disease       Use  Coronavirus diseases
3                 2SLGBTQ+ people       Use      2SLGBTQI+ people
4                2SLGBTQI+ people    French    Personne 2ELGBTQI+


In [None]:
# Delimiters wrapped around every subject (double square brackets)
prefix = "[["
suffix = "]]"

# Marker placed in front of every predicate (Markdown level-2 heading)
heading2 = "## "

# Build the decorated columns; format(value) renders each value exactly as an f-string would
df['linkedsubject'] = df['subject'].map(lambda subject: prefix + format(subject) + suffix)
df['h2'] = df['predicate'].map(lambda predicate: heading2 + format(predicate))

# The 'object' column becomes 'longTitle'; rows are then ordered by it, descending
df = df.rename(columns={'object': 'longTitle'})
df = df.sort_values(by='longTitle', ascending=False)

# Preview the modified DataFrame
print(df.head())

               subject predicate           longTitle        linkedsubject  \
1475         Art works    French         Œuvre d'art        [[Art works]]   
10161     Human beings    French         Être humain     [[Human beings]]   
19251    Sports events    French   Événement sportif    [[Sports events]]   
4635   Cultural events    French  Événement culturel  [[Cultural events]]   
19712    Tax avoidance    French    Évitement fiscal    [[Tax avoidance]]   

              h2  
1475   ## French  
10161  ## French  
19251  ## French  
4635   ## French  
19712  ## French  


In [None]:
# Keep only longTitle and the two decorated columns; drop the raw subject/predicate
cleandf = df.drop(columns=['subject', 'predicate'])
print(cleandf.head())

                longTitle        linkedsubject         h2
1475          Œuvre d'art        [[Art works]]  ## French
10161         Être humain     [[Human beings]]  ## French
19251   Événement sportif    [[Sports events]]  ## French
4635   Événement culturel  [[Cultural events]]  ## French
19712    Évitement fiscal    [[Tax avoidance]]  ## French


In [None]:
# For every (longTitle, h2) pair, merge its linked subjects into one comma-separated string
subjects_by_pair = cleandf.groupby(['longTitle', 'h2'])['linkedsubject']
grouped_df = subjects_by_pair.apply(', '.join).reset_index()

# Reshape to one row per longTitle, with one column per h2 heading
pivoted_df = grouped_df.pivot(
    index='longTitle',
    columns='h2',
    values='linkedsubject',
).reset_index()

# Latin ligatures have no Unicode decomposition, so NFKD leaves them intact and
# the ASCII encode step would silently delete them ("Œuvre" -> "uvre").
# Spell them out before normalising.
_LIGATURE_MAP = str.maketrans({"Œ": "OE", "œ": "oe", "Æ": "AE", "æ": "ae"})


def clean_title(text):
    """Return an ASCII-only, shortened version of ``text`` for use as a title.

    Steps, in order:
      1. Ligatures (Œ, œ, Æ, æ) are replaced by their two-letter spelling.
      2. Accents are stripped by NFKD-normalising and discarding every
         non-ASCII code point.
      3. Every character other than ASCII letters, digits and spaces is removed.
      4. The result is truncated to at most 50 characters.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Distinct inputs can map to the same output
        (punctuation is removed and long values are truncated).
    """
    text = text.translate(_LIGATURE_MAP)  # Transliterate ligatures first
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')  # Remove accents
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)  # Remove special characters except spaces
    return text[:50]  # Truncate to 50 characters if longer

# Derive the short, cleaned 'title' from each row's 'longTitle'
pivoted_df['title'] = pivoted_df['longTitle'].map(clean_title)

print(pivoted_df)

h2                                            longTitle ## Broader Term  \
0     "Bicycle paths" replaces "Cycling trails" as p...             NaN   
1     "Biochemicals" replaces "Biochemical products"...             NaN   
2     "Coasts" replaces "Coastlands" as preferred te...             NaN   
3     "Cultural groups" replaces "Cultural minoritie...             NaN   
4     "Demining" replaces "Mine clearing" as preferr...             NaN   
...                                                 ...             ...   
7583                                   Évitement fiscal             NaN   
7584                                 Événement culturel             NaN   
7585                                  Événement sportif             NaN   
7586                                        Être humain             NaN   
7587                                        Œuvre d'art             NaN   

h2              ## French                               ## History note  \
0                     Na

In [None]:
# Pick a directory name that is not taken yet: csv_files, csv_files_1, csv_files_2, ...
output_dir = "csv_files"
i = 1
while os.path.exists(output_dir):
    output_dir, i = f"csv_files_{i}", i + 1
os.makedirs(output_dir)

# Write one CSV per distinct title; rows that share a title land in the same file
for title, group_data in pivoted_df.groupby('title'):
    group_data.to_csv(os.path.join(output_dir, f"{title}.csv"), index=False)

print(f"CSV files exported to: {output_dir}")

CSV files exported to: csv_files_7


In [110]:
# Choose an unused directory for the Markdown output:
# markdown_files, markdown_files_1, markdown_files_2, ...
dir_name = "markdown_files"
i = 1
while os.path.exists(dir_name):
    dir_name, i = f"markdown_files_{i}", i + 1
os.makedirs(dir_name)

def create_markdown_file(row):
    """Write one Markdown note for a single row of the pivoted table.

    The note is saved as ``<title>.md`` inside the module-level ``dir_name``
    directory. It starts with a YAML front-matter block holding ``title`` and
    ``alias`` (the row's ``longTitle``), followed by one section per remaining
    entry of the row that has a value: the entry's label on its own line, then
    the value as a single bullet.

    Args:
        row: A pandas Series for one row, containing at least the ``title``
            and ``longTitle`` entries.

    Note:
        Rows that share the same ``title`` resolve to the same path, so a
        later row replaces the file written by an earlier one.
    """
    title = row['title']
    file_name = os.path.join(dir_name, f"{title}.md")

    # Let the YAML library serialise the front matter: long titles can start
    # with a quote or contain ': ' / ' #', which yields invalid YAML when the
    # raw text is interpolated. Plain values are emitted exactly as before.
    front_matter = yaml.safe_dump(
        {'title': str(title), 'alias': str(row['longTitle'])},
        default_flow_style=False,
        allow_unicode=True,    # keep accented characters readable
        sort_keys=False,       # keep 'title' ahead of 'alias'
        width=float("inf"),    # never fold a long value across lines
    )
    markdown_content = f"---\n{front_matter}---\n\n"

    # Use the row's own labels rather than a global frame's columns, so the
    # function only depends on what it is given.
    for column in row.index:
        if column not in ['title', 'longTitle'] and pd.notna(row[column]) and row[column] != '':
            markdown_content += f"{column}\n\n- {row[column]}\n\n"

    # Explicit UTF-8 so accented text is written identically on every platform
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(markdown_content)

# Emit one Markdown file per row of the pivoted table
for row_label, pivot_row in pivoted_df.iterrows():
    create_markdown_file(pivot_row)

print(f"Markdown files created successfully in directory: {dir_name}")

Markdown files created successfully in directory: markdown_files_8


In [111]:
# Bundle a directory tree into a zip archive
def zip_directory(directory_path, zip_file_path):
    """Write every file found under ``directory_path`` into a new zip archive.

    The archive at ``zip_file_path`` is opened in write mode. Each file is
    stored under its path relative to ``directory_path``. Only files are
    added; directories get no entries of their own.
    """
    with zipfile.ZipFile(zip_file_path, 'w') as archive:
        for folder, _subdirs, file_names in os.walk(directory_path):
            for name in file_names:
                source = os.path.join(folder, name)
                archive.write(source, arcname=os.path.relpath(source, directory_path))

# Archive the Markdown output directory chosen earlier
zip_directory(dir_name, 'markdown_files.zip')

print("Markdown files zipped to: markdown_files.zip")

Markdown files zipped to: markdown_files.zip
