<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/GCThesaurus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import hashlib
import json
import os
import re
import unicodedata
import zipfile

import pandas as pd
import yaml

In [105]:
## URL of the source CSV
src = "https://canada.multites.net/cst/EAEAD1E6-7DD2-4997-BE7F-40BFB1CBE8A2/CST20240911.csv"

# Read the file and label its three columns in a single chained expression.
# NOTE(review): read_csv takes the first line of the file as a header row, and
# that header is then replaced by the names below. Confirm the file really
# starts with a header line; if it does not, its first record is discarded.
df = pd.read_csv(src).set_axis(['subject', 'predicate', 'object'], axis=1)

# Preview the first rows, then the non-null count of every column
print(df.head())
print(df.count())

                          subject predicate                object
0               2-spirited people       Use     Two-spirit people
1               2019-nCoV disease       Use  Coronavirus diseases
2  2019 novel coronavirus disease       Use  Coronavirus diseases
3                 2SLGBTQ+ people       Use      2SLGBTQI+ people
4                2SLGBTQI+ people    French    Personne 2ELGBTQI+
subject      22146
predicate    22146
object       22146
dtype: int64


In [106]:
# Assign identifiers only once. This cell rebinds `df` to a frame sorted by
# longTitle, so renumbering the rows on a second run of the cell would give
# every row a different identifier than it received on the first run.
if 'identifier' not in df.columns:
    # drop=True prevents the old index from being inserted as an 'index' or 'level_0' column
    df = df.reset_index(drop=True)
    # The identifier is the row's position in the loaded file, stored as text
    df['identifier'] = df.index.astype(str)

# Delimiters wrapped around "<identifier> <subject>"
prefix = "[["
suffix = "]]"

# Vectorised string concatenation replaces the row-wise apply; astype(str)
# formats each value the same way the f-string did (a missing value becomes "nan").
df['linkedsubject'] = prefix + df['identifier'] + " " + df['subject'].astype(str) + suffix

# Prefix each predicate with a level-2 Markdown heading marker
heading2 = "## "
df['h2'] = heading2 + df['predicate'].astype(str)

# The object of each row becomes the long title; sort by it in descending order
df = df.rename(columns={'object': 'longTitle'}).sort_values(by='longTitle', ascending=False)

# Display the modified DataFrame
print(df.head())

               subject predicate           longTitle identifier  \
1475         Art works    French         Œuvre d'art       1475   
10161     Human beings    French         Être humain      10161   
19251    Sports events    French   Événement sportif      19251   
4635   Cultural events    French  Événement culturel       4635   
19712    Tax avoidance    French    Évitement fiscal      19712   

                  linkedsubject         h2  
1475         [[1475 Art works]]  ## French  
10161    [[10161 Human beings]]  ## French  
19251   [[19251 Sports events]]  ## French  
4635   [[4635 Cultural events]]  ## French  
19712   [[19712 Tax avoidance]]  ## French  


In [107]:
# Keep only longTitle, linkedsubject and h2 for the grouping step
cleandf = df.drop(columns=['subject', 'predicate', 'identifier'])
print(cleandf.head())

                longTitle             linkedsubject         h2
1475          Œuvre d'art        [[1475 Art works]]  ## French
10161         Être humain    [[10161 Human beings]]  ## French
19251   Événement sportif   [[19251 Sports events]]  ## French
4635   Événement culturel  [[4635 Cultural events]]  ## French
19712    Évitement fiscal   [[19712 Tax avoidance]]  ## French


In [108]:
# One row per (longTitle, h2) pair, with that pair's links joined by ", ".
# NOTE(review): grouped_df is not read again in the cells visible here;
# confirm whether it is still needed.
grouped_df = (
    cleandf
    .groupby(['longTitle', 'h2'])['linkedsubject']
    .apply(', '.join)
    .reset_index()
)


def _join_links(links):
    """Join the non-null entries of one pivot cell into a comma-separated string."""
    return ', '.join(links.dropna().astype(str))


# Wide layout: one row per longTitle, one column per h2 heading, each cell
# holding the joined links. reset_index turns 'longTitle' back into a column.
pivoted_df = cleandf.pivot_table(
    index='longTitle',
    columns='h2',
    values='linkedsubject',
    aggfunc=_join_links,
).reset_index()

# Letters that Unicode normalisation cannot split into an ASCII base letter plus
# accent. Without this table the ASCII encode step below deletes them outright
# (for example "Œuvre" would come out as "uvre").
_LIGATURES = str.maketrans({
    'Œ': 'OE',
    'œ': 'oe',
    'Æ': 'AE',
    'æ': 'ae',
    'ß': 'ss',
})


def clean_title(text):
    """Reduce a title to at most 50 characters of ASCII letters, digits and spaces.

    Args:
        text: The title string to clean.

    Returns:
        The cleaned string: ligatures spelled out, accents stripped, every
        character other than a-z, A-Z, 0-9 and space removed, then cut to the
        first 50 characters.
    """
    # Spell out ligatures first so they survive the ASCII conversion
    text = text.translate(_LIGATURES)
    # NFKD separates accents from their base letters; encoding to ASCII with
    # errors ignored then drops the accents and any remaining non-ASCII character
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    # Remove everything except letters, digits and spaces
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
    # Truncate to 50 characters if longer
    return text[:50]

# Derive the 'title' column by cleaning each 'longTitle' value
pivoted_df['title'] = pivoted_df['longTitle'].map(clean_title)

print(pivoted_df)

h2                                            longTitle ## Broader Term  \
0     "Bicycle paths" replaces "Cycling trails" as p...             NaN   
1     "Biochemicals" replaces "Biochemical products"...             NaN   
2     "Coasts" replaces "Coastlands" as preferred te...             NaN   
3     "Cultural groups" replaces "Cultural minoritie...             NaN   
4     "Demining" replaces "Mine clearing" as preferr...             NaN   
...                                                 ...             ...   
7583                                   Évitement fiscal             NaN   
7584                                 Événement culturel             NaN   
7585                                  Événement sportif             NaN   
7586                                        Être humain             NaN   
7587                                        Œuvre d'art             NaN   

h2                   ## French  \
0                          NaN   
1                          NaN 

In [89]:
# Pick a directory name that does not exist yet:
# "csv_files", then "csv_files_1", "csv_files_2", ...
output_dir = "csv_files"
attempt = 0
while os.path.exists(output_dir):
    attempt += 1
    output_dir = f"csv_files_{attempt}"
os.makedirs(output_dir)

# Write one CSV per distinct title; rows sharing a title go into the same file
for title, rows in pivoted_df.groupby('title'):
    rows.to_csv(os.path.join(output_dir, f"{title}.csv"), index=False)

print(f"CSV files exported to: {output_dir}")

CSV files exported to: csv_files_5


In [102]:
# Pick a Markdown output directory that does not exist yet:
# "markdown_files", then "markdown_files_1", "markdown_files_2", ...
dir_name = "markdown_files"
attempt = 0
while os.path.exists(dir_name):
    attempt += 1
    dir_name = f"markdown_files_{attempt}"
os.makedirs(dir_name)

def create_markdown_file(row, output_dir=None):
    """Write one row as "<title>.md" with YAML front matter and one section per column.

    Args:
        row: A pandas Series holding 'title', 'longTitle' and any number of
            further entries whose labels are used as section headings.
        output_dir: Directory to write into. When omitted, the notebook-level
            `dir_name` is used.

    Returns:
        None. The file is written as a side effect. It is opened in "w" mode,
        so a row whose title matches an earlier row replaces that row's file.
    """
    if output_dir is None:
        output_dir = dir_name
    title = row['title']
    file_name = os.path.join(output_dir, f"{title}.md")

    # json.dumps produces a double-quoted string that is also a valid YAML
    # scalar, so quotes or colons inside longTitle cannot break the front matter.
    quoted_title = json.dumps(str(title), ensure_ascii=False)
    quoted_alias = json.dumps(str(row['longTitle']), ensure_ascii=False)
    markdown_content = f"---\ntitle: {quoted_title}\nalias: {quoted_alias}\n---\n\n"

    # Iterate over the row's own labels rather than a global frame's columns
    for column in row.index:
        if column not in ['title', 'longTitle'] and pd.notna(row[column]) and row[column] != '':
            markdown_content += f"{column}\n\n{row[column]}  \n\n"

    # Explicit UTF-8 so accented text is written the same way on every platform
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(markdown_content)

# Write one Markdown file for every row of the pivoted frame
for _, note_row in pivoted_df.iterrows():
    create_markdown_file(note_row)

print(f"Markdown files created successfully in directory: {dir_name}")

Markdown files created successfully in directory: markdown_files_19


In [79]:
# Helper that archives a directory tree into a single zip file
def zip_directory(directory_path, zip_file_path):
    """Write every file found under `directory_path` into a new zip archive.

    Args:
        directory_path: Root directory whose files are archived.
        zip_file_path: Path of the zip file to create (opened in 'w' mode).

    Entries are stored under their path relative to `directory_path`.
    """
    with zipfile.ZipFile(zip_file_path, 'w') as archive:
        for folder, _subdirs, names in os.walk(directory_path):
            for name in names:
                source = os.path.join(folder, name)
                # Store the path relative to the root so the archive does not
                # repeat the directory's own location
                relative = os.path.relpath(source, directory_path)
                archive.write(source, arcname=relative)

# Archive the Markdown output directory; the archive name is defined once and
# reused in the confirmation message
zip_path = 'markdown_files.zip'
zip_directory(dir_name, zip_path)

print(f"Markdown files zipped to: {zip_path}")

Markdown files zipped to: markdown_files.zip
