<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/GCThesaurus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [189]:
import pandas as pd
import os
import zipfile
import yaml
import re
import unicodedata
import hashlib

In [190]:
## Source: CSV export of the thesaurus (one relationship triple per line)
src = "https://canada.multites.net/cst/EAEAD1E6-7DD2-4997-BE7F-40BFB1CBE8A2/CST20240911.csv"

# Read the remote file and label its three columns in a single expression.
# NOTE(review): read_csv treats the first line of the file as a header row and
# the labels below then replace it -- confirm the CSV really starts with a
# header line, otherwise the first record is consumed as column names.
df = pd.read_csv(src).set_axis(['subject', 'predicate', 'object'], axis=1)

# Quick look at the first rows and the non-null count per column
print(df.head())
print(df.count())

                          subject predicate                object
0               2-spirited people       Use     Two-spirit people
1               2019-nCoV disease       Use  Coronavirus diseases
2  2019 novel coronavirus disease       Use  Coronavirus diseases
3                 2SLGBTQ+ people       Use      2SLGBTQI+ people
4                2SLGBTQI+ people    French    Personne 2ELGBTQI+
subject      22146
predicate    22146
object       22146
dtype: int64


In [191]:
# Start from a clean 0..n-1 index (drop=True discards the old index instead of
# turning it into an 'index' / 'level_0' column).
df = df.reset_index(drop=True)

# The positional index, as text, doubles as a per-row identifier.
df['identifier'] = df.index.astype(str)

# "<identifier> <subject>" for every row, built column-wise rather than with a
# row-by-row DataFrame.apply.
df['linkedsubject'] = [
    f"{ident} {subject}"
    for ident, subject in zip(df['identifier'], df['subject'])
]

# 'object' becomes the long title and 'predicate' the relationship label (h2);
# rows are then ordered by long title, descending.
df = df.rename(columns={'object': 'longTitle', 'predicate': 'h2'})
df = df.sort_values(by='longTitle', ascending=False)

def to_lower_camel_case(text):
    """Convert a string to lowerCamelCase.

    Every character that is not an ASCII letter or digit is treated as a word
    separator. All words are lower-cased; every word after the first is then
    capitalised and the words are concatenated.

    Args:
        text: The string to convert.

    Returns:
        The lowerCamelCase form of ``text``, or an empty string when ``text``
        contains no ASCII letters or digits (previously this raised
        ``IndexError``).
    """
    words = re.sub(r'[^a-zA-Z0-9]', ' ', text).lower().split()
    if not words:
        # Nothing but separators (or an empty string): there is no first word.
        return ''
    # The first word is already lower-case, so only the rest need capitalising.
    return words[0] + ''.join(word.capitalize() for word in words[1:])

# Convert every relationship label in 'h2' to lowerCamelCase in a single pass.
# Series.map applies the function element-wise, replacing the earlier
# iterrows() loop that wrote each row back through df.loc one at a time.
df['h2'] = df['h2'].map(to_lower_camel_case)

# Display the modified DataFrame (printed once; the original cell printed the
# same head twice).
print(df.head())

               subject      h2           longTitle identifier  \
1475         Art works  french         Œuvre d'art       1475   
10161     Human beings  french         Être humain      10161   
19251    Sports events  french   Événement sportif      19251   
4635   Cultural events  french  Événement culturel       4635   
19712    Tax avoidance  french    Évitement fiscal      19712   

              linkedsubject  
1475         1475 Art works  
10161    10161 Human beings  
19251   19251 Sports events  
4635   4635 Cultural events  
19712   19712 Tax avoidance  
               subject      h2           longTitle identifier  \
1475         Art works  french         Œuvre d'art       1475   
10161     Human beings  french         Être humain      10161   
19251    Sports events  french   Événement sportif      19251   
4635   Cultural events  french  Événement culturel       4635   
19712    Tax avoidance  french    Évitement fiscal      19712   

              linkedsubject  
1475    

In [192]:
# Keep only the relationship label, long title and linked subject.
cleandf = df.drop(columns=['subject', 'identifier'])
print(cleandf.head())

# List the distinct relationship labels present in the data.
print(cleandf['h2'].unique())

           h2           longTitle         linkedsubject
1475   french         Œuvre d'art        1475 Art works
10161  french         Être humain    10161 Human beings
19251  french   Événement sportif   19251 Sports events
4635   french  Événement culturel  4635 Cultural events
19712  french    Évitement fiscal   19712 Tax avoidance
['french' 'usedFor' 'broaderTerm' 'relatedTerm' 'use' 'narrowerTerm'
 'scopeNote' 'historyNote' 'subjectCategory']


In [193]:
def _join_linked_subjects(values):
    """Join the non-null linked subjects of one pivot cell with '; '."""
    return '; '.join(values.dropna().astype(str))

# One row per longTitle, one column per relationship label (h2); each cell
# holds the '; '-joined linked subjects for that pair. reset_index() turns
# 'longTitle' back into an ordinary column.
pivoted_df = (
    cleandf
    .pivot_table(
        index='longTitle',
        columns='h2',
        values='linkedsubject',
        aggfunc=_join_linked_subjects,
    )
    .reset_index()
)

# Ligatures have no Unicode decomposition, so NFKD leaves them intact and the
# ASCII encode step would silently drop them ("Œuvre" -> "uvre"). Spell them
# out before normalising.
_ASCII_LIGATURES = str.maketrans({
    "\u0152": "OE",  # Œ
    "\u0153": "oe",  # œ
    "\u00c6": "AE",  # Æ
    "\u00e6": "ae",  # æ
})

def clean_title(text, max_length=50):
    """Reduce a title to ASCII letters, digits and spaces, then truncate it.

    Args:
        text: The title to clean.
        max_length: Maximum number of characters kept (default 50, the value
            that was previously hard-coded).

    Returns:
        ``text`` with ligatures spelled out, accents stripped, every character
        other than ``a-z``, ``A-Z``, ``0-9`` and space removed, cut to at most
        ``max_length`` characters.
    """
    text = text.translate(_ASCII_LIGATURES)
    # NFKD splits accented letters into base letter + combining mark; encoding
    # to ASCII with errors ignored then discards the marks.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r"[^a-zA-Z0-9 ]", "", text)  # Remove special characters except spaces
    return text[:max_length]

# Derive the short ASCII 'title' from each 'longTitle'
long_titles = pivoted_df['longTitle']
pivoted_df['title'] = long_titles.map(clean_title)

print(pivoted_df)

h2                                            longTitle broaderTerm  \
0     "Bicycle paths" replaces "Cycling trails" as p...         NaN   
1     "Biochemicals" replaces "Biochemical products"...         NaN   
2     "Coasts" replaces "Coastlands" as preferred te...         NaN   
3     "Cultural groups" replaces "Cultural minoritie...         NaN   
4     "Demining" replaces "Mine clearing" as preferr...         NaN   
...                                                 ...         ...   
7583                                   Évitement fiscal         NaN   
7584                                 Événement culturel         NaN   
7585                                  Événement sportif         NaN   
7586                                        Être humain         NaN   
7587                                        Œuvre d'art         NaN   

h2                  french                                     historyNote  \
0                      NaN         4789 Cycling trails; 2016 Bicycle 

In [194]:
# Pick the first free directory name: csv_files, csv_files_1, csv_files_2, ...
output_dir = "csv_files"
suffix = 0
while os.path.exists(output_dir):
    suffix += 1
    output_dir = f"csv_files_{suffix}"
os.makedirs(output_dir)

# Write one CSV per distinct title, containing the rows that share that title
for title, group_data in pivoted_df.groupby('title'):
    group_data.to_csv(os.path.join(output_dir, f"{title}.csv"), index=False)

print(f"CSV files exported to: {output_dir}")

CSV files exported to: csv_files_17


In [195]:
def _as_wikilink_bullets(cell):
    """Turn a '; '-separated string into quoted [[wikilink]] bullet lines.

    Empty strings and the text 'nan' are returned unchanged.
    """
    if cell in ['', 'nan']:
        return cell
    return '- "[[' + cell.replace('; ', ']]"\n- "[[') + ']]"'

# Preformat every relationship column (everything except the two title
# columns) for Markdown links. Values are cast to str first, so missing cells
# become the text 'nan' and are left as-is by the helper.
link_columns = [name for name in pivoted_df.columns if name not in ['title', 'longTitle']]
for column in link_columns:
    pivoted_df[column] = pivoted_df[column].astype(str).apply(_as_wikilink_bullets)

print(pivoted_df.sort_values(by=['broaderTerm']))

h2                         longTitle  \
4092                       Livestock   
970                       Businesses   
6384                 Social problems   
356             Alternative medicine   
4999                      Parliament   
...                              ...   
2605  Federal territorial agreements   
2604                   Federal taxes   
2603                Federal services   
2601            Federal publications   
7587                     Œuvre d'art   

h2                                          broaderTerm  \
4092  - "[[10000 Hogs]]"\n- "[[2873 Cattle]]"\n- "[[...   
970   - "[[10010 Home-based businesses]]"\n- "[[8466...   
6384  - "[[10043 Homelessness]]"\n- "[[5375 Discrimi...   
356                            - "[[10049 Homeopathy]]"   
4999  - "[[10100 House of Commons]]"\n- "[[18525 Sen...   
...                                                 ...   
2605                                                nan   
2604                                           

In [196]:
# Pick the first free directory name for the Markdown output:
# markdown_files, markdown_files_1, markdown_files_2, ...
dir_name = "markdown_files"
attempt = 0
while os.path.exists(dir_name):
    attempt += 1
    dir_name = f"markdown_files_{attempt}"
os.makedirs(dir_name)

# Function to create markdown content with bullet points
def create_markdown_content(row):
    """Build the front-matter block written to one Markdown file.

    The block opens with ``---``, lists ``title`` and ``alias`` (taken from
    ``row['title']`` and ``row['longTitle']``), then one property per remaining
    entry of ``row`` whose value is not the text ``'nan'``, and closes with
    ``---``.

    ``title`` and ``alias`` are emitted as double-quoted scalars. Written bare,
    a long title that begins with a quotation mark or contains ``": "`` is not
    valid YAML.

    Args:
        row: A pandas Series with at least ``title`` and ``longTitle`` entries;
            the other entries are expected to be already formatted as bullet
            text.

    Returns:
        The front-matter text as a single string (no trailing newline).
    """
    import json  # local import: only needed for quoting the two scalars

    def _yaml_quote(value):
        # A JSON string literal is also a valid YAML double-quoted scalar.
        return json.dumps(str(value), ensure_ascii=False)

    markdown_content = (
        f"---\ntitle: {_yaml_quote(row['title'])}\n"
        f"alias: {_yaml_quote(row['longTitle'])}\n"
    )
    # Iterate the row's own labels rather than the global pivoted_df.columns so
    # the function only depends on its argument.
    for column in row.index:
        if column in ('title', 'longTitle'):
            continue
        value = row[column]
        if value != 'nan':
            markdown_content += f"\n{column}:\n"  # property name
            markdown_content += f"{value}"  # Add the bulleted content
    markdown_content += "\n---"  # Add the separator at the end
    return markdown_content

# Create the markdown files
def create_markdown_file(row):
    """Write the Markdown front matter for one row to ``<dir_name>/<title>.md``.

    Args:
        row: A row of the pivoted frame; ``row['title']`` supplies the file
            name and the whole row is passed to ``create_markdown_content``.

    The file is always written as UTF-8. Without an explicit encoding,
    ``open`` falls back to the platform default, which can mangle or reject
    the accented characters in the long titles.
    """
    file_name = os.path.join(dir_name, f"{row['title']}.md")
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(create_markdown_content(row))

# Write one Markdown file per row of pivoted_df
for _, pivot_row in pivoted_df.iterrows():
    create_markdown_file(pivot_row)

print(f"Markdown files created successfully in directory: {dir_name}")

Markdown files created successfully in directory: markdown_files_20


In [197]:
# Create a zip file of the markdown directory
def zip_directory(directory_path, zip_file_path):
    """Write every file under ``directory_path`` into a zip archive.

    Entries are stored with paths relative to ``directory_path`` and are
    DEFLATE-compressed (the ``ZipFile`` default is to store them
    uncompressed).

    Args:
        directory_path: Directory whose files are archived (walked
            recursively).
        zip_file_path: Path of the archive to create; an existing file is
            overwritten.
    """
    archive_abspath = os.path.abspath(zip_file_path)
    with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                # The archive already exists on disk while we walk; if it lives
                # inside the directory, do not add it to itself.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                arcname = os.path.relpath(file_path, directory_path)
                zipf.write(file_path, arcname=arcname)

# Bundle the generated Markdown directory into a single archive
archive_name = 'markdown_files.zip'
zip_directory(dir_name, archive_name)

print("Markdown files zipped to: markdown_files.zip")

Markdown files zipped to: markdown_files.zip
