<a href="https://colab.research.google.com/github/heidingaway/datapeople/blob/main/GCThesaurus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [157]:
import pandas as pd
import os
import zipfile
import yaml
import re
import unicodedata
import hashlib
from pickle import DEFAULT_PROTOCOL

def to_lower_camel_case(text):
    # Split the text into words based on underscores
    text = text.split(' ')
    # Convert the first word to lowercase and the rest to title case, then join them
    camel_case = text[0].lower() + ''.join(text.title() for text in text[1:])
    return camel_case



In [158]:
## Link to CSV

src = "https://canada.multites.net/cst/EAEAD1E6-7DD2-4997-BE7F-40BFB1CBE8A2/CST20240911.csv"

# Create the DataFrame from all the retrieved records
df = pd.read_csv(src)

# Add column names
df.columns = ['object', 'predicate', 'subject']

In [159]:
print(df.head())

                           object predicate               subject
0               2-spirited people       Use     Two-spirit people
1               2019-nCoV disease       Use  Coronavirus diseases
2  2019 novel coronavirus disease       Use  Coronavirus diseases
3                 2SLGBTQ+ people       Use      2SLGBTQI+ people
4                2SLGBTQI+ people    French    Personne 2ELGBTQI+


In [160]:
related_df = df[df['predicate'] == 'Related Term'].reset_index(drop=True)
related_df = related_df.rename(columns={'object': 'relatedTerm', 'subject': 'longTitle'})

# Define the strings you want to add
prefix = "\"[["  # Prefix without identifier
suffix = "]]\""

related_df['relatedTerm'] = prefix + related_df['relatedTerm'] + suffix
related_df['relatedTerm'] = "["+related_df['relatedTerm'] + "]"
print(related_df)

grouped_related_df = related_df.groupby('longTitle')['relatedTerm'].apply(lambda x: ','.join(x.astype(str))).reset_index()
print(grouped_related_df)

                        relatedTerm     predicate          longTitle
0             ["[[Abbreviations]]"]  Related Term        Terminology
1                  ["[[Abortion]]"]  Related Term          Bioethics
2                  ["[[Abortion]]"]  Related Term          Pregnancy
3       ["[[Access to education]]"]  Related Term          Education
4     ["[[Access to information]]"]  Related Term         Censorship
...                             ...           ...                ...
5165                  ["[[Youth]]"]  Related Term       Young adults
5166     ["[[Zoological gardens]]"]  Related Term            Animals
5167     ["[[Zoological gardens]]"]  Related Term  Protected species
5168                ["[[Zoology]]"]  Related Term    Animal research
5169                ["[[Zoology]]"]  Related Term            Animals

[5170 rows x 3 columns]
                  longTitle                                        relatedTerm
0            AIDS (disease)  ["[[Sexually transmitted diseases]]"],[

In [161]:
history_df = df[df['predicate'] == 'History note']
history_df['subject'] = "'" + history_df['subject'] + "'"

print(history_df.head())

                    object     predicate  \
296        Adverse effects  History note   
680          Air transport  History note   
2016         Bicycle paths  History note   
2082  Biochemical products  History note   
2090          Biochemicals  History note   

                                                subject  
296   '"Side effects" replaces "Adverse effects" as ...  
680   'Before May 2007, "Air transport" is not used ...  
2016  '"Bicycle paths" replaces "Cycling trails" as ...  
2082  '"Biochemicals" replaces "Biochemical products...  
2090  '"Biochemicals" replaces "Biochemical products...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  history_df['subject'] = "'" + history_df['subject'] + "'"


In [162]:
use_df = df[df['predicate'] == 'Use']
use_df = use_df.rename(columns={'object': 'aliases', 'subject': 'longTitle'})
use_df = use_df.drop(columns=['predicate'])
print(use_df)

                              aliases             longTitle
0                   2-spirited people     Two-spirit people
1                   2019-nCoV disease  Coronavirus diseases
2      2019 novel coronavirus disease  Coronavirus diseases
3                     2SLGBTQ+ people      2SLGBTQI+ people
25                          Abilities                Skills
...                               ...                   ...
22094                          Yeasts                 Fungi
22095                          Yeasts        Microorganisms
22129             Zoological research       Animal research
22141                            Zoos    Zoological gardens
22143                     Zootechnics      Animal husbandry

[2716 rows x 2 columns]


In [163]:
usefor_df = df[df['predicate'] == 'Used For']
usefor_df= usefor_df.rename(columns={'subject': 'aliases', 'object': 'longTitle'})
usefor_df = usefor_df.drop(columns=['predicate'])
print(usefor_df)

                 longTitle                          aliases
6         2SLGBTQI+ people                  2SLGBTQ+ people
7         2SLGBTQI+ people                     LGBTI people
8         2SLGBTQI+ people                    LGBTQ2 people
9         2SLGBTQI+ people                   LGBTQ2+ people
10        2SLGBTQI+ people                  LGBTQIA2 people
...                    ...                              ...
22062              Writing            Written communication
22086  Xenotransplantation  Animal-to-human transplantation
22109      Young offenders               Juvenile offenders
22124   Zoological gardens                             Zoos
22133              Zoology                  Animal sciences

[2717 rows x 2 columns]


In [164]:
appended_df = pd.concat([use_df, usefor_df], ignore_index=True).reset_index(drop=True)
appended_df.drop_duplicates(inplace=True)

print(appended_df)

                             aliases             longTitle
0                  2-spirited people     Two-spirit people
1                  2019-nCoV disease  Coronavirus diseases
2     2019 novel coronavirus disease  Coronavirus diseases
3                    2SLGBTQ+ people      2SLGBTQI+ people
4                          Abilities                Skills
...                              ...                   ...
2712                          Yeasts        Microorganisms
2713             Zoological research       Animal research
2714                            Zoos    Zoological gardens
2715                     Zootechnics      Animal husbandry
5206                 2-spirit people     Two-spirit people

[2717 rows x 2 columns]


In [165]:
fr_df = df[df['predicate'] == 'French']
fr_df = fr_df.rename(columns={'object': 'longTitle', 'subject': 'french'})
fr_df = fr_df.drop(columns=['predicate'])
fr_df.drop_duplicates(inplace=True)
print(fr_df)

                   longTitle                 french
4           2SLGBTQI+ people     Personne 2ELGBTQI+
22             Abbreviations            Abréviation
65                  Abortion             Avortement
79       Access to education    Accès à l'éducation
82     Access to information  Accès à l'information
...                      ...                    ...
22101           Young adults           Jeune adulte
22108        Young offenders     Jeune contrevenant
22117                  Youth               Jeunesse
22123     Zoological gardens      Jardin zoologique
22132                Zoology               Zoologie

[2312 rows x 2 columns]


In [192]:
merged_df = pd.merge(fr_df,appended_df, on='longTitle', how='left')

# Remove duplicates based on all columns
merged_df.drop_duplicates(inplace=True)
merged_df['aliases'] = merged_df['aliases'].apply(lambda x: f'"{x}"' if pd.notna(x) else x) ## add quotes
merged_df['aliases'] = merged_df['aliases'].apply(lambda x: f"[{x}]" if pd.notna(x) else x) ## add quotes
merged_df['french'] = merged_df['french'].apply(lambda x: f'"{x}"' if pd.notna(x) else x) ## add quotes
merged_df['french'] = merged_df['french'].apply(lambda x: f"[{x}]" if pd.notna(x) else x) ## add quotes
# Print the merged DataFrame (now without duplicates)
print(merged_df)

               longTitle                french                 aliases
0       2SLGBTQI+ people  "Personne 2ELGBTQI+"     ["2SLGBTQ+ people"]
1       2SLGBTQI+ people  "Personne 2ELGBTQI+"        ["LGBTI people"]
2       2SLGBTQI+ people  "Personne 2ELGBTQI+"       ["LGBTQ2 people"]
3       2SLGBTQI+ people  "Personne 2ELGBTQI+"      ["LGBTQ2+ people"]
4       2SLGBTQI+ people  "Personne 2ELGBTQI+"     ["LGBTQIA2 people"]
...                  ...                   ...                     ...
3909        Young adults        "Jeune adulte"                     NaN
3910     Young offenders  "Jeune contrevenant"  ["Juvenile offenders"]
3911               Youth            "Jeunesse"                     NaN
3912  Zoological gardens   "Jardin zoologique"                ["Zoos"]
3913             Zoology            "Zoologie"     ["Animal sciences"]

[3914 rows x 3 columns]


In [167]:
# Define a function to join non-empty values with comma
def join_non_empty(values):
    return ','.join([str(val) for val in values if pd.notna(val) and val != ''])

# Group by 'longTitle' and aggregate columns with comma separation, skipping empty values
aliased_df = merged_df.groupby(['longTitle', 'french']).agg({
    'aliases': join_non_empty,
}).reset_index()

aliased_df['combined'] = aliased_df.apply(lambda row: join_non_empty([row['french'], row['aliases']]), axis=1)

# Print the grouped DataFrame
print(aliased_df)

                longTitle                 french  \
0        2SLGBTQI+ people   "Personne 2ELGBTQI+"   
1          AIDS (disease)                 "Sida"   
2           Abbreviations          "Abréviation"   
3                Abortion           "Avortement"   
4     Access to education  "Accès à l'éducation"   
...                   ...                    ...   
2307         Young adults         "Jeune adulte"   
2308      Young offenders   "Jeune contrevenant"   
2309                Youth             "Jeunesse"   
2310   Zoological gardens    "Jardin zoologique"   
2311              Zoology             "Zoologie"   

                                                aliases  \
0     "2SLGBTQ+ people","LGBTI people","LGBTQ2 peopl...   
1     "Acquired immune deficiency syndrome","Acquire...   
2                                                         
3        "Pregnancy termination","Therapeutic abortion"   
4                                                         
...                  

In [168]:
# Merge history_df with appended_df, joining 'longTitle' and 'object'
mergeh_df = pd.merge(aliased_df, history_df, left_on='longTitle', right_on='object', how='left')  # Changed left_on and right_on

# Rename 'predicate' to 'historyNote' and drop 'object'
mergeh_df = mergeh_df.drop(columns=['aliases'])
mergeh_df = mergeh_df.rename(columns={'combined': 'aliases', 'subject': 'historyNote'})
mergeh_df = mergeh_df.drop(columns=['object', 'predicate']) # Changed this line to drop 'object' and 'predicate' separately

# Print the merged DataFrame
print(mergeh_df)

                longTitle                 french  \
0        2SLGBTQI+ people   "Personne 2ELGBTQI+"   
1          AIDS (disease)                 "Sida"   
2           Abbreviations          "Abréviation"   
3                Abortion           "Avortement"   
4     Access to education  "Accès à l'éducation"   
...                   ...                    ...   
2307         Young adults         "Jeune adulte"   
2308      Young offenders   "Jeune contrevenant"   
2309                Youth             "Jeunesse"   
2310   Zoological gardens    "Jardin zoologique"   
2311              Zoology             "Zoologie"   

                                                aliases historyNote  
0     "Personne 2ELGBTQI+","2SLGBTQ+ people","LGBTI ...         NaN  
1     "Sida","Acquired immune deficiency syndrome","...         NaN  
2                                         "Abréviation"         NaN  
3     "Avortement","Pregnancy termination","Therapeu...         NaN  
4                        

In [169]:
scope_df = df[df['predicate'] == 'Scope Note']
scope_df = scope_df.rename(columns={'object': 'longTitle', 'subject': 'scopeNote'})
scope_df['scopeNote'] = "'" + scope_df['scopeNote'] + "'"

mergeds_df = pd.merge(mergeh_df, scope_df, on='longTitle', how='left')
mergeds_df = mergeds_df.drop(columns=['predicate'])

smergeds_df = pd.merge(mergeds_df, grouped_related_df, on='longTitle', how='left')

# Print the merged DataFrame:
print(smergeds_df)

smergeds_df = pd.merge(mergeds_df, grouped_related_df, on='longTitle', how='left')

# Print the merged DataFrame:
print(smergeds_df)

                longTitle                 french  \
0        2SLGBTQI+ people   "Personne 2ELGBTQI+"   
1          AIDS (disease)                 "Sida"   
2           Abbreviations          "Abréviation"   
3                Abortion           "Avortement"   
4     Access to education  "Accès à l'éducation"   
...                   ...                    ...   
2307         Young adults         "Jeune adulte"   
2308      Young offenders   "Jeune contrevenant"   
2309                Youth             "Jeunesse"   
2310   Zoological gardens    "Jardin zoologique"   
2311              Zoology             "Zoologie"   

                                                aliases historyNote  \
0     "Personne 2ELGBTQI+","2SLGBTQ+ people","LGBTI ...         NaN   
1     "Sida","Acquired immune deficiency syndrome","...         NaN   
2                                         "Abréviation"         NaN   
3     "Avortement","Pregnancy termination","Therapeu...         NaN   
4                   

In [170]:
nt_df = df[df['predicate'] == 'Narrower Term']
nt_df = nt_df.rename(columns={'object': 'broaderTerm', 'subject': 'narrowerTerm'})
nt_df = nt_df.drop(columns=['predicate'])

print(nt_df)

            broaderTerm                 narrowerTerm
12     2SLGBTQI+ people               Asexual people
13     2SLGBTQI+ people              Bisexual people
14     2SLGBTQI+ people                   Gay people
15     2SLGBTQI+ people  Gender-nonconforming people
16     2SLGBTQI+ people              Intersex people
...                 ...                          ...
22065           Writing          Legislative writing
22066           Writing            Technical writing
22135           Zoology                   Entomology
22136           Zoology                  Ichthyology
22137           Zoology                  Ornithology

[1267 rows x 2 columns]


In [171]:
bt_df = df[df['predicate'] == 'Broader Term']
bt_df = bt_df.rename(columns={'object': 'narrowerTerm', 'subject': 'broaderTerm'})
bt_df = bt_df.drop(columns=['predicate'])

print(bt_df)

                                      narrowerTerm                 broaderTerm
11                                2SLGBTQI+ people                  Minorities
102    Accessibility for persons with disabilities               Accessibility
109                           Accessible transport                   Transport
115                            Accident prevention                  Prevention
140                                     Accounting           Business services
...                                            ...                         ...
22087                          Xenotransplantation  Transplantation (Medicine)
22099                                       Yogurt              Dairy products
22103                                 Young adults                      Adults
22125                           Zoological gardens     Recreational facilities
22134                                      Zoology                     Biology

[1267 rows x 2 columns]


In [172]:
ntbt_df = pd.concat([bt_df, nt_df])
ntbt_df = ntbt_df.drop_duplicates()

print(ntbt_df)

                                      narrowerTerm                 broaderTerm
11                                2SLGBTQI+ people                  Minorities
102    Accessibility for persons with disabilities               Accessibility
109                           Accessible transport                   Transport
115                            Accident prevention                  Prevention
140                                     Accounting           Business services
...                                            ...                         ...
22087                          Xenotransplantation  Transplantation (Medicine)
22099                                       Yogurt              Dairy products
22103                                 Young adults                      Adults
22125                           Zoological gardens     Recreational facilities
22134                                      Zoology                     Biology

[1267 rows x 2 columns]


In [173]:
# Unpivot the DataFrame
unpivoted_df = pd.melt(ntbt_df, value_vars=['narrowerTerm', 'broaderTerm'], var_name='term_type', value_name='term')

# Remove duplicate rows
unpivoted_df.drop_duplicates(subset=['term'], inplace=True)

print(unpivoted_df)

         term_type                                         term
0     narrowerTerm                             2SLGBTQI+ people
1     narrowerTerm  Accessibility for persons with disabilities
2     narrowerTerm                         Accessible transport
3     narrowerTerm                          Accident prevention
4     narrowerTerm                                   Accounting
...            ...                                          ...
2439   broaderTerm                        Railway installations
2454   broaderTerm                                    Radiation
2479   broaderTerm                               Rehabilitation
2485   broaderTerm                                    Monuments
2506   broaderTerm                       Assistive technologies

[1438 rows x 2 columns]


In [174]:
# Perform the join
joined_df = pd.merge(unpivoted_df, aliased_df[['longTitle', 'french', 'combined']], left_on='term', right_on='longTitle', how='left')

# Drop the 'longTitle' column from the joined DataFrame (if you don't need it)
joined_df = joined_df.drop(columns=['term_type'])

print(joined_df)

                                             term  \
0                                2SLGBTQI+ people   
1     Accessibility for persons with disabilities   
2                            Accessible transport   
3                             Accident prevention   
4                                      Accounting   
...                                           ...   
1433                        Railway installations   
1434                                    Radiation   
1435                               Rehabilitation   
1436                                    Monuments   
1437                       Assistive technologies   

                                        longTitle  \
0                                2SLGBTQI+ people   
1     Accessibility for persons with disabilities   
2                            Accessible transport   
3                             Accident prevention   
4                                      Accounting   
...                                          

In [175]:
subcat_df = df[df['predicate'] == 'Subject Category']
subcat_df = subcat_df.rename(columns={'object': 'longTitle', 'subject': 'subcat'})
subcat_df = subcat_df.drop(columns=['predicate'])

print(subcat_df)

                 longTitle                       subcat
21        2SLGBTQI+ people       SO Society and Culture
24           Abbreviations  LN Language and Linguistics
26               Abilities                 PR Processes
28      Aboriginal affairs   GV Government and Politics
29      Aboriginal affairs       SO Society and Culture
...                    ...                          ...
22131  Zoological research    ST Science and Technology
22140              Zoology    ST Science and Technology
22142                 Zoos    NE Nature and Environment
22144          Zootechnics               AG Agriculture
22145          Zootechnics    NE Nature and Environment

[6306 rows x 2 columns]


In [176]:
filtered_subcat_df = subcat_df[subcat_df['longTitle'].isin(ntbt_df['broaderTerm'])]
new_df = filtered_subcat_df.sort_values(by=['subcat']).copy().reset_index(drop=True)
print(new_df)

                 longTitle                      subcat
0              Visual arts  AA Arts, Music, Literature
1          Performing arts  AA Arts, Music, Literature
2                     Arts  AA Arts, Music, Literature
3         Public buildings  AA Arts, Music, Literature
4                 Drawings  AA Arts, Music, Literature
..                     ...                         ...
508     Navigation systems                TR Transport
509   Marine installations                TR Transport
510         Motor vehicles                TR Transport
511               Vehicles                TR Transport
512  Transportation safety                TR Transport

[513 rows x 2 columns]


In [177]:
unique_subcats = new_df['subcat'].unique().tolist()
print(unique_subcats)

indval = 501.01 # Initialize indval
updated_subcats = []
for i, subcat in enumerate(unique_subcats):
    updated_subcat =  f"{indval + i * 0.01:.2f} {subcat}"   # Add indval + 0.01 to the beginning
    updated_subcats.append(updated_subcat)

print(updated_subcats)

['AA Arts, Music, Literature', 'AG Agriculture', 'EC Economics and Industry', 'ET Education and Training', 'FM Form descriptors', 'GV Government and Politics', 'HE Health and Safety', 'HI History and Archaeology', 'IN Information and Communications', 'LB Labour', 'LN Language and Linguistics', 'LW Law', 'MI Military', 'NE Nature and Environment', 'PE Persons', 'PR Processes', 'SO Society and Culture', 'ST Science and Technology', 'TR Transport']
['501.01 AA Arts, Music, Literature', '501.02 AG Agriculture', '501.03 EC Economics and Industry', '501.04 ET Education and Training', '501.05 FM Form descriptors', '501.06 GV Government and Politics', '501.07 HE Health and Safety', '501.08 HI History and Archaeology', '501.09 IN Information and Communications', '501.10 LB Labour', '501.11 LN Language and Linguistics', '501.12 LW Law', '501.13 MI Military', '501.14 NE Nature and Environment', '501.15 PE Persons', '501.16 PR Processes', '501.17 SO Society and Culture', '501.18 ST Science and Tec

In [178]:
subcat_mapping = dict(zip(unique_subcats, updated_subcats))  # Create a mapping dictionary
new_df['subcat'] = new_df['subcat'].map(subcat_mapping)

print(new_df)

                 longTitle                             subcat
0              Visual arts  501.01 AA Arts, Music, Literature
1          Performing arts  501.01 AA Arts, Music, Literature
2                     Arts  501.01 AA Arts, Music, Literature
3         Public buildings  501.01 AA Arts, Music, Literature
4                 Drawings  501.01 AA Arts, Music, Literature
..                     ...                                ...
508     Navigation systems                501.19 TR Transport
509   Marine installations                501.19 TR Transport
510         Motor vehicles                501.19 TR Transport
511               Vehicles                501.19 TR Transport
512  Transportation safety                501.19 TR Transport

[513 rows x 2 columns]


In [179]:
# Define the strings you want to add
prefix = "\"[["  # Prefix without identifier
suffix = "]]\""

new_df['longTitle'] = prefix + new_df['longTitle'] + suffix

print(new_df)

                       longTitle                             subcat
0              "[[Visual arts]]"  501.01 AA Arts, Music, Literature
1          "[[Performing arts]]"  501.01 AA Arts, Music, Literature
2                     "[[Arts]]"  501.01 AA Arts, Music, Literature
3         "[[Public buildings]]"  501.01 AA Arts, Music, Literature
4                 "[[Drawings]]"  501.01 AA Arts, Music, Literature
..                           ...                                ...
508     "[[Navigation systems]]"                501.19 TR Transport
509   "[[Marine installations]]"                501.19 TR Transport
510         "[[Motor vehicles]]"                501.19 TR Transport
511               "[[Vehicles]]"                501.19 TR Transport
512  "[[Transportation safety]]"                501.19 TR Transport

[513 rows x 2 columns]


In [180]:
result_df = new_df.groupby('subcat')['longTitle'].apply(','.join).reset_index()

print(result_df)

                                      subcat  \
0          501.01 AA Arts, Music, Literature   
1                      501.02 AG Agriculture   
2           501.03 EC Economics and Industry   
3           501.04 ET Education and Training   
4                 501.05 FM Form descriptors   
5          501.06 GV Government and Politics   
6                501.07 HE Health and Safety   
7          501.08 HI History and Archaeology   
8   501.09 IN Information and Communications   
9                           501.10 LB Labour   
10        501.11 LN Language and Linguistics   
11                             501.12 LW Law   
12                        501.13 MI Military   
13          501.14 NE Nature and Environment   
14                         501.15 PE Persons   
15                       501.16 PR Processes   
16             501.17 SO Society and Culture   
17          501.18 ST Science and Technology   
18                       501.19 TR Transport   

                                       

In [181]:
# Create a unique folder name using uuid
import uuid
unique_folder_name = str(uuid.uuid4())
os.makedirs(unique_folder_name, exist_ok=True)  # Create the folder

alias_start = 2025010217001  # Initial alias value

# List to store markdown file titles
markdown_titles = []

def create_markdown_file(row, folder_name, alias_start):
    filename = os.path.join(folder_name, f"{row['subcat']}.md")
    with open(filename, 'w') as f:
        # Wrap the class property value in square brackets
        class_value = f"[{row['longTitle']}]"
        f.write(f"---\ntags: gccommon\ntitle: {row['subcat']}\nclass: {class_value}\naliases: {alias_start}\n---\n")

    # Append the title to the list
    markdown_titles.append(row['subcat'])

    return alias_start + 1

# Apply the function to each row, passing the folder name and alias_start
# Use a for loop to iterate over rows and update alias_start for each file creation
for index, row in result_df.iterrows():
    alias_start = create_markdown_file(row, unique_folder_name, alias_start)

# Get the full path of the folder
full_path = os.path.abspath(unique_folder_name)

print(f"Output files are in the folder: {full_path}")

Output files are in the folder: /content/4543f681-b801-4ed8-859a-88ee6c26847f


In [182]:
# Zip the output folder
zip_file_name = f"{unique_folder_name}.zip"
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for root, _, files in os.walk(unique_folder_name):
        for file in files:
            zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), unique_folder_name))

print(f"Output files zipped to: {zip_file_name}")

Output files zipped to: 4543f681-b801-4ed8-859a-88ee6c26847f.zip


In [183]:
print("Markdown File Titles:")
for title in markdown_titles:
    print(f'"[[{title}]]"')

Markdown File Titles:
"[[501.01 AA Arts, Music, Literature]]"
"[[501.02 AG Agriculture]]"
"[[501.03 EC Economics and Industry]]"
"[[501.04 ET Education and Training]]"
"[[501.05 FM Form descriptors]]"
"[[501.06 GV Government and Politics]]"
"[[501.07 HE Health and Safety]]"
"[[501.08 HI History and Archaeology]]"
"[[501.09 IN Information and Communications]]"
"[[501.10 LB Labour]]"
"[[501.11 LN Language and Linguistics]]"
"[[501.12 LW Law]]"
"[[501.13 MI Military]]"
"[[501.14 NE Nature and Environment]]"
"[[501.15 PE Persons]]"
"[[501.16 PR Processes]]"
"[[501.17 SO Society and Culture]]"
"[[501.18 ST Science and Technology]]"
"[[501.19 TR Transport]]"


In [184]:
tax_df = ntbt_df.reset_index(drop=True).sort_values(by=['broaderTerm'])
tax_df['narrowerTerm'] = prefix + tax_df['narrowerTerm'] + suffix
tax_df =tax_df.groupby('broaderTerm')['narrowerTerm'].apply(','.join).reset_index()

print(tax_df)

          broaderTerm                                       narrowerTerm
0    2SLGBTQI+ people  "[[Bisexual people]]","[[Pansexual people]]","...
1       Accessibility  "[[Accessibility for persons with disabilities]]"
2           Accidents  "[[Occupational accidents]]","[[Road accidents...
3      Accountability                              "[[Open government]]"
4          Accounting                         "[[Accounting standards]]"
..                ...                                                ...
426          Wildlife                             "[[Aquatic wildlife]]"
427     Winter sports            "[[Skiing]]","[[Hockey]]","[[Skating]]"
428           Workers  "[[Agricultural workers]]","[[Industrial worke...
429           Writing  "[[Technical writing]]","[[Administrative writ...
430           Zoology  "[[Ornithology]]","[[Ichthyology]]","[[Entomol...

[431 rows x 2 columns]


In [185]:
alias_start = 2025010217491  # Initial alias value

def create_markdown_file(row, folder_name, alias_start):
    filename = os.path.join(folder_name, f"{row['broaderTerm']}.md")  # Use broaderTerm as filename
    with open(filename, 'w') as f:
        narrower_term_value = f"[{row['narrowerTerm']}]"  # Wrap narrowerTerm in []
        f.write(f"---\ntags: gccommon\ntitle: {row['broaderTerm']}\nnarrowerTerm: {narrower_term_value}\naliases: {alias_start}\n---\n")
    return alias_start + 1

# Create a unique folder name using uuid
ufolder_name = str(uuid.uuid4())
# Fix: Use ufolder_name in os.makedirs
os.makedirs(ufolder_name, exist_ok=True)  # Create the folder

# Apply the function to each row of tax_df
for index, row in tax_df.iterrows():
    alias_start = create_markdown_file(row, ufolder_name, alias_start)


# Get the full path of the folder
full_path = os.path.abspath(ufolder_name)

print(f"Output files are in the folder: {full_path}")

Output files are in the folder: /content/c8b71928-9eff-4ac2-baa3-d784313b2b09


In [186]:
# Zip the output folder
zip_file_name = f"{ufolder_name}.zip"
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for root, _, files in os.walk(ufolder_name):
        for file in files:
            zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), ufolder_name))

print(f"Output files zipped to: {zip_file_name}")

Output files zipped to: c8b71928-9eff-4ac2-baa3-d784313b2b09.zip


In [187]:
# Create a new directory for Markdown files with a unique name
dir_name = "markdown_files"
i = 1
while os.path.exists(dir_name):
    dir_name = f"markdown_files_{i}"
    i += 1
os.makedirs(dir_name)

# Function to create markdown content with bullet points
def create_markdown_content(row):
    markdown_content = f"---\ntitle: {row['longTitle']}\ntags:\n- gccommon\n"
    for column in mergeds_df.columns:
        if column not in ['longTitle']:
            value = row[column]
            # Check if value is a Series and handle it appropriately
            if isinstance(value, pd.Series):
                # Use any() to check if any value in the Series is not NA and not empty string
                if value.notna().any() and value.astype(str).str.strip().ne('').any():
                    markdown_content += f"{column}: {','.join(value.astype(str).tolist())}\n"
            # If not a Series, use pd.notna as before
            elif pd.notna(value) and value != '':
                markdown_content += f"{column}: {value}\n"
    markdown_content += "---"
    return markdown_content

def create_markdown_file(row):
    file_name = os.path.join(dir_name, f"{row['longTitle']}.md")
    with open(file_name, "w") as f:
        f.write(create_markdown_content(row))

mergeds_df.apply(create_markdown_file, axis=1)

print(f"Markdown files created successfully in directory: {dir_name}")

# Zip the output folder
zip_file_name = f"{dir_name}.zip"
with zipfile.ZipFile(zip_file_name, 'w') as zipf:
    for root, _, files in os.walk(dir_name):  # change ufolder_name to dir_name
        for file in files:
            zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), dir_name))  # change ufolder_name to dir_name

print(f"Output files zipped to: {zip_file_name}")

Markdown files created successfully in directory: markdown_files_8
Output files zipped to: markdown_files_8.zip


In [188]:
print(len(smergeds_df))  # Print number of rows
print(smergeds_df.shape[0])  # Print number of rows

2312
2312
