In [40]:
import os
import sqlite3
from collections import defaultdict
from datetime import date
import xml.etree.ElementTree as ET

directory = "/Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcripted_episodes/episodes_with_timestamps/new_episodes"
saf_root = "SAF/"

def create_SAF_Folder(episode_number:int):
    # Create the folder
    episode_folder_path = saf_root + str(episode_number)
    print(episode_folder_path)
    os.makedirs(episode_folder_path, exist_ok=True)

    # Connect to cursor to retreive all important information.
    with sqlite3.connect("nasil_olunur_database.db") as connection:
        cursor = connection.cursor()
        query = """SELECT title,episode_date,audio_url,word_document_name,description_tr,description_en
                    FROM episodes WHERE episode_number = ?"""
        cursor.execute(query, (episode_number,))
        episode = cursor.fetchone()
        title,episode_date,audio_url,word_document_name,description_tr,description_en = episode

        # Need to retrieve guest information
        guest_query = """ SELECT guest_name FROM guests WHERE episode_number = ?"""
        cursor.execute(guest_query, (episode_number,))
        guests = cursor.fetchall()
        guests = ' - '.join(name[0] for name in guests) # it's a string now.

        # Need to retrieve subject keywords.
        keyword_query = """SELECT type, keyword_text FROM keywords WHERE episode_number = ?"""
        cursor.execute(keyword_query, (episode_number,))
        keywords = cursor.fetchall()
        kw_dict = defaultdict(list) # key = keyword type, val = keyword text
        for label, kw in keywords:
            kw_dict[label].append(kw)

    # Add the .txt file
    add_txt(episode_folder_path, word_document_name)

    # Create the contents file
    create_contents(episode_folder_path)

    # Create the dublin_core
    # create_dublin_core(episode_number, episode_folder_path)
    create_dublin_core(guests,
                       episode_date,
                       description_tr,
                       description_en,
                       title,
                       audio_url,
                       kw_dict["subject_tr"],
                       kw_dict["subject_en"],
                       episode_number,
                       episode_folder_path)

    # Create the metadata file
    create_metadata(episode_number,
                    episode_folder_path,
                    kw_dict["person"],
                    kw_dict["place_tr"])

# create_SAF_Folder(235)

SAF/235


In [42]:
# Batch Create SAF FOLDERS
def batch_create_SAF(start_ep:int, end_ep:int):

    for i in range(start_ep, end_ep+1):
        try:
            create_SAF_Folder(i)
        except Exception as e:
            print(f"Error when creating saf folder {i}. --> {e}")
    remove_ds_store()
    zip_folders(start_ep,end_ep+1)

# batch_create_SAF(235,243)

SAF/235
SAF/236
SAF/237
SAF/238
SAF/239
SAF/240
SAF/241
SAF/242
SAF/243


ValueError: invalid literal for int() with base 10: 'zip'

In [47]:
import shutil
import os

def zip_folders(start_ep:int, end_ep:int):
    directory = "/Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF"
    zip_directory = directory + "/zip"

    for item in os.listdir(directory):
        print("item is: ", item)
        item_path = os.path.join(directory,item)

        #Check if the item is a folder
        if os.path.isdir(item_path) and (start_ep <= int(item) <= end_ep):
            zip_path = os.path.join(zip_directory,item)
            try:
                shutil.make_archive(zip_path, 'zip', item_path)
                print(f"Zipped: {item_path} -> {zip_path}.zip")
            except Exception as e:
                print(f"Problem zipping {item_path}. Error -> {e}")

# zip_folders(235,243)

item is:  .DS_Store
item is:  242
Zipped: /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/242 -> /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/zip/242.zip
item is:  243
Zipped: /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/243 -> /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/zip/243.zip
item is:  235
Zipped: /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/235 -> /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/zip/235.zip
item is:  241
Zipped: /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/241 -> /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/zip/241.zip
item is:  240
Zipped: /Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcription_notebooks/SAF/240 -> /Users/kaanerdem/Desktop/projeler/pdtxt/

In [15]:
import os

def remove_ds_store(): # Removes the macOS specific .DS_Store files
    directory = "/Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/SAF"
    for root, _, files in os.walk(directory):
        for file in files:
            if file == ".DS_Store":
                os.remove(os.path.join(root, file))

# remove_ds_store()

In [17]:
# Keyword querying trial
def deneme_keyword_retrieval():
    from collections import defaultdict
    episode_number = 61
    with sqlite3.connect("nasil_olunur_database.db") as connection:
        cursor = connection.cursor()
        keyword_query = """
            SELECT type, keyword_text FROM keywords WHERE episode_number = ?
        """
        cursor.execute(keyword_query, (episode_number,))
        keywords = cursor.fetchall()

        kw_dict = defaultdict(list)
        for label, kw in keywords:
            kw_dict[label].append(kw)
            # print(f"Label-> {label}, keyword-> {kw}")

        print(kw_dict.keys())

In [18]:
"""
    Function to create dublin_core file
"""

def create_dublin_core(guests,episode_date, description_tr, description_en, title, audio_url, subject_tr, subject_en, episode_number, episode_folder_path):

    #Clearing audio_url
    audio_url = audio_url[:audio_url.find("mp3")+3]

    dublin_core_metadata_structure = {
        "contributor_author": guests,
        "date_accessioned": str(date.today()),
        "date_issued": str(episode_date), # tire (-) ile ayirmam gerekir mi?
        "episode_no": str(episode_number),
        "identifier_uri": audio_url,
        "description_abstract_en": description_en,
        "description_abstract_tr": description_tr,
        "language_iso": "tr",
        "subjects": {
            "en": subject_en,
            "tr": subject_tr,
        },
        "title": title,
        "type": "Recording, oral",
    }

    # Create the root element
    root = ET.Element("dublin_core", attrib={"schema": "dc"})

    # Add metadata as sub-elements
    ET.SubElement(root, "dcvalue", attrib={"element": "contributor", "qualifier": "author"}).text = dublin_core_metadata_structure["contributor_author"]
    ET.SubElement(root, "dcvalue", attrib={"element": "date", "qualifier": "accessioned"}).text = dublin_core_metadata_structure["date_accessioned"]
    ET.SubElement(root, "dcvalue", attrib={"element": "date", "qualifier": "issued"}).text = dublin_core_metadata_structure["date_issued"]
    ET.SubElement(root, "dcvalue", attrib={"element": "identifier", "qualifier": "none"}).text = dublin_core_metadata_structure["episode_no"]
    ET.SubElement(root, "dcvalue", attrib={"element": "identifier", "qualifier": "uri"}).text = dublin_core_metadata_structure["identifier_uri"]

    # Add descriptions
    ET.SubElement(
        root,
        "dcvalue",
        attrib={"element": "description", "qualifier": "version"}
    ).text = dublin_core_metadata_structure["description_abstract_en"]
    ET.SubElement(
        root,
        "dcvalue",
        attrib={"element": "description", "qualifier": "abstract"}
    ).text = dublin_core_metadata_structure["description_abstract_tr"]

    # Add Language attribute
    ET.SubElement(
        root,
        "dcvalue",
        attrib={"element": "language", "qualifier": "iso"}
    ).text = "tr"

    # Add subjects
    for subject in dublin_core_metadata_structure["subjects"]["en"]:
        ET.SubElement(root, "dcvalue", attrib={"element": "subject", "qualifier": "none"}).text = subject

    for subject in dublin_core_metadata_structure["subjects"]["tr"]:
        ET.SubElement(root, "dcvalue", attrib={"element": "subject", "qualifier": "none"}).text = subject

    # Add title and type
    ET.SubElement(root, "dcvalue", attrib={"element": "title", "qualifier": "none"}).text = dublin_core_metadata_structure["title"]
    ET.SubElement(root, "dcvalue", attrib={"element": "type", "qualifier": "none"}).text = dublin_core_metadata_structure["type"]

    # Write to an XML file
    output_file = os.path.join(episode_folder_path, "dublin_core.xml")
    tree = ET.ElementTree(root)
    tree.write(output_file, encoding="utf-8", xml_declaration=True)

    # print(f"XML file created: {output_file}")

In [19]:
def create_contents(path):
    text = ""
    for item in os.listdir(path):
        if item != ".DS_Store":
            text += item + "\n"

    # save it into a file in correct dir.
    with open(path+"/contents", "w") as f:
        f.write(text)
# create_contents("/Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/SAF/61")


In [39]:
from docx import Document

def add_txt(episode_folder_path, word_document_name):
    doc = Document(word_document_name)
    text = "\n".join([para.text for para in doc.paragraphs])

    n_start = word_document_name.find("new_episodes/") + 13
    name = word_document_name[n_start:-5]+".txt".strip()

    with open(f"{episode_folder_path}/{name}", "w") as file:
        file.write(text)


In [37]:
import sqlite3
import os

def add_word_addresses():
    with sqlite3.connect("nasil_olunur_database.db") as connection:
        data = []
        directory = "/Users/kaanerdem/Desktop/projeler/pdtxt/pdtxt_notebook/transcripted_episodes/episodes_with_timestamps/new_episodes"
        for filename in os.listdir(directory):
            if filename.endswith(".docx"):
                ep_no = int(filename[:filename.find("-")].strip())
                full_ad = os.path.join(directory, filename)
                data.append((full_ad,ep_no))

        cursor = connection.cursor()
        query = """
            UPDATE episodes
            SET word_document_name = ?
            WHERE episode_number = ?
        """
        cursor.executemany(query,data)

add_word_addresses()

In [38]:
def create_metadata(episode_number,episode_folder_path, person, place_tr):

    # Create the root element
    root = ET.Element("dublin_core", schema="local")

    # Add people to the XML
    for p in person:
        dcvalue = ET.SubElement(root, "dcvalue")
        dcvalue.set("element", "person")
        dcvalue.set("qualifier", "name")
        dcvalue.text = p

    # Add places to the XML
    for place in place_tr:
        dcvalue = ET.SubElement(root, "dcvalue")
        dcvalue.set("element", "place")
        dcvalue.set("qualifier", "name")
        dcvalue.text = place

    # Create an ElementTree object
    tree = ET.ElementTree(root)

    output_dir = episode_folder_path
    output_file = os.path.join(output_dir, "metadata_local.xml")
    # Write the tree to an XML file
    with open(output_file, "wb") as file:
        file.write(b'<?xml version="1.0" encoding="utf-8" standalone="no"?>\n')
        tree.write(file, encoding="utf-8", xml_declaration=False)