# Friends show transcripts

In [2]:
import os
import zipfile

def zip_directory(folder_path, zip_file_path):
    """Compress every file under ``folder_path`` into a ZIP archive.

    Entries are stored with paths relative to ``folder_path`` and compressed
    with DEFLATE.

    Args:
        folder_path: Directory whose files are walked recursively and archived.
        zip_file_path: Path of the archive to create (overwritten if present).
            It may live inside ``folder_path``; the archive itself is then
            left out of its own contents.
    """
    # The archive is created on disk as soon as ZipFile opens it, so when it
    # lives inside folder_path the walk below would otherwise pick it up and
    # try to add the half-written archive to itself.
    archive_key = os.path.normcase(os.path.abspath(zip_file_path))

    with zipfile.ZipFile(zip_file_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                if os.path.normcase(os.path.abspath(file_path)) == archive_key:
                    continue
                zipf.write(file_path, os.path.relpath(file_path, folder_path))

In [1]:
import os
import pandas as pd

# Read the TSV dataset (one row per spoken line; the code below relies on the
# columns season_id, episode_id, scene_id, speaker and transcript).
file_path = 'raw_data/friends_transcripts.tsv'
df = pd.read_csv(file_path, sep='\t')

destination_path = "datasets/friends"

# Create the folder structure and write one transcript file per scene:
#   <destination_path>/<season_id>/<episode_id>/<scene_id>.txt
for (season_id, episode_id, scene_id), group in df.groupby(['season_id', 'episode_id', 'scene_id']):
    # Create the directory if it doesn't exist
    directory = f'{destination_path}/{season_id}/{episode_id}'
    os.makedirs(directory, exist_ok=True)

    # File path for the transcript
    file_name = f'{scene_id}.txt'
    file_path = os.path.join(directory, file_name)

    # Write the transcript to the file. The encoding is pinned to UTF-8 so the
    # output does not depend on the platform's locale: read_csv decodes the
    # source as UTF-8 by default, and a narrower default encoding on write
    # could fail or corrupt non-ASCII characters.
    with open(file_path, 'w', encoding='utf-8') as file:
        for _, row in group.iterrows():
            file.write(f'{row["speaker"]}: {row["transcript"]}\n')

print("Transcripts have been successfully written.")

# Define the zip file path.
# NOTE(review): the archive lives inside the folder that is being zipped, and
# zip_directory walks that folder while the archive is already open -- confirm
# the archive does not end up containing a copy of itself.
zip_file_path = f'{destination_path}/friends_transcripts.zip'

# Zip the directory
zip_directory(destination_path, zip_file_path)

print("Files have been successfully zipped.")



Transcripts have been successfully written.


# Purdue Pharma Bankruptcy Transcripts Collection

In [3]:
import os
import shutil

def separate_and_rename_documents(main_folder_path, pdf_folder_path, ocr_folder_path):
    """Move PDF and OCR files out of a folder tree into two parallel trees.

    The tree under ``main_folder_path`` is walked recursively. Each ``.pdf``
    file is moved to the same relative sub-folder under ``pdf_folder_path``;
    each ``.ocr`` file is moved to the same relative sub-folder under
    ``ocr_folder_path`` with its ``.ocr`` suffix replaced by ``.txt``. Files
    with any other extension are left where they are.

    Args:
        main_folder_path: Root of the tree to scan. Matching files are moved
            (not copied) out of it.
        pdf_folder_path: Destination root for ``.pdf`` files.
        ocr_folder_path: Destination root for the renamed ``.ocr`` files.
    """
    pdf_suffix = '.pdf'
    ocr_suffix = '.ocr'

    # Create destination directories if they don't exist
    os.makedirs(pdf_folder_path, exist_ok=True)
    os.makedirs(ocr_folder_path, exist_ok=True)

    destination_keys = {
        os.path.normcase(os.path.abspath(pdf_folder_path)),
        os.path.normcase(os.path.abspath(ocr_folder_path)),
    }

    # Traverse the main folder
    for root, dirs, files in os.walk(main_folder_path):
        # If a destination folder sits inside the tree being scanned, do not
        # descend into it: files that were just moved there would otherwise be
        # found again and moved one level deeper.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) not in destination_keys
        ]

        relative_path = os.path.relpath(root, main_folder_path)

        for file in files:
            if file.endswith(pdf_suffix):
                dest_root = pdf_folder_path
                new_file_name = file
            elif file.endswith(ocr_suffix):
                dest_root = ocr_folder_path
                # Swap only the trailing extension; str.replace would also
                # rewrite any '.ocr' appearing earlier in the file name.
                new_file_name = file[:-len(ocr_suffix)] + '.txt'
            else:
                continue

            dest_folder = os.path.join(dest_root, relative_path)
            os.makedirs(dest_folder, exist_ok=True)
            shutil.move(os.path.join(root, file), os.path.join(dest_folder, new_file_name))

# Define the paths
# Source tree holding the raw bankruptcy documents, and the two output trees:
# one for the PDFs, one for the OCR text (written with a .txt extension).
main_folder_path = 'raw_data/purdue_pharma_bankruptcy'
pdf_folder_path = 'datasets/purdue_pharma_bankruptcy/pdf'
ocr_folder_path = 'datasets/purdue_pharma_bankruptcy/txt'

# Split the raw documents by extension. Files are moved, so the matching
# documents no longer exist under main_folder_path afterwards.
separate_and_rename_documents(
    main_folder_path=main_folder_path,
    pdf_folder_path=pdf_folder_path,
    ocr_folder_path=ocr_folder_path,
)

print("Documents have been successfully separated by extension.")


Documents have been successfully separated by extension.
