In [None]:
import logging
from datetime import datetime
from pathlib import Path

# Name of this notebook; log files are grouped under logs/<current_file_name>/.
current_file_name = "7_3_Combine_Chunks"

# One log file per run, named after the time the cell was executed.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away, so the directory must already
# exist; create it here so a fresh checkout does not fail with FileNotFoundError.
Path(log_file).parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    filename=log_file,
    filemode="w",
    format="%(asctime)s %(levelname)s %(message)s",
)

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import os
import csv
import json
from pathlib import Path
import pandas as pd

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.utils import *

In [None]:
# Let pandas display up to 500 columns before truncating wide frames.
pd.options.display.max_columns = 500

In [None]:
def combine_files(variant):
    """Merge per-chunk transcript JSON files into one transcript per elaboration.

    For every ``elaboration_*`` directory found under
    ``data/6_2_Remove_Pauses/{variant}/<respondent>/`` the file
    ``<elaboration>_removed_silence_timestamps.csv`` is read. Each CSV row
    names a chunk (``chunk_name``) and its ``start`` position in milliseconds.
    The matching ``<chunk stem>_response.json`` is loaded from
    ``data/7_2_Elaborations_Transcripts_From_Chunks/{variant}/<respondent>/<elaboration>/``,
    the ``start``/``end`` of each of its segments and words are shifted by the
    chunk start (converted to seconds), and the chunks are concatenated in CSV
    row order.

    For each elaboration two files are written to
    ``data/7_3_Combine_Chunks/{variant}/<respondent>/``:

    * ``<elaboration>_response.json`` with the keys ``text``, ``segments``
      and ``words``;
    * ``<elaboration>.txt`` with the combined text (each chunk text is
      followed by a single space).

    An elaboration without a timestamps CSV is skipped with a warning and gets
    no output. A chunk without a JSON file is skipped with a warning, and the
    remaining chunks are still combined. Warnings are printed and also sent to
    the ``logging`` module.

    Args:
        variant: Name of the sub-directory used under each of the three data
            roots (the notebook calls this with ``"FG"`` and ``"H"``).

    Raises:
        FileNotFoundError: If ``data/6_2_Remove_Pauses/{variant}`` does not
            exist. Nothing is created in the output tree in that case.
    """
    csv_root = Path(f"data/6_2_Remove_Pauses/{variant}")
    json_root = Path(f"data/7_2_Elaborations_Transcripts_From_Chunks/{variant}")
    output_root = Path(f"data/7_3_Combine_Chunks/{variant}")

    # Validate the input before touching the output tree, so a mistyped
    # variant does not leave an empty output directory behind.
    if not csv_root.is_dir():
        raise FileNotFoundError(f"Input directory {csv_root} not found.")

    output_root.mkdir(parents=True, exist_ok=True)

    def warn(message):
        """Show a warning in the cell output and record it via logging."""
        print(f"Warning: {message}")
        logging.warning(message)

    def combine_elaboration(csv_file, chunk_json_dir):
        """Build the combined transcript dict for the chunks listed in csv_file."""
        combined_data = {"text": "", "segments": [], "words": []}

        # Explicit UTF-8 keeps reading independent of the platform locale.
        with open(csv_file, mode="r", newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                chunk_stem = Path(row["chunk_name"]).stem
                start_offset = int(row["start"]) / 1000  # milliseconds -> seconds

                json_file = chunk_json_dir / f"{chunk_stem}_response.json"
                if not json_file.exists():
                    warn(f"JSON file {json_file} not found. Skipping.")
                    continue

                with open(json_file, "r", encoding="utf-8") as jf:
                    chunk_data = json.load(jf)

                # Chunk timestamps are relative to the chunk itself; shifting
                # both ends by the chunk start places them on the timeline
                # described by the CSV.
                for key in ("segments", "words"):
                    for item in chunk_data[key]:
                        item["start"] += start_offset
                        item["end"] += start_offset
                        combined_data[key].append(item)

                combined_data["text"] += chunk_data["text"] + " "

        return combined_data

    def process_respondent(respondent_id):
        """Combine and write every elaboration of one respondent."""
        respondent_output_path = output_root / respondent_id

        # Only directories are elaborations; sorting makes the processing
        # (and warning) order reproducible between runs.
        elaboration_dirs = sorted(
            path
            for path in (csv_root / respondent_id).glob("elaboration_*")
            if path.is_dir()
        )

        for elaboration_dir in elaboration_dirs:
            elaboration_id = elaboration_dir.name
            csv_file = elaboration_dir / f"{elaboration_id}_removed_silence_timestamps.csv"

            # Without the timestamps CSV the elaboration was presumably never
            # split into chunks; it is skipped here and must be handled manually.
            if not csv_file.exists():
                warn(f"CSV file {csv_file} not found. Assuming single chunk - handle manually.")
                continue

            combined_data = combine_elaboration(
                csv_file, json_root / respondent_id / elaboration_id
            )

            respondent_output_path.mkdir(parents=True, exist_ok=True)
            output_json_file = respondent_output_path / f"{elaboration_id}_response.json"
            output_txt_file = respondent_output_path / f"{elaboration_id}.txt"
            with open(output_json_file, "w", encoding="utf-8") as out_json:
                json.dump(combined_data, out_json, indent=4)
            with open(output_txt_file, "w", encoding="utf-8") as out_txt:
                out_txt.write(combined_data["text"])

    # Every sub-directory of the input root is treated as one respondent.
    for respondent_id in sorted(d.name for d in csv_root.iterdir() if d.is_dir()):
        process_respondent(respondent_id)

    print("Processing complete.")
    logging.info("Processing complete for variant %s.", variant)

In [None]:
# Combine the chunk transcripts of the "FG" variant.
combine_files(variant="FG")

In [None]:
# Combine the chunk transcripts of the "H" variant.
combine_files(variant="H")