## This code uses the dates from the .json file (generated by running gen_bird_params_json.ipynb in the folder containing the bird's days of recorded data) to correct the years of the dates in the TweetyBERT .csv file (which assumes all recordings happened in 2024).

### NOTE: This code assumes that the .csv file contains the Bird ID (`USA` followed by four digits) in its file name, which it uses to find the corresponding .json file in the same directory. It also assumes that the bird's recordings span less than one year, so no calendar month appears in two different years (e.g. we don't have to worry about two Octobers' data).

In [11]:
import os
import re
import pandas as pd
import json
from datetime import datetime

# Function to extract the ID from the file path
def extract_id_from_path(file_path):
    """Return the first bird ID found in *file_path*, or None if there is none.

    A bird ID is the literal text ``USA`` immediately followed by four
    digits (for example ``USA5468``). The whole path string is searched, so
    an ID that appears in a directory name is found as well.
    """
    found = re.search(r'USA\d{4}', file_path)
    return found.group(0) if found else None

# Function to find the corresponding JSON file in the same directory as the CSV
def find_json_in_csv_directory(csv_file_path):
    """Locate the JSON file that sits next to a CSV file and shares its bird ID.

    Args:
        csv_file_path: Path to the CSV file. The bird ID is taken from this
            path by ``extract_id_from_path``.

    Returns:
        The directory of the CSV joined with the name of a ``.json`` file
        whose name contains the bird ID, or ``None`` when the path holds no
        bird ID or no such file exists. When several files match, the first
        one in sorted name order is returned.

    Raises:
        OSError: If the CSV's directory cannot be listed (for example, it
            does not exist).
    """
    id_number = extract_id_from_path(csv_file_path)
    if id_number is None:
        return None

    directory = os.path.dirname(csv_file_path)

    # dirname() is "" for a bare file name and os.listdir("") raises
    # FileNotFoundError, so list the current directory in that case.
    # os.listdir returns entries in arbitrary order; sorting makes the
    # choice reproducible when more than one JSON file matches.
    for filename in sorted(os.listdir(directory or os.curdir)):
        if filename.endswith(".json") and id_number in filename:
            # Join with the original directory so the returned path keeps
            # the same form (relative/absolute) as the CSV path.
            return os.path.join(directory, filename)

    return None

# Function to update the year based on the JSON file's creation dates
def update_years_in_csv(csv_file_path):
    """Rewrite the year of every ``date_time`` in a CSV using its JSON file.

    The JSON file is found with ``find_json_in_csv_directory``. Each CSV
    timestamp takes the year of the first JSON creation date that falls in
    the same calendar month; timestamps whose month has no creation date
    keep their year. The result is written next to the input as
    ``corrected_<original name>``.

    Args:
        csv_file_path: Path to a CSV file with a ``date_time`` column
            formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The path of the corrected CSV, or ``None`` when no matching JSON
        file was found (in which case nothing is written).

    Raises:
        KeyError: If the CSV has no ``date_time`` column or the JSON lacks
            the ``subdirectories`` / ``unique_file_creation_dates`` keys.
        ValueError: If a date string does not match its expected format.
    """
    json_file = find_json_in_csv_directory(csv_file_path)

    if not json_file:
        print("No matching JSON file found in the directory.")
        return None

    print(f"Found JSON file: {json_file}")

    df = pd.read_csv(csv_file_path)
    year_by_month = _load_year_by_month(json_file)

    # Timestamps that cannot take the new year (Feb 29 moved to a
    # non-leap year); collected so they can be reported once at the end.
    unshiftable = set()

    def update_year(row):
        current_date = datetime.strptime(row['date_time'], "%Y-%m-%d %H:%M:%S")
        target_year = year_by_month.get(current_date.month)
        if target_year is None:
            # No creation date in this month: keep the original timestamp.
            return current_date
        try:
            return current_date.replace(year=target_year)
        except ValueError:
            # replace() rejects Feb 29 in a non-leap year; leave the row
            # untouched instead of aborting the whole correction.
            unshiftable.add(row['date_time'])
            return current_date

    # There is nothing to correct in an empty table, so skip the row-wise
    # apply and just write the (empty) file out.
    if not df.empty:
        df['date_time'] = df.apply(update_year, axis=1)

    if unshiftable:
        print(
            f"Warning: {len(unshiftable)} distinct timestamp(s) fall on Feb 29 "
            "and could not be moved to a non-leap year; they were left unchanged."
        )

    corrected_csv_path = os.path.join(
        os.path.dirname(csv_file_path),
        'corrected_' + os.path.basename(csv_file_path),
    )
    df.to_csv(corrected_csv_path, index=False)

    print(f"Year values have been updated based on matching months. Corrected CSV saved to {corrected_csv_path}")
    return corrected_csv_path


def _load_year_by_month(json_file):
    """Map each month number in the JSON creation dates to its year."""
    with open(json_file, 'r', encoding='utf-8') as f:
        creation_data = json.load(f)

    year_by_month = {}
    ambiguous_months = set()
    for subdir in creation_data['subdirectories'].values():
        for date_text in subdir['unique_file_creation_dates']:
            creation_date = datetime.strptime(date_text, "%Y-%m-%d")
            # setdefault keeps the first year seen for a month, which is
            # the same result as scanning the dates in order for a match.
            known_year = year_by_month.setdefault(creation_date.month, creation_date.year)
            if known_year != creation_date.year:
                ambiguous_months.add(creation_date.month)

    if ambiguous_months:
        # The month-only match cannot tell two years apart.
        print(
            f"Warning: month(s) {sorted(ambiguous_months)} appear in more than one "
            "year in the JSON file; using the first year listed for each."
        )

    return year_by_month

# Set the path to your CSV file here. The path must contain the bird ID
# ('USA' followed by four digits), and a .json file whose name contains the
# same ID must be in the same directory as the CSV.
csv_file_path = "/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/2nd_set_of_song_databases/labels_USA5468_RC3_AreaXRose_database.csv"

# Call the function to update the CSV based on the corresponding JSON file.
# The corrected copy is written to the same directory as the original, with
# 'corrected_' prepended to the file name.
update_years_in_csv(csv_file_path)


Found JSON file: /Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/2nd_set_of_song_databases/USA5468_RC3_creation_data.json
Year values have been updated based on matching months. Corrected CSV saved to /Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/2nd_set_of_song_databases/corrected_labels_USA5468_RC3_AreaXRose_database.csv
