# This notebook cleans up your OpenAI file storage: it finds files that share the same filename and then, optionally (final cell), deletes every duplicate except the newest copy

In [None]:
from openai import OpenAI
import time
# Initialize the OpenAI client
import os
import pandas as pd
# Create a single OpenAI client for all later cells. When no api_key argument
# is given, the SDK reads the key from the OPENAI_API_KEY environment variable,
# so constructing the client twice (once explicitly, once implicitly) was
# redundant.
client = OpenAI()


# Find all duplicated filenames and count them

In [None]:
# List all files. Iterating over the returned page object (instead of reading
# only `response.data`) lets the SDK fetch any following pages, so files beyond
# the first page are not silently left out of the duplicate analysis.
response = client.files.list()
all_files = list(response)

# Group every file object under its filename: {filename: [file, file, ...]}
all_files_dict = {}
for file in all_files:
    all_files_dict.setdefault(file.filename, []).append(file)

# Keep only the filenames that occur more than once
duplicates = {filename: files for filename, files in all_files_dict.items() if len(files) > 1}

# Displaying the count of duplicated filenames
print(f"Count of duplicated filenames: {len(duplicates)}")

# Displaying the total count of files (across all pages)
print(f"Total count of files: {len(all_files)}")

# Display a DataFrame of the duplicated files: how many copies each filename has, plus the creation timestamps of its oldest and newest copies

In [None]:

# Column layout of the summary table. Passing it explicitly to the DataFrame
# constructor guarantees the 'Count' column exists even when there are no
# duplicates, so the sort below cannot fail with a KeyError on an empty table.
summary_columns = ['Filename', 'Count', 'Oldest Created At', 'Newest Created At']

# Build one summary row per duplicated filename
data_for_df = []
for filename, files in duplicates.items():
    created_times = [f.created_at for f in files]
    data_for_df.append({
        'Filename': filename,
        'Count': len(files),
        # created_at is shown exactly as returned by the API
        # (presumably a Unix timestamp in seconds -- TODO confirm)
        'Oldest Created At': min(created_times),
        'Newest Created At': max(created_times),
    })

# Create DataFrame, most-duplicated filenames first
df = pd.DataFrame(data_for_df, columns=summary_columns)
df.sort_values(by='Count', ascending=False, inplace=True)

df


# If everything looks right, run the next cell: it deletes every file with a duplicated name except the newest copy of each

In [None]:

# Remove every duplicate except the most recently created copy of each filename
for filename, files in duplicates.items():
    # Order the copies oldest -> newest in place; the last entry is the keeper
    files.sort(key=lambda entry: entry.created_at)
    older_copies = files[:-1]
    for stale in older_copies:
        client.files.delete(stale.id)
        print(f"Deleted older file {stale.filename} with ID {stale.id}")

print("Duplicate file analysis completed.")