In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files
# stored there. The google.colab package is only available in a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import pickle
from concurrent.futures import ThreadPoolExecutor
import itertools

# --- Configuration ---------------------------------------------------------
# Folder that holds the per-batch landmark pickles. The path is relative, so it
# resolves under the Drive mount (/content/drive) only when the working
# directory is /content.
file_path = 'drive/MyDrive/SP_cup/features/fake/'

# Input batches are numbered 1 through 16 inclusive.
pkl_files = list(f'landmarks_fake{i}.pkl' for i in range(1, 17))

# The merged result is written into the same folder as the inputs.
output_file = os.path.join(file_path, 'merged_facial_fake.pkl')

# Validate pickle files
def is_valid_pickle(file_path):
    """Report whether the file at ``file_path`` can be opened and unpickled.

    The first pickled object in the file is fully deserialized and then
    discarded, so the check costs as much as a real load of that object.

    Args:
        file_path: Path of the file to probe.

    Returns:
        True when ``pickle.load`` succeeds. False when opening or unpickling
        raises any ``Exception``; the error is printed rather than propagated.
    """
    try:
        with open(file_path, 'rb') as handle:
            pickle.load(handle)
    except Exception as e:
        # Deliberately broad: a missing file and a corrupt stream both just
        # mean "skip this input".
        print(f"Invalid pickle file: {file_path}, Error: {e}")
        return False
    else:
        return True

# Load a single pickle file in chunks
def load_pkl_chunked(file, chunk_size=1000):
    """Yield the contents of one pickle file as consecutive slices.

    The whole pickled object is loaded into memory first; chunking only bounds
    the size of each piece handed to the caller, not the peak memory of the
    load itself.

    Args:
        file: Path of the pickle file to read.
        chunk_size: Maximum number of items per yielded slice. Must be > 0.

    Yields:
        ``data[i:i + chunk_size]`` for ``i = 0, chunk_size, 2 * chunk_size, ...``
        where ``data`` is the unpickled object (it must support ``len`` and
        slicing).

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration. Previously a
            negative value silently yielded nothing.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(file, 'rb') as f:
        data = pickle.load(f)

    # The file is closed before the first yield, so a consumer that pauses or
    # abandons the generator does not keep the handle open.
    for start in range(0, len(data), chunk_size):
        yield data[start:start + chunk_size]

# Save data incrementally to avoid memory overflow
def safe_save(data, output_file, mode='ab'):
    """Pickle ``data`` into ``output_file``.

    With the default ``mode='ab'`` every call appends one more pickled record,
    so the file becomes a sequence of pickles that is read back with repeated
    ``pickle.load`` calls.

    Args:
        data: Object to serialize.
        output_file: Destination path.
        mode: Binary mode passed to ``open``; defaults to append.
    """
    with open(output_file, mode) as sink:
        pickle.Pickler(sink).dump(data)

# Incremental merging
def incremental_merge(pkl_files, file_path, output_file, chunk_size=1000):
    """Append the entries of several pickle files to one output file, resumably.

    Each input is read through ``load_pkl_chunked``. Every entry is tagged in
    place with ``entry['source_file'] = <input file name>`` (so entries must
    support item assignment), and each chunk is appended to ``output_file`` as
    its own pickled record via ``safe_save``. The output is therefore a
    sequence of pickles, read back with repeated ``pickle.load`` calls until
    ``EOFError``.

    Resuming: when ``output_file`` already exists it is scanned first.
      * Inputs are merged one after another, so only the source of the last
        entry in the output can be incomplete. That input is re-read and only
        the entries not yet present are appended. Every other input that
        already appears in the output is skipped.
      * A half-written trailing record (interrupted write) is cut off so that
        new records are appended after the last intact one.

    Args:
        pkl_files: Input file names, in the order they should be merged.
        file_path: Directory that contains the input files.
        output_file: Path of the merged output file.
        chunk_size: Number of entries per appended record.

    Returns:
        ``output_file``, unchanged.

    Raises:
        pickle.UnpicklingError: If ``output_file`` is non-empty but does not
            start with a readable pickle record. The file is left untouched.
    """
    # Number of entries already present in the output, per source file name.
    merged_counts = {}
    # Source of the most recent entry found in the output.
    last_source = None

    if os.path.exists(output_file):
        intact_bytes = 0  # offset just past the last record that loaded cleanly
        with open(output_file, 'rb') as f:
            while True:
                try:
                    data = pickle.load(f)
                except (EOFError, pickle.UnpicklingError):
                    # Clean end of file, or a record cut short by an
                    # interrupted write; the size check below tells which.
                    break
                intact_bytes = f.tell()
                for entry in data:
                    last_source = entry['source_file']
                    merged_counts[last_source] = merged_counts.get(last_source, 0) + 1

        leftover = os.path.getsize(output_file) - intact_bytes
        if leftover > 0:
            if intact_bytes == 0:
                # Nothing readable at all: this may not be our file, so do
                # not truncate or append to it.
                raise pickle.UnpicklingError(
                    f"{output_file} exists but contains no readable pickle record"
                )
            print(f"Discarding {leftover} unreadable trailing bytes from {output_file}")
            with open(output_file, 'r+b') as f:
                f.truncate(intact_bytes)
        print("Resuming from existing progress.")

    # load_pkl_chunked is a generator function, so dispatching it to a thread
    # pool would only create the generator there; the actual loading happens
    # wherever it is iterated. It is therefore called directly.
    for file in pkl_files:
        already_merged = merged_counts.get(file, 0)
        if already_merged and file != last_source:
            continue  # completed in an earlier run

        full_path = os.path.join(file_path, file)
        print(f"Processing: {file}")
        for data in load_pkl_chunked(full_path, chunk_size):
            if already_merged:
                # Drop the leading entries that an earlier run already wrote.
                drop = min(already_merged, len(data))
                already_merged -= drop
                data = data[drop:]
                if len(data) == 0:
                    continue
            for entry in data:
                entry['source_file'] = file
            safe_save(data, output_file)

    print("Merging complete.")
    return output_file

if __name__ == '__main__':
    # Drop inputs that cannot be opened and unpickled; each rejected file is
    # reported by is_valid_pickle.
    valid_files = list(filter(
        lambda name: is_valid_pickle(os.path.join(file_path, name)),
        pkl_files,
    ))

    # Merge the surviving inputs into the single output file.
    final_output = incremental_merge(valid_files, file_path, output_file)
    print(f"Data successfully merged into: {final_output}")


Processing: landmarks_fake1.pkl
Processing: landmarks_fake2.pkl
Processing: landmarks_fake3.pkl
Processing: landmarks_fake4.pkl
Processing: landmarks_fake5.pkl
Processing: landmarks_fake6.pkl
Processing: landmarks_fake7.pkl
Processing: landmarks_fake8.pkl
Processing: landmarks_fake9.pkl
Processing: landmarks_fake10.pkl
Processing: landmarks_fake11.pkl
Processing: landmarks_fake12.pkl
Processing: landmarks_fake13.pkl
Processing: landmarks_fake14.pkl
Processing: landmarks_fake15.pkl
Processing: landmarks_fake16.pkl
Merging complete.
Data successfully merged into: drive/MyDrive/SP_cup/features/fake/merged_facial_fake.pkl
