In [4]:
from pymol import cmd
import os
import re
import shutil
import gemmi

In [18]:
### This code copies every predicted .cif file into a single folder, renaming each file with its target ID (mac-x###) as a prefix ###

In [4]:
# Root folder holding one Chai output subfolder per target, named like "mac-x3288.out"
base_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/Chai_output"

# Single flat folder that receives a renamed copy of every .cif file
output_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/CIFS_in_one"

# Make sure the destination exists before anything is copied into it
os.makedirs(output_dir, exist_ok=True)

# Captures the run of digits in folder names such as "mac-x3288.out" (case-insensitive)
FOLDER_PATTERN = re.compile(r"mac-x(\d+)\.out$", re.IGNORECASE)

# Fail early when the source folder is missing
if not os.path.exists(base_dir):
    raise ValueError(f"The base directory '{base_dir}' does not exist.")

# Keep only those entries of base_dir that are directories
base_entries = os.listdir(base_dir)
subfolders = [
    entry_name
    for entry_name in base_entries
    if os.path.isdir(os.path.join(base_dir, entry_name))
]

for folder_name in subfolders:
    # Folders that are not named "mac-x###.out" are reported and ignored
    match = FOLDER_PATTERN.match(folder_name)
    if match is None:
        print(f"[WARNING] Folder '{folder_name}' does not match pattern. Skipping.")
        continue

    numeric_id = match.group(1)  # e.g., "3288"
    folder_path = os.path.join(base_dir, folder_name)

    # Every file in this folder whose extension is .cif (any letter case)
    cif_files = [
        entry_name
        for entry_name in os.listdir(folder_path)
        if entry_name.lower().endswith(".cif")
    ]

    for cif_file in cif_files:
        # Prefix the original stem with the target ID so names stay unique in the flat folder,
        # e.g. "mac-x3288_originalfilename.cif"
        base_name = os.path.splitext(cif_file)[0]
        new_cif_name = "mac-x" + numeric_id + "_" + base_name + ".cif"

        cif_path = os.path.join(folder_path, cif_file)
        new_cif_path = os.path.join(output_dir, new_cif_name)

        # copy2 leaves the source file in place (the files are copied, not moved)
        shutil.copy2(cif_path, new_cif_path)

print(f"[INFO] All .cif files have been moved to: {output_dir}")

[INFO] All .cif files have been moved to: /Users/JB/Rotation_bkslab/250115_chaifold/CIFS_in_one


In [None]:
### Next, the .cif files are sorted into subfolders according to their prediction (model) index, idx_0 to idx_4 ###

In [15]:
import os
import re
import shutil  # For file copying/moving

# Flat folder that holds all renamed .cif files
input_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/CIFS_in_one"

# Root folder that will hold one subfolder per model index
output_base_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/Full_data/Organized_CIFS"

# Create subfolders for idx_0 to idx_4
indices = range(5)  # idx_0 to idx_4
output_dirs = {f"idx_{idx}": os.path.join(output_base_dir, f"idx_{idx}") for idx in indices}
for path in output_dirs.values():
    os.makedirs(path, exist_ok=True)

# Read the complete number that follows "idx_". A plain substring test such as
# `"idx_1" in name` would also accept "idx_10" and file it under idx_1.
IDX_PATTERN = re.compile(r"idx_(\d+)")

# Sorted so that the processing order (and any warnings) is the same on every run
for cif_file in sorted(os.listdir(input_dir)):
    if not cif_file.lower().endswith(".cif"):
        continue

    idx_match = IDX_PATTERN.search(cif_file)
    idx_key = f"idx_{idx_match.group(1)}" if idx_match else None

    # Report files that carry no index tag, or one outside idx_0..idx_4,
    # instead of dropping them silently
    if idx_key not in output_dirs:
        print(f"[WARNING] '{cif_file}' has no recognised idx_# tag. Skipping.")
        continue

    source_path = os.path.join(input_dir, cif_file)
    destination_path = os.path.join(output_dirs[idx_key], cif_file)

    # Copy (not move) the file into the matching idx_# subfolder
    shutil.copy2(source_path, destination_path)

print(f"[INFO] All files have been organized into {output_base_dir}")

[INFO] All files have been organized into /Users/JB/Rotation_bkslab/250115_chaifold/Full_data/Organized_CIFS


In [None]:
### Convert all cif to pdb without breaking the file format ###

In [21]:
def convert_cif_to_pdb(input_dir, output_dir):
    """
    Convert every .cif file under input_dir (searched recursively) to PDB format.

    The subdirectory layout of input_dir is recreated under output_dir. Each output
    file is named 'mac-x###_pred_chain.pdb', where ### is the run of digits that
    follows 'mac-x' in the input filename. The extension test is case-insensitive,
    matching the other cells of this notebook.

    Because the output name depends only on the mac-x ID, two inputs with the same
    ID in the same folder map to the same output file; the later one overwrites the
    earlier one and a warning is printed.

    Args:
        input_dir: Root directory that is walked for .cif files.
        output_dir: Root directory that receives the .pdb files.

    Returns:
        None. Files that cannot be named, read or written are reported with
        print() and skipped; no exception is raised for them.
    """
    for root, _, files in os.walk(input_dir):  # Recursively traverse input_dir
        # Mirror the current subdirectory under output_dir; normpath drops the
        # trailing "/." produced for the root directory itself
        relative_path = os.path.relpath(root, input_dir)
        output_subdir = os.path.normpath(os.path.join(output_dir, relative_path))
        os.makedirs(output_subdir, exist_ok=True)

        # Output paths already written from this folder, used to detect name collisions
        written_paths = set()

        # Sorted so that the processing order is the same on every run
        for file in sorted(files):
            if not file.lower().endswith(".cif"):
                continue

            input_path = os.path.join(root, file)

            # Pull the ID out of names like "mac-x5071_pred.model_idx_0.cif"
            match = re.search(r"mac-x(\d+)", file)
            if not match:
                print(f"Skipping file with unexpected name format: {file}")
                continue

            pdb_file_name = f"mac-x{match.group(1)}_pred_chain.pdb"
            output_path = os.path.join(output_subdir, pdb_file_name)

            # Read the CIF file
            try:
                structure = gemmi.read_structure(input_path)
            except Exception as e:
                print(f"Error reading CIF file {input_path}: {e}")
                continue

            if output_path in written_paths:
                print(f"[WARNING] {input_path} overwrites an earlier conversion at {output_path}")

            # Write the structure to a PDB file with the new naming format
            try:
                structure.write_pdb(output_path)
            except Exception as e:
                print(f"Error writing PDB file for {input_path}: {e}")
                continue

            written_paths.add(output_path)

# Source tree of organized .cif files and destination tree for the converted .pdb files
input_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/Full_data/Organized_CIFS"
output_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/Full_data/Organized_PDB"

# Run the conversion: the folder layout of input_dir is mirrored under output_dir
# and every output file is renamed by the function
convert_cif_to_pdb(input_dir=input_dir, output_dir=output_dir)

In [25]:
### This code removes the residues that should not be included in the PDB files, then renumbers the remaining residues so that the predicted files match the expected format ###

In [24]:
def remove_residues(file_path, residue_ranges=((1, 3), (171, 173)), keep_markers=("LIG",)):
    """
    Remove selected residues from a PDB file, overwriting the file in place.

    Only coordinate records (ATOM, HETATM, ANISOU, TER) are examined. All other
    lines (HEADER, TITLE, REMARK, END, ...) are always kept, even when their
    columns 23-26 happen to contain a number.

    Args:
        file_path: Path of the PDB file to modify.
        residue_ranges: Iterable of (first, last) residue-number pairs, both
            inclusive. Records whose residue number falls in any pair are
            removed. Defaults to residues 1-3 and 171-173.
        keep_markers: Substrings that protect a record from removal when found
            anywhere in the line. The default "LIG" also covers "LIG2".

    Returns:
        None.

    Notes:
        - A TER record is dropped together with its residue when that residue
          number falls in a removed range.
        - Selection is by the residue number currently in the file, so calling
          this again after the file has been renumbered removes different residues.
    """
    # Record types that carry a residue sequence number in columns 23-26
    residue_records = ("ATOM", "HETATM", "ANISOU", "TER")

    with open(file_path, 'r') as file:
        lines = file.readlines()

    filtered_data = []

    for line in lines:
        # Non-coordinate lines are never filtered
        if not line.startswith(residue_records):
            filtered_data.append(line)
            continue

        # Residue number sits in fixed-width columns 23-26; keep the line when
        # the field is blank or malformed (e.g. a bare "TER")
        try:
            residue_number = int(line[22:26].strip())
        except ValueError:
            filtered_data.append(line)
            continue

        # Protected records (the ligand by default) are kept whatever their number
        if any(marker in line for marker in keep_markers):
            filtered_data.append(line)
            continue

        # Drop records whose residue number lies in one of the excluded ranges
        if any(first <= residue_number <= last for first, last in residue_ranges):
            continue

        filtered_data.append(line)

    # Overwrite the same file with the filtered content
    with open(file_path, 'w') as output_file:
        output_file.writelines(filtered_data)

# Root folder that holds the per-model subfolders idx_0 ... idx_4
base_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/Full_data/Organized_PDB"

# Visit each idx_# subfolder and strip the unwanted residues from every PDB file in it
for idx in range(5):  # idx_0 to idx_4
    input_dir = os.path.join(base_dir, f"idx_{idx}")

    if os.path.exists(input_dir):
        for file_name in os.listdir(input_dir):
            # Only .pdb files are edited; anything else in the folder is left alone
            if not file_name.endswith(".pdb"):
                continue
            file_path = os.path.join(input_dir, file_name)
            remove_residues(file_path)
    else:
        # A missing subfolder is reported and does not stop the run
        print(f"Skipping missing directory: {input_dir}")

print("[DONE] All PDB files processed.")


def renumber_residues(input_file, new_start=3):
    """
    Shift all residue numbers of a PDB file by a constant offset, in place.

    The offset is chosen so that the first numbered coordinate record in the
    file receives residue number new_start; every later record is shifted by
    the same amount, so gaps and relative numbering are preserved.

    ATOM, HETATM, ANISOU and TER records are renumbered, since all of them carry
    the residue number in columns 23-26. Leaving TER untouched would make it
    point at the old number of the chain's last residue.

    Args:
        input_file: Path of the PDB file to modify.
        new_start: Residue number given to the first numbered record. Defaults to 3.

    Returns:
        None.

    Notes:
        The new content is assembled in memory and written only after every line
        has been processed, so an error while processing leaves the file untouched.
        Numbers needing more than four characters are not handled specially and
        would widen the line.
    """
    # Record types that carry a residue sequence number in columns 23-26
    numbered_records = ("ATOM", "HETATM", "ANISOU", "TER")

    with open(input_file, "r") as infile:
        lines = infile.readlines()

    resi_offset = None  # Determined from the first numbered record
    renumbered_lines = []

    for line in lines:
        if not line.startswith(numbered_records):
            renumbered_lines.append(line)  # Non-coordinate lines pass through unchanged
            continue

        try:
            resi = int(line[22:26].strip())  # Extract residue number
        except ValueError:
            # Blank or malformed residue field (e.g. a bare "TER"): keep unchanged
            renumbered_lines.append(line)
            continue

        if resi_offset is None:
            resi_offset = new_start - resi  # Calculate offset from first residue

        new_resi = resi + resi_offset

        # Rebuild the line with the number right-aligned in its 4-column field
        renumbered_lines.append(f"{line[:22]}{new_resi:4}{line[26:]}")

    with open(input_file, "w") as outfile:  # Overwrite the same file
        outfile.writelines(renumbered_lines)

# Root folder that holds the per-model subfolders idx_0 ... idx_4
base_dir = "/Users/JB/Rotation_bkslab/250115_chaifold/Full_data/Organized_PDB"

# Renumber every PDB file found in each idx_# subfolder
for idx in range(5):  # idx_0 to idx_4
    input_dir = os.path.join(base_dir, f"idx_{idx}")

    # Report and skip subfolders that do not exist
    if not os.path.exists(input_dir):
        print(f"Skipping missing directory: {input_dir}")
    else:
        for file_name in os.listdir(input_dir):
            if file_name.endswith(".pdb"):
                # Each file is rewritten in place by renumber_residues
                file_path = os.path.join(input_dir, file_name)
                renumber_residues(file_path)

print("[DONE] All PDB files renumbered.")

[DONE] All PDB files processed.
[DONE] All PDB files renumbered.


In [None]:
### TODO: Write code to extract the B-factor column, which holds the residue-specific confidence level in Chai cofolding ###