In [None]:
import os
import pandas as pd

# Read the position table that was previously exported from our database.
position_df = pd.read_csv('./position_data.csv')

# Create the folder that receives the PDB files (no-op when it already exists).
output_directory = "./pdb_file/"
os.makedirs(output_directory, exist_ok=True)

# Keep only the chain key and the three coordinate columns.
df = position_df.loc[:, ['sampleid', 'x', 'y', 'z']]

# One group per sampleid; each group is later written out as its own chain.
grouped = df.groupby(by="sampleid")

def write_chain_to_pdb(chain_id, data, output_directory):
    """
    Write one chain's coordinates to ``chain_<chain_id>.pdb`` as fixed-width ATOM records.

    One ATOM record is written per row of ``data`` (in row order), followed by a
    single ``TER`` line. A confirmation message is printed once the file is closed.

    Args:
        chain_id: Value used to build the output file name and the printed message.
            It is NOT written into the chain-identifier column (see below).
        data: DataFrame providing the columns 'x', 'y' and 'z'.
        output_directory: Directory in which the file is created; it must already exist.

    Format specification (1-indexed columns):
      1-6   : "ATOM  "
      7-11  : Atom serial number (5 characters, right-justified)
      12    : Single space
      13-16 : Atom name (centered, 4 characters)
      17    : AltLoc (blank)
      18-20 : Residue name (3 characters)
      21    : Single space
      22    : Chain identifier (always "1")
      23-26 : Residue sequence number (right-justified, 4 digits)
      27-30 : Four spaces
      31-38 : x coordinate (8.3f)
      39-46 : y coordinate (8.3f)
      47-54 : z coordinate (8.3f)
      55-60 : Occupancy (6.2f)
      61-66 : Temperature factor (6.2f)
      67-76 : Ten spaces
      77-78 : Element symbol (right-justified, 2 characters)

    Default values:
      - Atom name: "CA"
      - Residue name: "ALA"
      - Occupancy: 1.00
      - Temperature factor: 0.00
      - Residue sequence number: follows the atom serial number
      - Element: first letter of the atom name (i.e. "C" for "CA")

    Numbering wraps so that the fixed-width layout is never broken: the serial
    number is written modulo 100000 and the residue sequence number modulo
    10000. Without wrapping, the 10,000th atom of a chain would push every
    following column (including the coordinates) one character to the right.

    Coordinates are not range-checked: a value that needs more than 8
    characters in "8.3f" form will still widen its field.
    """
    atom_name = "CA"
    res_name = "ALA"
    alt_loc = ""                 # AltLoc remains blank (padded to one space)
    # The chain-identifier column is a fixed "1"; the chain is identified by the
    # file name instead. NOTE(review): presumably because sampleid values are not
    # guaranteed to fit in a single character -- confirm before changing.
    chain_label = str(1)
    occupancy = 1.00
    temp_factor = 0.00
    element = atom_name[0]       # "C" from "CA"

    # These parts are identical for every record, so format them only once.
    name_fields = f"{atom_name:^4s}{alt_loc:1s}{res_name:>3s} {chain_label:1s}"   # columns 13-22
    tail_fields = f"{occupancy:6.2f}{temp_factor:6.2f}{'':10s}{element:>2s}\n"    # columns 55-78

    filename = os.path.join(output_directory, f"chain_{chain_id}.pdb")
    with open(filename, 'w') as f:
        # Iterating the three columns directly avoids building a Series per row.
        coordinates = zip(data['x'], data['y'], data['z'])
        f.writelines(
            f"ATOM  "                      # Columns 1-6: record name
            f"{serial % 100000:5d}"        # Columns 7-11: atom serial number (wrapped)
            f" "                           # Column 12: single space
            f"{name_fields}"               # Columns 13-22: atom name, AltLoc, residue name, chain
            f"{serial % 10000:4d}"         # Columns 23-26: residue sequence number (wrapped)
            f"    "                        # Columns 27-30: four spaces
            f"{x:8.3f}{y:8.3f}{z:8.3f}"    # Columns 31-54: x, y, z coordinates
            f"{tail_fields}"               # Columns 55-78: occupancy, B-factor, padding, element
            for serial, (x, y, z) in enumerate(coordinates, start=1)
        )

        # Write a termination record for the chain.
        f.write("TER\n")
    print(f"Wrote PDB file for chain {chain_id}: {filename}")

# Emit one PDB file per sampleid group; every group is treated as one chain.
for chain_id, group_data in grouped:
    write_chain_to_pdb(
        chain_id=chain_id,
        data=group_data,
        output_directory=output_directory,
    )