In [1]:
import os
import glob
from Bio.PDB import MMCIFParser
import pandas as pd
import numpy as np

##### CIF ATOM record structure
```
ATOM 1     N N   . MET A 1 1    ? -8.328  3.708   1.145   1.00 27.91 1    A 1
     │     │ │   │ │   │ │ │    │ │       │       │       │    │    │    │ │
     │     │ │   │ │   │ │ │    │ └───────┴───────┴───── coordinates (x, y, z)
     │     │ │   │ │   │ │ │    │                         
     │     │ │   │ │   │ │ │    └─ insertion code 
     │     │ │   │ │   │ │ └────── label_seq_id 
     │     │ │   │ │   │ └──────── label_entity_id
     │     │ │   │ │   └────────── label_asym_id 
     │     │ │   │ └────────────── label_comp_id 
     │     │ │   └──────────────── alt_loc (.)
     │     │ └──────────────────── label_atom_id 
     │     └────────────────────── type_symbol 
     └──────────────────────────── atom serial number 
                                    
                                    1.00 = Occupancy
                                    27.91 = B-factor (pLDDT in AlphaFold)
```

In [2]:
# Parse the mmCIF model and flatten it into one table row per atom.
CIFname = "test_model.cif"
parser = MMCIFParser()
structure = parser.get_structure('protein', CIFname)

# NOTE(review): the 'pDDLT' key looks like a transposition of 'pLDDT'; it is
# kept as-is because other cells may select the column by this exact name.
data = []
for atom in structure.get_atoms():
    # Walk up the hierarchy: atom -> residue -> chain.
    residue = atom.get_parent()
    chain = residue.get_parent()
    # The residue id is a 3-tuple; the middle element is the sequence number.
    _, seq_number, _ = residue.get_id()
    row = dict(
        atom_name=atom.get_name(),
        element=atom.element,
        pDDLT=atom.get_bfactor(),
        residue_name=residue.get_resname(),
        residue_number=seq_number,
        chain=chain.get_id(),
    )
    data.append(row)

df = pd.DataFrame(data)
df

Unnamed: 0,atom_name,element,pDDLT,residue_name,residue_number,chain
0,N,N,27.94,MET,1,A
1,CA,C,31.14,MET,1,A
2,C,C,30.55,MET,1,A
3,O,O,28.87,MET,1,A
4,CB,C,30.31,MET,1,A
...,...,...,...,...,...,...
18068,CG,C,7.31,GLN,2261,A
18069,CD,C,8.35,GLN,2261,A
18070,OE1,O,11.56,GLN,2261,A
18071,NE2,N,15.38,GLN,2261,A


In [3]:
def calculate_stats(bfactors, label, threshold=50):
    """Summarise a collection of per-atom B-factor values.

    Parameters
    ----------
    bfactors : sequence of float
        One B-factor value per atom. May be empty.
    label : str
        Stored under ``'Atom Type'`` to identify the resulting row.
    threshold : int or float, default 50
        Values strictly below this cut-off are counted. The cut-off is also
        embedded in the count/ratio keys, so the default produces
        ``'BF < 50 Count'`` and ``'BF < 50 Ratio (%)'``.

    Returns
    -------
    dict
        Keys ``'Atom Type'``, ``'Total Count'``, ``'Mean'``, ``'Median'``,
        ``'BF < <threshold> Count'`` and ``'BF < <threshold> Ratio (%)'``.
        For empty input the mean and median are NaN and the ratio is 0.
    """
    values = np.asarray(bfactors, dtype=float)
    total = int(values.size)
    below = int(np.count_nonzero(values < threshold))

    if total > 0:
        mean = np.mean(values)
        median = np.median(values)
        below_ratio = below / total * 100
    else:
        # np.mean / np.median emit RuntimeWarnings on empty input; return the
        # same NaN result without the noise and keep the ratio at 0.
        mean = median = float('nan')
        below_ratio = 0

    # ':g' drops a trailing '.0', so 50 and 50.0 both label the keys as '50'.
    cutoff = f"{threshold:g}"
    return {
        'Atom Type': label,
        'Total Count': total,
        'Mean': mean,
        'Median': median,
        f'BF < {cutoff} Count': below,
        f'BF < {cutoff} Ratio (%)': below_ratio
    }


# Build a two-row summary (all atoms vs. CA atoms) of the model's B-factors,
# which the column labels below present as pLDDT.

# Parse the model here as well so this cell can be run on its own.
CIFname = "test_model.cif"
parser = MMCIFParser()
structure = parser.get_structure('protein', CIFname)

# Walk the atoms once and reuse the list for both selections.
atoms = list(structure.get_atoms())
all_bfactors = [atom.get_bfactor() for atom in atoms]
# NOTE(review): selecting on the atom name alone would also pick up any atom
# that happens to be named 'CA' in a non-protein residue — confirm the input
# only contains protein residues.
ca_bfactors = [atom.get_bfactor() for atom in atoms if atom.get_name() == 'CA']

# Display labels for the statistic columns.
pretty_names = {
    'Mean': 'pLDDT(Mean)',
    'Median': 'pLDDT(Median)',
    'BF < 50 Count': 'pLDDT < 50 Count',
    'BF < 50 Ratio (%)': 'pLDDT < 50 Ratio (%)',
}

df_stats = (
    pd.DataFrame([
        calculate_stats(all_bfactors, 'All Atoms'),
        calculate_stats(ca_bfactors, 'CA Atoms'),
    ])
    .assign(name="test")
    # Round the ratio column itself: previously the rounded values were written
    # to a separate, misspelled column that the column selection then dropped,
    # so the displayed ratio was never rounded.
    .round({'Mean': 2, 'Median': 2, 'BF < 50 Ratio (%)': 2})
    .loc[:, ['name', 'Atom Type', 'Total Count', 'Mean', 'Median',
             'BF < 50 Count', 'BF < 50 Ratio (%)']]
    .rename(columns=pretty_names)
)
df_stats

Unnamed: 0,name,Atom Type,Total Count,pLDDT(Mean),pLDDT(Median),pLDDT < 50 Count,pLDDT < 50 Ratio (%)
0,test,All Atoms,18073,63.06,75.51,6243,34.543241
1,test,CA Atoms,2261,66.51,83.74,770,34.055728
