In [1]:
import os
import glob

In [2]:
with open('data/1OLG.pdb', 'r') as f:
    f_str = f.read()

`open` is a built-in function; it opens the file.

The input is a string indicating the directory, and `'r'` says to **read** the file.

`f` says to refer to the open file as `f` in the context of the open file object.

This is called context management. If you open a file outside of context management, you need to explicitly close it, but with context management, if you get an error, it will close the file before throwing the error.

In [3]:
f_str[:1000]

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \nTITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \nTITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \nCOMPND    MOL_ID: 1;                                                            \nCOMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \nCOMPND   3 CHAIN: A, B, C, D;                                                   \nCOMPND   4 ENGINEERED: YES                                                      \nSOURCE    MOL_ID: 1;                                                            \nSOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \nSOURCE   3 ORGANISM_COMMON: HUMAN;                                              \nSOURCE   4 ORGANISM_TAXID: 9606                                                 \nKEYWDS    ANTI-ONCOGENE                                                         \nEXPDTA    SOLUT

In [6]:
# Read the file in as a list, instead

with open('data/1OLG.pdb', 'r') as f:
    f_list = f.readlines()
    
# Look at the list (first ten entries)
f_list[:10]

['HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \n',
 'TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \n',
 'TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \n',
 'COMPND    MOL_ID: 1;                                                            \n',
 'COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \n',
 'COMPND   3 CHAIN: A, B, C, D;                                                   \n',
 'COMPND   4 ENGINEERED: YES                                                      \n',
 'SOURCE    MOL_ID: 1;                                                            \n',
 'SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \n',
 'SOURCE   3 ORGANISM_COMMON: HUMAN;                                              \n']

In [11]:
# Remove the ending whitespace and \n newline character using string methods

f_list[0].rstrip()

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG'

In [10]:
# Read the file in line by line

with open('data/1OLG.pdb', 'r') as f:
    for i, line in enumerate (f): # A file object is iterable!
        print(line.rstrip())
        if i >= 10:
            break # Only read the first 10 lines)

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;
SOURCE   4 ORGANISM_TAXID: 9606


In [12]:
# Another way

with open('data/1OLG.pdb', 'r') as f:
    i = 0
    while i < 10:
        print(f.readline().rstrip()) # Different from .readlines--only reads one line. Each time you call it it reads the next line.
        i += 1

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;


`'.readline'` is like a VHS tape. To go back, you need to "rewind": `f.seek(0)`

Before writing a file, we should check if the file already exists. ***Otherwise, we will overwrite it!!!***

In [13]:
os.path.isfile('data/1OLG.pdb')

True

In [17]:
# Check if file exists before opening it to write to it

if os.path.isfile('mastery.txt'):
    raise RuntimeError('File mastery.txt already exists.')
else:
    with open('mastery.txt', 'w') as f:
        f.write('This is my file.\n')
        f.write('There are many like it, but this one is mine.\n')
        f.write('I must master my file like I must master my life.\n')

In [18]:
!cat mastery.txt

This is my file.
There are many like it, but this one is mine.
I must master my file like I must master my life.


Note: we need to be explicit and add newline characters when writing a file.

In [19]:
with open('gimme_phi.txt', 'w') as f:
    f.write('The golden ratio is φ = ')
    f.write(1.61803398875)

TypeError: write() argument must be str, not float

In [25]:
with open('gimme_phi.txt', 'w') as f:
    f.write('The golden ratio is φ = ')
    f.write('{phi:.8f}'.format(phi = 1.61803398875))

In [26]:
!cat gimme_phi.txt

The golden ratio is φ = 1.61803399

Let's look at the atomic positions of the atoms in chain A. Column 21 in an atom entry is the chain (according to the PDB specs).

In [31]:
with open('data/1OLG.pdb', 'r') as f, open ('atoms_chain_A.txt', 'w') as f_out:
    for line in f:
        if len(line) > 21 and line[:4] == 'ATOM' and line[21] == 'A':
            f_out.write(line) # newline characters are already in the .pdb file, don't need to add or strip

In [34]:
# Check out the file we wrote

!head atoms_chain_A.txt

ATOM      1  N   LYS A 319      18.634  25.437  10.685  1.00  4.81           N  
ATOM      2  CA  LYS A 319      17.984  25.295   9.354  1.00  4.32           C  
ATOM      3  C   LYS A 319      18.160  23.876   8.818  1.00  3.74           C  
ATOM      4  O   LYS A 319      19.259  23.441   8.537  1.00  3.67           O  
ATOM      5  CB  LYS A 319      18.609  26.282   8.371  1.00  4.67           C  
ATOM      6  CG  LYS A 319      18.003  26.056   6.986  1.00  5.15           C  
ATOM      7  CD  LYS A 319      16.476  26.057   7.091  1.00  5.90           C  
ATOM      8  CE  LYS A 319      16.014  27.341   7.784  1.00  6.51           C  
ATOM      9  NZ  LYS A 319      16.388  28.518   6.952  1.00  7.33           N  
ATOM     10  H1  LYS A 319      18.414  24.606  11.281  1.00  5.09           H  


# Working with multiple files:

In [35]:
file_list = glob.glob('data/*.pdb') # glob.glob is a function that does pattern matching

In [36]:
file_list

['data/1OLG.pdb', 'data/1J6Z.pdb', 'data/1FAG.pdb', 'data/2ERK.pdb']

## Let's get the sequences of the proteins

It might be nice to store the sequences in a **dictionary** with the PDB IDs as the keys.

In [37]:
file_list = glob.glob('data/*.pdb')

seqs = {}

for file_name in file_list:
    pdb_id = file_name[file_name.find('/')+1:file_name.rfind('.')]
    
    # Initialize sequence string
    seq = ''
    
    with open(file_name, 'r') as f:
        for line in f:
            if len(line) > 11 and line[:6] == 'SEQRES' and line[11] == 'A':
                seq += line[19:].rstrip() + ' '
                
    # Splits the string into a list wherever there is whitespace (the default if no argument), then joins with '-'
    seq = '-'.join(seq.split()) 
    
    seqs[pdb_id] = seq

In [39]:
seqs['1J6Z'] # Actin

'ASP-GLU-ASP-GLU-THR-THR-ALA-LEU-VAL-CYS-ASP-ASN-GLY-SER-GLY-LEU-VAL-LYS-ALA-GLY-PHE-ALA-GLY-ASP-ASP-ALA-PRO-ARG-ALA-VAL-PHE-PRO-SER-ILE-VAL-GLY-ARG-PRO-ARG-HIS-GLN-GLY-VAL-MET-VAL-GLY-MET-GLY-GLN-LYS-ASP-SER-TYR-VAL-GLY-ASP-GLU-ALA-GLN-SER-LYS-ARG-GLY-ILE-LEU-THR-LEU-LYS-TYR-PRO-ILE-GLU-HIC-GLY-ILE-ILE-THR-ASN-TRP-ASP-ASP-MET-GLU-LYS-ILE-TRP-HIS-HIS-THR-PHE-TYR-ASN-GLU-LEU-ARG-VAL-ALA-PRO-GLU-GLU-HIS-PRO-THR-LEU-LEU-THR-GLU-ALA-PRO-LEU-ASN-PRO-LYS-ALA-ASN-ARG-GLU-LYS-MET-THR-GLN-ILE-MET-PHE-GLU-THR-PHE-ASN-VAL-PRO-ALA-MET-TYR-VAL-ALA-ILE-GLN-ALA-VAL-LEU-SER-LEU-TYR-ALA-SER-GLY-ARG-THR-THR-GLY-ILE-VAL-LEU-ASP-SER-GLY-ASP-GLY-VAL-THR-HIS-ASN-VAL-PRO-ILE-TYR-GLU-GLY-TYR-ALA-LEU-PRO-HIS-ALA-ILE-MET-ARG-LEU-ASP-LEU-ALA-GLY-ARG-ASP-LEU-THR-ASP-TYR-LEU-MET-LYS-ILE-LEU-THR-GLU-ARG-GLY-TYR-SER-PHE-VAL-THR-THR-ALA-GLU-ARG-GLU-ILE-VAL-ARG-ASP-ILE-LYS-GLU-LYS-LEU-CYS-TYR-VAL-ALA-LEU-ASP-PHE-GLU-ASN-GLU-MET-ALA-THR-ALA-ALA-SER-SER-SER-SER-LEU-GLU-LYS-SER-TYR-GLU-LEU-PRO-ASP-GLY-GLN-VAL-ILE-THR-ILE

In [40]:
%load_ext watermark
%watermark -v -p jupyterlab

CPython 3.7.7
IPython 7.13.0

jupyterlab 1.2.6
