# 생물정보학 및 실습 1 - Term Project (Free Analysis)
생물정보학 및 실습 1   
서울대학교 협동과정 생물정보학전공 2022년 1학기

In [178]:
from collections import Counter, defaultdict
import math
import os
import sys
import time

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
import pysam

In [155]:
# Input locations, given relative to the notebook's working directory.
# CLIP_BAM_PATH: alignment (BAM) file that the pileup step reads.
CLIP_BAM_PATH = '../data/binfo1-datapack1/CLIP-35L33G.bam'
# FASTA_DIR_PATH: directory holding the extracted reference FASTA files.
FASTA_DIR_PATH = '../data/chromFa'

## 1. Prepare Reference FASTAs
+ `mm39.chromFa.tar.gz` was downloaded from the UCSC Genome Browser
+ FASTAs were extracted into `FASTA_DIR_PATH` with `tar -zxvf`

In [156]:
# Build dctFastaMatch = {contig ID : FASTA file name inside FASTA_DIR_PATH}.
#   'chr1.fa'                   -> key 'chr1'
#   'chr1_GL456210v1_random.fa' -> key 'GL456210.1'  (accession, 'v' -> '.')
# (The original note describes the keys as GENCODE chromosome IDs.)
# Only '*.fa' entries are kept, so stray files in the directory (the source
# archive, a '.fa.fai' index, ...) cannot turn into bogus contig IDs.
lstRefSeqFa = [fa for fa in os.listdir(FASTA_DIR_PATH) if fa.endswith('.fa')]
dctFastaMatch = {}
for fa in lstRefSeqFa:
    # Strip the extension only, not every '.fa' substring in the name.
    chrId = fa[:-len('.fa')]
    if '_' not in chrId:
        # Plain chromosome file: the file stem is the contig ID itself.
        dctFastaMatch[chrId] = fa
    else:
        # Scaffold file: the second '_'-separated field is the accession.
        dctFastaMatch[chrId.split('_')[1].replace('v', '.')] = fa

In [157]:
# Validation: a scaffold accession key should map to its FASTA file name
dctFastaMatch['GL456210.1']

'chr1_GL456210v1_random.fa'

## 2. Collect Putative Binding Sequences

In [158]:
def get_chr_binding_positions(chrId, min_depth=50, min_entropy=0.8):
    """Return pileup positions on ``chrId`` with deep, heterogeneous coverage.

    Every pileup column of ``CLIP_BAM_PATH`` on the given contig is scanned.
    A column is kept when it has at least ``min_depth`` query bases and the
    Shannon entropy (base 2) of its base counts is at least ``min_entropy``.

    Args:
        chrId: Contig name passed to ``AlignmentFile.pileup``.
        min_depth: Minimum number of query bases in a column (default 50).
        min_entropy: Minimum entropy, in bits, of the column's base counts
            (default 0.8).

    Returns:
        list of ``reference_pos`` values (0-based in pysam) for the kept
        columns, in pileup order.
    """
    lstBindingPositions = []
    # The context manager closes the BAM handle once the pileup is consumed,
    # including when iteration raises.
    with pysam.AlignmentFile(CLIP_BAM_PATH) as bam:
        for col in bam.pileup(chrId):
            # NOTE(review): pysam may report reverse-strand bases in lower
            # case and deletions as empty strings; if so they are counted as
            # separate symbols in the entropy below - confirm this is wanted.
            bases = col.get_query_sequences()
            if len(bases) < min_depth:
                continue
            cres = stats.entropy(list(Counter(bases).values()), base=2)
            if cres >= min_entropy:
                lstBindingPositions.append(col.reference_pos)
    return lstBindingPositions

In [177]:
def get_binding_sequences(chrId):
    """Count the reference hexamers around each binding position of ``chrId``.

    For every position returned by ``get_chr_binding_positions(chrId)`` the
    six reference bases ``seq[pos-2:pos+4]`` (two before the position, the
    position itself, three after) are read from the contig's FASTA file and
    tallied.

    Args:
        chrId: Contig ID; must be a key of ``dctFastaMatch``.

    Returns:
        collections.Counter mapping upper-case hexamer -> occurrence count.
        Positions whose window would run past either end of the sequence are
        skipped, so every key has exactly six characters.

    Raises:
        KeyError: ``chrId`` is not in ``dctFastaMatch``.
        ValueError: the FASTA file does not start with a '>' header line.
    """
    lstBindingPositions = get_chr_binding_positions(chrId)

    FASTA_FILE_PATH = os.path.join(FASTA_DIR_PATH, dctFastaMatch[chrId])
    with open(FASTA_FILE_PATH, 'rt') as fIn:
        # Consume the header outside of an assert: under `python -O` asserts
        # are stripped, which would leave the header inside the sequence.
        header = next(fIn, '')
        if not header.startswith('>'):
            raise ValueError(
                f'{FASTA_FILE_PATH} does not start with a FASTA header line')
        # Join the sequence lines into one string. UCSC FASTA files soft-mask
        # repeats in lower case; upper-casing keeps one key per hexamer.
        seq = fIn.read().strip().replace('\n', '').upper()

    dctSeqs = Counter()
    seqLen = len(seq)
    for pos in lstBindingPositions:
        start, end = pos - 2, pos + 4
        # A negative start would wrap around to the end of the string and an
        # end past the sequence would give a truncated key; skip both.
        if start < 0 or end > seqLen:
            continue
        dctSeqs[seq[start:end]] += 1
    return dctSeqs

In [None]:
# Genome-wide tally: run the hexamer count for every contig in dctFastaMatch
# and merge the per-contig counters, printing a timestamped progress line
# before each file is processed.
dctSeqs = Counter()
for chrId in dctFastaMatch:
    fa = dctFastaMatch[chrId]
    print(time.ctime(), f'{fa} now opens', sep='---')
    dctSeqs.update(get_binding_sequences(chrId))

Fri Jun  3 02:43:21 2022---chr10.fa now opens
Fri Jun  3 02:45:38 2022---chr11.fa now opens
Fri Jun  3 02:49:53 2022---chr12.fa now opens


In [167]:
# Single-contig run: hexamer counts for chr19 only
dctSeqs_chr19 = get_binding_sequences('chr19')

In [138]:
# Positions on chr19 that pass the pileup filters.
# NOTE(review): as currently defined, get_chr_binding_positions returns a
# list; the 'dct' prefix presumably dates from an earlier version - confirm.
dctCols_chr19 = get_chr_binding_positions('chr19')

In [139]:
# Number of entries collected for chr19
len(dctCols_chr19)

6841

In [116]:
# Tabulate per-position records, one row per dictionary key.
# NOTE(review): dctCols is not assigned in the preceding cells; this cell was
# presumably run against an earlier notebook state - confirm before re-running.
pd.DataFrame.from_dict(dctCols, orient='index')

Unnamed: 0,depth,entropy
3059011,8,0.0
3059012,8,0.0
3059013,8,0.0
3059014,8,0.0
3059015,8,0.0
...,...,...
3532395,1,0.0
3532396,1,0.0
3532397,1,0.0
3532398,1,0.0


In [115]:
# Scratch: list the attribute names available on pandas.DataFrame
dir(pd.DataFrame)

['T',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_REVERSED',
 '_AXIS_TO_AXIS_NUMBER',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__div__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__re

In [72]:
# Scratch: print the first record fetched for chr1, then stop.
# NOTE(review): iterBam is not assigned in the preceding cells - presumably a
# pysam.AlignmentFile opened on CLIP_BAM_PATH; confirm.
for i in iterBam.fetch('chr1'):
    print(i)
    break

SRR458758.23028115	0	0	3056472	0	20M	-1	-1	20	GAATGGAAGTTCAAGGATCT	array('B', [39, 39, 38, 39, 39, 38, 36, 31, 38, 38, 39, 39, 36, 35, 30, 38, 36, 38, 35, 38])	[('MD', '20'), ('NH', 40), ('HI', 1), ('NM', 0), ('SM', 0), ('XQ', 40), ('X2', 40), ('XO', 'UM'), ('XS', '-')]


In [75]:
# Scratch: passing a Counter straight to stats.entropy fails (see the
# TypeError below); the counts have to be handed over as a list/array.
stats.entropy(Counter(['A', 'A', 'A', 'A']))

TypeError: unsupported operand type(s) for *: 'float' and 'Counter'

In [84]:
# Scratch: turn base counts into relative frequencies by hand
np.array(list(Counter(['A', 'A', 'G', 'G']).values())) / sum(Counter(['A', 'A', 'G', 'G']).values())

array([0.5, 0.5])

In [93]:
# Scratch: stats.entropy accepts raw counts; four equally frequent bases
# give 2.0 bits
stats.entropy(np.array(list(Counter(['A', 'C', 'G', 'T']).values())), base=2)

2.0

In [94]:
# Scratch: entropy of explicitly normalised frequencies, to compare with the
# raw-count call. Numerator and denominator now come from the same counts;
# the original divided by the total of a different Counter, which matched
# only because both totals were 4.
arrCounts = np.array(list(Counter(['A', 'C', 'G', 'T']).values()))
stats.entropy(arrCounts / arrCounts.sum(), base=2)

2.0

In [86]:
# Scratch: base counts as a NumPy array (dict view -> list -> array)
np.array(list(Counter(['A', 'A', 'G', 'G']).values()))

array([2, 2])

In [99]:
# Scratch: Counter.values() is not a list, hence the list(...) wrappers in
# the cells above
isinstance(Counter(['A', 'A', 'A']).values(), list)

False

In [159]:
# Scratch Counter for checking how update() treats a str vs. a list argument
dctCounter = Counter()

In [160]:
# A bare string is iterated character by character: this adds 2 to key 'A'
dctCounter.update('AA')

In [161]:
dctCounter

Counter({'A': 2})

In [162]:
# A one-element list counts the whole string 'AA' as a single key
dctCounter.update(['AA'])

In [163]:
dctCounter

Counter({'A': 2, 'AA': 1})

In [164]:
# Repeating the update increments the existing 'AA' key
dctCounter.update(['AA'])

In [165]:
dctCounter

Counter({'A': 2, 'AA': 2})