<a href="https://colab.research.google.com/github/heispv/bioinformatics/blob/master/lab-of-bioinformatics/CpG-island.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [95]:
!wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/chromosomes/chr21.fa.gz

--2024-03-22 12:30:43--  https://hgdownload.cse.ucsc.edu/goldenpath/hg38/chromosomes/chr21.fa.gz
Resolving hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)... 128.114.198.53
Connecting to hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)|128.114.198.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12709705 (12M) [application/x-gzip]
Saving to: ‘chr21.fa.gz.2’


2024-03-22 12:30:44 (76.1 MB/s) - ‘chr21.fa.gz.2’ saved [12709705/12709705]



In [96]:
!wget https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cpgIslandExt.txt.gz

--2024-03-22 12:30:44--  https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cpgIslandExt.txt.gz
Resolving hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)... 128.114.198.53
Connecting to hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)|128.114.198.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 717984 (701K) [application/x-gzip]
Saving to: ‘cpgIslandExt.txt.gz.2’


2024-03-22 12:30:44 (8.50 MB/s) - ‘cpgIslandExt.txt.gz.2’ saved [717984/717984]



In [97]:
!gunzip /content/cpgIslandExt.txt.gz

gzip: /content/cpgIslandExt.txt already exists; do you wish to overwrite (y or n)? ^C


In [98]:
!gunzip /content/chr21.fa.gz

gzip: /content/chr21.fa already exists; do you wish to overwrite (y or n)? ^C


In [99]:
input_file_path = '/content/cpgIslandExt.txt'
output_file_path = '/content/chr21.txt'

delimiter = '\t'

with open(input_file_path, 'r') as input_file, open(output_file_path, 'w') as output_file:
    for line in input_file:
        columns = line.strip().split(delimiter)
        if len(columns) > 1 and columns[1] == 'chr21':
            output_file.write(line)

In [100]:
!grep ">" /content/chr21.fa

>chr21


In [101]:
import numpy as np

In [102]:
def get_seq(seqfile):
    seq = ''
    with open(seqfile) as f:
        for line in f:
            if line[0] == '>': continue
            seq += line.rstrip()
    return seq

In [103]:
seq = get_seq("/content/chr21.fa")

In [104]:
def get_ranges(bedfile, start_col=2, end_col=3):
    ranges = []
    with open(bedfile) as f:
        for line in f:
            line_list = line.split()
            ranges.append((int(line_list[start_col]), int(line_list[end_col])))
    return ranges

In [105]:
ranges = get_ranges('/content/chr21.txt')

In [106]:
def get_states(ranges, seq):
    state = np.zeros(len(seq)+1)
    for i, j in ranges:
        state[i:j+1] = 1
    return state[1:]

In [107]:
state = get_states(ranges, seq)

In [108]:
def list_to_str(state_list):
    end = ''
    for num in state_list:
        if num:
            end += 'Y'
        else:
            end += 'N'
    return end

In [109]:
state = list_to_str(state)

In [110]:
def get_matrices(seq, state, nuc='ACGTN', states='NY'):
    tm_dict = dict()
    ep_dict = dict()
    seq = seq.upper()

    for st1, st2, ch1 in zip(state, state[1:], seq):
        tm_dict[st1+st2] = tm_dict.get(st1+st2, 0) + 1
        ep_dict[st1+ch1] = ep_dict.get(st1+ch1, 0) + 1
    return tm_dict, ep_dict

In [111]:
from collections import Counter

def get_matrices_optimized(seq, state):
    seq = seq.upper()

    transition_pairs = zip(state, state[1:])
    emission_pairs = zip(state, seq)

    tm_dict = Counter(transition_pairs)
    ep_dict = Counter(emission_pairs)

    tm_dict = {k[0] + k[1]: v for k, v in tm_dict.items()}
    ep_dict = {k[0] + k[1]: v for k, v in ep_dict.items()}

    return tm_dict, ep_dict

In [112]:
%%time
tm_dict, ep_dict = get_matrices_optimized(seq, state)

CPU times: user 14.6 s, sys: 122 ms, total: 14.7 s
Wall time: 14.9 s


In [113]:
%%time
tm_dict, ep_dict = get_matrices(seq, state)

CPU times: user 33.3 s, sys: 155 ms, total: 33.5 s
Wall time: 33.7 s


In [114]:
import pandas as pd

In [115]:
expanded_data = [(key[0], key[1], value) for key, value in ep_dict.items()]

df = pd.DataFrame(expanded_data, columns=['Index', 'Column', 'Value'])

pivot_df = df.pivot(index='Index', columns='Column', values='Value')

In [116]:
pivot_df

Column,A,C,G,N,T
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N,11769906.0,8063026.0,8106230.0,6621363.0,11800527.0
Y,50758.0,122218.0,120151.0,,55803.0


In [117]:
normalized_df = pivot_df.div(pivot_df.sum(axis=1), axis=0)
normalized_df

Column,A,C,G,N,T
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N,0.253875,0.173918,0.17485,0.142822,0.254535
Y,0.145468,0.350265,0.344341,,0.159926
