<a href="https://colab.research.google.com/github/heispv/bioinformatics/blob/master/lab-of-bioinformatics/CpG_island.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the hg38 chr21 FASTA into /content, where the later cells look for it.
# -nc skips the download when the file is already there, so a re-run does not
# create a duplicate "chr21.fa.gz.1".
!wget -nc -P /content https://hgdownload.cse.ucsc.edu/goldenpath/hg38/chromosomes/chr21.fa.gz

--2024-03-21 15:13:31--  https://hgdownload.cse.ucsc.edu/goldenpath/hg38/chromosomes/chr21.fa.gz
Resolving hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)... 128.114.198.53
Connecting to hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)|128.114.198.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12709705 (12M) [application/x-gzip]
Saving to: ‘chr21.fa.gz’


2024-03-21 15:13:33 (17.0 MB/s) - ‘chr21.fa.gz’ saved [12709705/12709705]



In [None]:
# Download the hg38 CpG island table into /content, where the later cells look
# for it. -nc skips the download when the file is already there, so a re-run
# does not create a duplicate "cpgIslandExt.txt.gz.1".
!wget -nc -P /content https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cpgIslandExt.txt.gz

--2024-03-21 15:13:33--  https://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cpgIslandExt.txt.gz
Resolving hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)... 128.114.198.53
Connecting to hgdownload.cse.ucsc.edu (hgdownload.cse.ucsc.edu)|128.114.198.53|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 717984 (701K) [application/x-gzip]
Saving to: ‘cpgIslandExt.txt.gz’


2024-03-21 15:13:33 (1.98 MB/s) - ‘cpgIslandExt.txt.gz’ saved [717984/717984]



In [None]:
# -k keeps the archive and -f overwrites an existing output file, so this cell
# can be re-run without failing.
!gunzip -kf /content/cpgIslandExt.txt.gz

In [None]:
# -k keeps the archive and -f overwrites an existing output file, so this cell
# can be re-run without failing.
!gunzip -kf /content/chr21.fa.gz

In [None]:
input_file_path = '/content/cpgIslandExt.txt'
output_file_path = '/content/chr21.txt'

delimiter = '\t'

# Copy to the output file only the rows whose second tab-separated field is
# 'chr21'; every kept row is written exactly as it was read.
with open(input_file_path, 'r') as input_file:
    with open(output_file_path, 'w') as output_file:
        for line in input_file:
            columns = line.strip().split(delimiter)
            # Skip rows with a single field and rows for other chromosomes.
            if len(columns) <= 1 or columns[1] != 'chr21':
                continue
            output_file.write(line)

In [None]:
# Print the lines containing '>' to see which FASTA headers the file holds.
!grep ">" /content/chr21.fa

>chr21


In [None]:
import numpy as np

In [None]:
def get_seq(seqfile):
    """Read a FASTA file and return its sequence as one string.

    Lines starting with '>' are skipped; all other lines are concatenated
    after their trailing whitespace is removed. If the file holds several
    records, their sequences are joined together without a separator.

    Args:
        seqfile: Path to the FASTA file.

    Returns:
        str: The concatenated sequence, in the case found in the file.
    """
    with open(seqfile) as f:
        # Join once at the end instead of growing a string with `+=` for each
        # line of a chromosome-sized file.
        return ''.join(line.rstrip() for line in f if not line.startswith('>'))

In [None]:
# Load the whole sequence from the FASTA file into memory as one string.
seq = get_seq("/content/chr21.fa")

In [None]:
def get_ranges(bedfile, start_col=2, end_col=3):
    """Read (start, end) integer pairs from a whitespace-separated table.

    Args:
        bedfile: Path to the text file, one interval per line.
        start_col: Zero-based index of the field holding the start coordinate.
        end_col: Zero-based index of the field holding the end coordinate.

    Returns:
        list[tuple[int, int]]: One (start, end) pair per non-blank line, in
        file order.

    Raises:
        IndexError: If a non-blank line has too few fields.
        ValueError: If a selected field is not an integer.
    """
    ranges = []
    with open(bedfile) as f:
        for line in f:
            line_list = line.split()
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no interval; skip it instead of failing on the index.
            if not line_list:
                continue
            ranges.append((int(line_list[start_col]), int(line_list[end_col])))
    return ranges

In [None]:
# Read the (start, end) pairs from the filtered chr21 table written earlier.
ranges = get_ranges('/content/chr21.txt')

In [None]:
# Number of intervals that were loaded.
len(ranges)

446

In [None]:
def get_states(ranges, seq):
    """Build a per-base 0/1 mask marking the positions covered by `ranges`.

    The intervals are interpreted as 0-based, half-open [start, end), which is
    the convention of UCSC database tables such as cpgIslandExt. Element k of
    the result therefore describes seq[k], and the result has exactly one
    label per base of `seq`.

    Args:
        ranges: Iterable of (start, end) integer pairs.
        seq: The sequence the intervals refer to; only its length is used.

    Returns:
        numpy.ndarray: Float array of length len(seq) holding 1 inside an
        interval and 0 elsewhere.
    """
    state = np.zeros(len(seq))
    for i, j in ranges:
        # Half-open interval: bases i .. j-1 belong to the island. The previous
        # `state[i:j+1]` followed by `state[1:]` marked seq[i-1:j], i.e. one
        # extra base in front of every island, and returned one label too few.
        state[i:j] = 1
    return state

In [None]:
# Numeric mask over the sequence: 1 inside a listed interval, 0 elsewhere.
state = get_states(ranges, seq)

In [None]:
def list_to_str(state_list):
    """Convert a sequence of truthy/falsy values into a 'Y'/'N' label string.

    Args:
        state_list: Iterable of values; each truthy element maps to 'Y' and
            each falsy element to 'N'.

    Returns:
        str: One character per input element, in the same order.
    """
    # A single join avoids rebuilding the string for every element, which
    # matters for chromosome-length inputs.
    return ''.join('Y' if num else 'N' for num in state_list)

In [None]:
# Build the label string directly from get_states() rather than from the
# current value of `state`. Once `state` holds a string, every character is
# truthy, so re-running `list_to_str(state)` would yield all 'Y'.
state = list_to_str(get_states(ranges, seq))

In [None]:
def get_matrices(seq, state, nuc='ACGTN', states='NY'):
    """Count state transitions and state/nucleotide emissions.

    Args:
        seq: Nucleotide sequence; it is upper-cased before counting.
        state: String of state labels, one character per position.
        nuc: Unused; kept so existing calls keep working.
        states: Unused; kept so existing calls keep working.

    Returns:
        tuple[dict, dict]: (tm_dict, ep_dict). tm_dict maps each two-character
        key "<state><next state>" to how often that adjacent pair occurs in
        `state`. ep_dict maps each key "<state><nucleotide>" to how often that
        label is paired with that nucleotide at the same position.
    """
    tm_dict = dict()
    ep_dict = dict()
    seq = seq.upper()

    # Transitions: every adjacent pair of labels.
    for st1, st2 in zip(state, state[1:]):
        tm_dict[st1+st2] = tm_dict.get(st1+st2, 0) + 1

    # Emissions are counted in their own pass. Zipping them together with
    # state[1:] stopped one position early and dropped the last emission,
    # which disagreed with get_matrices_optimized.
    for st1, ch1 in zip(state, seq):
        ep_dict[st1+ch1] = ep_dict.get(st1+ch1, 0) + 1
    return tm_dict, ep_dict

In [None]:
from collections import Counter

def get_matrices_optimized(seq, state):
    """Count state transitions and state/nucleotide emissions with Counter.

    Args:
        seq: Nucleotide sequence; it is upper-cased before counting.
        state: String of state labels, one character per position.

    Returns:
        tuple[dict, dict]: (tm_dict, ep_dict), plain dicts keyed by the
        concatenation "<state><next state>" and "<state><nucleotide>"
        respectively, with occurrence counts as values.
    """
    nucleotides = seq.upper()

    # Count the raw (label, label) and (label, nucleotide) tuples first.
    transition_counts = Counter(zip(state, state[1:]))
    emission_counts = Counter(zip(state, nucleotides))

    # Re-key each table by the concatenated pair.
    tm_dict = {}
    for (current, following), count in transition_counts.items():
        tm_dict[current + following] = count

    ep_dict = {}
    for (label, symbol), count in emission_counts.items():
        ep_dict[label + symbol] = count

    return tm_dict, ep_dict

In [None]:
%%time
# Time the Counter-based implementation on the full sequence.
tm_dict, ep_dict = get_matrices_optimized(seq, state)

CPU times: user 14.9 s, sys: 135 ms, total: 15 s
Wall time: 15.3 s


In [None]:
%%time
# Time the loop-based implementation for comparison. This call runs last, so
# its tm_dict / ep_dict are the ones the following cells use.
tm_dict, ep_dict = get_matrices(seq, state)

CPU times: user 34.7 s, sys: 138 ms, total: 34.9 s
Wall time: 35.2 s


In [None]:
import pandas as pd

In [None]:
# Split each two-character ep_dict key into its state and nucleotide parts and
# keep the count alongside them.
expanded_data = [(pair[0], pair[1], count) for pair, count in ep_dict.items()]

df = pd.DataFrame(data=expanded_data, columns=['Index', 'Column', 'Value'])

# One row per state, one column per nucleotide; a combination that was never
# counted shows up as NaN.
pivot_df = df.pivot(values='Value', index='Index', columns='Column')

In [None]:
# Show the raw emission counts (rows: states, columns: nucleotides).
pivot_df

Column,A,C,G,N,T
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N,11769906.0,8063026.0,8106230.0,6621362.0,11800527.0
Y,50758.0,122218.0,120151.0,,55803.0


In [None]:
# Divide every row by its own total so each state's counts become relative
# frequencies that sum to 1 across the nucleotides.
normalized_df = pivot_df.div(pivot_df.sum(axis='columns'), axis='index')
normalized_df

Column,A,C,G,N,T
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
N,0.253875,0.173918,0.17485,0.142822,0.254535
Y,0.145468,0.350265,0.344341,,0.159926
