# Introduction to bioinformatics - project no. 2
## CpG islands and DNA Methylation analysis

## Necessary libraries

In [1]:
import pandas as pd
import copy
import time

## Data load

In [2]:
cpg_islands = pd.read_csv("data/cpgIslandExt.txt", sep='\t', header=None)
dna_methylation = pd.read_csv("data/wgEncodeHaibMethyl450A549Etoh02SitesRep1.bed", sep='\t', header=None)
chromosomes_sizes = pd.read_csv("data/hg19.chrom.sizes.txt", sep='\t', header=None)

## Data preprocessing

In [3]:
dna_methylation = dna_methylation.loc[:,0:2]
dna_methylation.columns = ["chr_nr", "start", "stop"]
dna_methylation['location'] = ((dna_methylation['stop']-dna_methylation['start'])/2 +
                            dna_methylation['start']).astype('int32')

chromosomes_sizes.columns = ['chr_nr', 'size']
chromosomes_sizes.set_index('chr_nr', inplace=True)

cpg_islands = cpg_islands.loc[:, 1:3]
cpg_islands.columns = ['chr_nr', 'start', 'stop']

In [4]:
chromosomes_sizes.size

93

In [5]:
dna_methylation.head()

Unnamed: 0,chr_nr,start,stop,location
0,chr16,53468112,53468162,53468137
1,chr3,37459206,37459256,37459231
2,chr3,171916037,171916087,171916062
3,chr1,91194674,91194724,91194699
4,chr8,42263294,42263344,42263319


In [6]:
dna_methylation.size

1929684

In [7]:
dna_methylation.chr_nr.unique()

array(['chr16', 'chr3', 'chr1', 'chr8', 'chr14', 'chr15', 'chr9', 'chr19',
       'chr6', 'chr12', 'chr2', 'chr4', 'chr11', 'chr20', 'chr10',
       'chr18', 'chr21', 'chr17', 'chr7', 'chr22', 'chrX', 'chr13',
       'chr5', 'chrY'], dtype=object)

In [8]:
dna_methylation.head()

Unnamed: 0,chr_nr,start,stop,location
0,chr16,53468112,53468162,53468137
1,chr3,37459206,37459256,37459231
2,chr3,171916037,171916087,171916062
3,chr1,91194674,91194724,91194699
4,chr8,42263294,42263344,42263319


In [9]:
cpg_islands.head()

Unnamed: 0,chr_nr,start,stop
0,chr1,28735,29810
1,chr1,135124,135563
2,chr1,327790,328229
3,chr1,437151,438164
4,chr1,449273,450544


In [10]:
chromosomes_sizes.head()

Unnamed: 0_level_0,size
chr_nr,Unnamed: 1_level_1
chr1,249250621
chr2,243199373
chr3,198022430
chr4,191154276
chr5,180915260


## Functions

In [11]:
# states
island = 3
shore = 2
shelve = 1
sea  = 0

def set_point_label(vector, index, new_state):
    """
    tries to set new state of a gene - shore, shelve,
    """
    if(max(new_state, vector[index]) == new_state):
        vector[index] = new_state

In [12]:
def get_list_bounded_range(max_len, start_index, stop_index):
    if(start_index < 0):
        start_index = 0
    if(stop_index > max_len):
        stop_index = max_len
    return range(start_index, stop_index)

In [13]:
def set_range_label(vector, start_index, stop_index, new_state):
    for i in get_list_bounded_range(len(vector), start_index, stop_index):
        set_point_label(vector, i, new_state)

In [14]:
def set_island_states(vector, cpg_island_range):
    diff = 10 # do zmiany
    # left side
    set_range_label(vector, cpg_island_range[0]-2*diff, cpg_island_range[0]-diff, shelve)
    set_range_label(vector, cpg_island_range[0]-diff, cpg_island_range[0], shore)
    
    # island
    set_range_label(vector, cpg_island_range[0], cpg_island_range[1], island)
    
    # right side
    set_range_label(vector, cpg_island_range[1], cpg_island_range[1]+diff, shore)
    set_range_label(vector, cpg_island_range[1]+diff, cpg_island_range[1]+2*diff, shelve)

In [15]:
def vec_states2df(df_list, chr_nr, states_vec):
    starts = [None]*5
    for i in range(1,len(states_vec)):
        if(states_vec[i] - states_vec[i-1] == 0):
            continue
        elif(states_vec[i] - states_vec[i-1] > 0):
            starts[states_vec[i]] = i
        elif(states_vec[i] - states_vec[i-1] < 0):
            # dodaj do df states_vec[i-1] od starts[tego] do i-1, wyzeruj starts
            df_list[states_vec[i-1]].loc[df_list[states_vec[i-1]].shape[0]+1] = [chr_nr, starts[states_vec[i-1]], i-1]
            starts[states_vec[i-1]] = None
    # considering last element of states_vec
    for state, start in enumerate(starts):
        if(start is not None):
            df_list[state].loc[df_list[state].shape[0]+1] = [chr_nr, start, len(states_vec)]
            start = None

In [None]:
chromosomes = [('chr' + str(i)) for i in range(1, 2)]
islands = pd.DataFrame(columns = ['chr_nr', 'start', 'stop'])
shores = pd.DataFrame(columns = ['chr_nr', 'start', 'stop'])
shelves = pd.DataFrame(columns = ['chr_nr', 'start', 'stop'])
seas = pd.DataFrame(columns = ['chr_nr', 'start', 'stop'])
df_list = [seas, shelves, shores, islands]

print(time.asctime(time.localtime(time.time())))
for i in chromosomes:
    print(i, time.asctime(time.localtime(time.time())))
    states_vec = [0] * chromosomes_sizes['size'][i]
    cpg_islands_i = cpg_islands[cpg_islands.chr_nr == i]
    print("cpg_islands_i.shape", cpg_islands_i.shape)
    for j in range(cpg_islands_i.shape[0]):
        diff = 10 #do zmiany
        cpg_island_range = list(cpg_islands.iloc[j][1:3])
        #print("cpg_island_range", cpg_island_range)
        set_island_states(states_vec, cpg_island_range)
    #locate states
    print(states_vec)
    vec_states2df(df_list, i, states_vec)
print("end", time.asctime(time.localtime(time.time())))

Thu Nov 21 22:05:58 2019
chr1 Thu Nov 21 22:05:58 2019
cpg_islands_i.shape (2462, 3)


In [25]:
shelves

Unnamed: 0,chr_nr,start,stop
1,chr1,28715,29829
2,chr1,135104,135582
3,chr1,327770,328248
4,chr1,437131,438183
5,chr1,449253,450563
6,chr1,533199,534133
7,chr1,544718,546668
8,chr1,713964,714566
9,chr1,762396,763464
10,chr1,788843,789230


In [24]:
states_vec

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
