In [2]:
import gzip
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import sys
from typing import Tuple, List
from tabulate import tabulate

In [3]:
pd.set_option('display.max_rows', None)
directory = 'data/GSE93957_RAW/'

In [4]:

sample_list = ["Lung", "Heart", "Liver", "Cortex"]

def build_df(filename) -> pd.DataFrame:
    # Create empty DataFrame to be filled
    df = pd.DataFrame()
    columns_names = ['chromosome', 's_loc', 'e_loc', 'methyl_rate', 's_depth', 'e_depth']

    file_path = os.path.join(directory, filename)
    # Open the gzipped file in text mode
    with gzip.open(file_path, 'rt') as file:  
        print(file_path)
        df = pd.read_csv(file_path, sep="\t", header=None, names=columns_names, low_memory=False)

        # Add Column for CG-Sites
        loc_list = df['s_loc'].tolist()
        cg_list = []
        for idx in range(len(loc_list)):
            # check if index isn't the last value, and check if next methylated read is sequential to current index
            if idx < len(loc_list)-1 and (loc_list[idx] == loc_list[idx+1]-1):
                cg_list.append(True)
            # check if index isn't the first value, and check if the previous methylated read is sequential to current index
            elif idx > 0 and (loc_list[idx] == loc_list[idx-1]+1):
                cg_list.append(True)
            # Append false otherwise
            else:
                cg_list.append(False)
        df['CG site'] = cg_list

    return df

In [4]:
df = build_df("GSM2465618_M00018359_41wk_Lung.cov.txt.gz")
unique_chromosomes = df['chromosome'].unique()


data/GSE93957_RAW/GSM2465618_M00018359_41wk_Lung.cov.txt.gz


In [5]:
df_chromo_sort = df.sort_values(by=['chromosome', 's_loc']).reset_index(drop=True)

In [6]:
df_chromo_sort.head(10)

Unnamed: 0,chromosome,s_loc,e_loc,methyl_rate,s_depth,e_depth,CG site
0,1,3007431,3007431,0.0,0,1,False
1,1,3014611,3014611,100.0,1,0,True
2,1,3014612,3014612,7.692308,2,24,True
3,1,3014928,3014928,100.0,1,0,True
4,1,3014929,3014929,76.0,19,6,True
5,1,3014974,3014974,0.0,0,1,True
6,1,3014975,3014975,92.592593,25,2,True
7,1,3020794,3020794,100.0,1,0,False
8,1,3020814,3020814,100.0,1,0,False
9,1,3020842,3020842,0.0,0,1,False


In [7]:
df2 = build_df("GSM2465622_M00018362_41wk_Lung.cov.txt.gz")
df_chromo_sort2 = df2.sort_values(by=['chromosome', 's_loc']).reset_index(drop=True)

data/GSE93957_RAW/GSM2465622_M00018362_41wk_Lung.cov.txt.gz


In [None]:
df_chromo_sort2.head(10)

Unnamed: 0,chromosome,s_loc,e_loc,methyl_rate,s_depth,e_depth,CG site
0,1,3007431,3007431,0.0,0,1,False
1,1,3007533,3007533,100.0,1,0,False
2,1,3014611,3014611,100.0,2,0,True
3,1,3014612,3014612,0.0,0,11,True
4,1,3014928,3014928,0.0,0,1,True
5,1,3014929,3014929,90.0,9,1,True
6,1,3014974,3014974,0.0,0,2,True
7,1,3014975,3014975,90.909091,10,1,True
8,1,3016622,3016622,0.0,0,1,False
9,1,3020877,3020877,100.0,7,0,True


In [None]:
# Merge the two dataframes on 'chromosome' and 's_loc'
merged_df = pd.merge(df_chromo_sort, df_chromo_sort2, on=['chromosome', 's_loc'], how="outer", suffixes=('_df1', '_df2'), sort=True)

# Calculate the average of methyl_rate from both dataframes
merged_df['avg_methyl_rate'] = (merged_df['methyl_rate_df1'] + merged_df['methyl_rate_df2']) / 2

# Drop the individual methyl_rate columns if not needed
# merged_df.drop(['methyl_rate_df1', 'methyl_rate_df2'], axis=1, inplace=True)

merged_df.head(20)

Unnamed: 0,chromosome,s_loc,e_loc_df1,methyl_rate_df1,s_depth_df1,e_depth_df1,CG site_df1,e_loc_df2,methyl_rate_df2,s_depth_df2,e_depth_df2,CG site_df2,avg_methyl_rate
0,1,3007431,3007431.0,0.0,0.0,1.0,False,3007431.0,0.0,0.0,1.0,False,0.0
1,1,3014611,3014611.0,100.0,1.0,0.0,True,3014611.0,100.0,2.0,0.0,True,100.0
2,1,3014612,3014612.0,7.692308,2.0,24.0,True,3014612.0,0.0,0.0,11.0,True,3.846154
3,1,3014928,3014928.0,100.0,1.0,0.0,True,3014928.0,0.0,0.0,1.0,True,50.0
4,1,3014929,3014929.0,76.0,19.0,6.0,True,3014929.0,90.0,9.0,1.0,True,83.0
5,1,3014974,3014974.0,0.0,0.0,1.0,True,3014974.0,0.0,0.0,2.0,True,0.0
6,1,3014975,3014975.0,92.592593,25.0,2.0,True,3014975.0,90.909091,10.0,1.0,True,91.750842
7,1,3020794,3020794.0,100.0,1.0,0.0,False,,,,,,
8,1,3020814,3020814.0,100.0,1.0,0.0,False,,,,,,
9,1,3020842,3020842.0,0.0,0.0,1.0,False,,,,,,


In [None]:
merged_df['avg_methyl_rate'] = merged_df['avg_methyl_rate'].fillna(merged_df['methyl_rate_df1'].combine_first(merged_df['methyl_rate_df2']))

In [None]:
merged_df.head(20)

Unnamed: 0,chromosome,s_loc,e_loc_df1,methyl_rate_df1,s_depth_df1,e_depth_df1,CG site_df1,e_loc_df2,methyl_rate_df2,s_depth_df2,e_depth_df2,CG site_df2,avg_methyl_rate
0,1,3007431,3007431.0,0.0,0.0,1.0,False,3007431.0,0.0,0.0,1.0,False,0.0
1,1,3014611,3014611.0,100.0,1.0,0.0,True,3014611.0,100.0,2.0,0.0,True,100.0
2,1,3014612,3014612.0,7.692308,2.0,24.0,True,3014612.0,0.0,0.0,11.0,True,3.846154
3,1,3014928,3014928.0,100.0,1.0,0.0,True,3014928.0,0.0,0.0,1.0,True,50.0
4,1,3014929,3014929.0,76.0,19.0,6.0,True,3014929.0,90.0,9.0,1.0,True,83.0
5,1,3014974,3014974.0,0.0,0.0,1.0,True,3014974.0,0.0,0.0,2.0,True,0.0
6,1,3014975,3014975.0,92.592593,25.0,2.0,True,3014975.0,90.909091,10.0,1.0,True,91.750842
7,1,3020794,3020794.0,100.0,1.0,0.0,False,,,,,,100.0
8,1,3020814,3020814.0,100.0,1.0,0.0,False,,,,,,100.0
9,1,3020842,3020842.0,0.0,0.0,1.0,False,,,,,,0.0


In [5]:
def grab_tissue(tissue_type: str, age: str):
    merged_df =  pd.DataFrame(columns=['chromosome', 's_loc', 'e_loc', 'methyl_rate', 's_depth', 'e_depth'])
    # Initialize an empty list to store individual DataFrames
    dataframes_to_merge = []


    matching_tissue = os.listdir(directory)
    for tissue in matching_tissue:
        if age in tissue and tissue_type in tissue:
            df = build_df(tissue)
            dataframes_to_merge.append(df)
    
    suffixes = []
    for i in range(len(dataframes_to_merge)):
        suffixes.append(f"_df{i}")

    if dataframes_to_merge:
        merged_df = pd.merge(dataframes_to_merge, on=['chromosome', 's_loc'], how="outer", suffixes=suffixes, sort=True)
        return merged_df
    else:
        raise Exception("no dataframes found")

            

merged_df = grab_tissue("Heart", "14wk").head(10)




data/GSE93957_RAW/GSM2465662_M0420527_14wk_Heart.cov.txt.gz
data/GSE93957_RAW/GSM2465658_M0420525_14wk_Heart.cov.txt.gz
data/GSE93957_RAW/GSM2465674_M0520522_14wk_Heart.cov.txt.gz


TypeError: merge() missing 1 required positional argument: 'right'

In [None]:
def grab_tissue2(tissue_type: str, age: str):
    merged_df =  pd.DataFrame(columns=['chromosome', 's_loc', 'e_loc', 'methyl_rate', 's_depth', 'e_depth'])
    # Initialize an empty list to store individual DataFrames
    dataframes_to_merge = []
    matching_tissue = os.listdir(directory)

    for tissue in matching_tissue:
        if age in tissue and tissue_type in tissue:
            df = build_df(tissue)
            dataframes_to_merge.append(df)

    # Check if there are any DataFrames to merge
    if dataframes_to_merge:
        # Initialize the merged DataFrame with the first DataFrame
        merged_df = dataframes_to_merge[0]

        # Iterate through the rest of the DataFrames and merge them with a unique suffix
        for i, df in enumerate(dataframes_to_merge[1:], start=2):
            print("test")
            suffix = f"_df{i}"  # Create a unique suffix based on the DataFrame index
            merged_df = merged_df.merge(df, on=['chromosome', 's_loc'], suffixes=('', suffix))
    else:
        raise Exception("no dataframes found")
    return merged_df

grab_tissue2("Heart", "14wk")

data/GSE93957_RAW/GSM2465662_M0420527_14wk_Heart.cov.txt.gz
data/GSE93957_RAW/GSM2465658_M0420525_14wk_Heart.cov.txt.gz
data/GSE93957_RAW/GSM2465674_M0520522_14wk_Heart.cov.txt.gz


In [10]:
methyl_rate_columns = [col for col in merged_df.columns if col.startswith('methyl_rate_')]
merged_df['avg_methyl_rate'] = merged_df[methyl_rate_columns].mean(axis=1)
merged_df.head()

NameError: name 'merged_df' is not defined