### Load Data from 'data' directory

In [1]:
import gzip
import os
import pandas as pd
import sys

In [12]:
directory = 'data'
sample_list = ["Lung", "Heart", "Liver", "Cortex"]

def build_df(sample: str) -> pd.DataFrame:
    # Create empty DataFrame to be filled
    df = pd.DataFrame()
    columns_names = ['chromosome', 's_loc', 'e_loc', 'methyl_rate', 's_depth', 'e_depth']

    # Iterate over files in the directory
    for filename in os.listdir(directory):

        # Check if the file is gzipped and the sample matches what we're working on
        if filename.endswith('.gz') and sample in filename:  
            file_path = os.path.join(directory, filename)

            # Open the gzipped file in text mode
            with gzip.open(file_path, 'rt') as file:  
                print(file_path)
                if len(df) > 0:                    
                    # Create temporary dataframe to concatenate results
                    temp_df = pd.read_csv(file_path, sep="\t", header=None, names=columns_names, low_memory=False)
                    pd.concat([df, temp_df], ignore_index=True)
                else:
                    df = pd.read_csv(file_path, sep="\t", header=None, names=columns_names, low_memory=False)
    return df

In [3]:
def build_result():
    column_names = ['sites', 'ave depth', 'ave methylation']
    result = pd.DataFrame(columns=column_names)

    for sample in sample_list:
        df = build_df(sample)
        length = len(df)
        depth = (df['s_depth'] + df['e_depth']).mean()
        mean = df['methyl_rate'].mean()
        result.loc[len(result)] = [length, depth, mean]

    return result



In [16]:
print("Pick the number of which tissue sample you'd like to perform statistical analysis on: ")
print("\n1. Lung \n2. Heart \n3. Liver \n4. Cortex \n5. All Samples")
opt_sel = int(input("--->"))

if opt_sel < 5:
    df = build_df(sample_list[opt_sel-1])
    chromo_df = df[df['chromosome'] == '10']
else:
    df = build_result()

# # print(chromo_df.head())
# print(f"Total # of Samples: {len(df)}")
# print(f"Average Methylation Rate: {df['methyl_rate'].mean()}%")
# print(f"Average Methylation depth: {(df['s_depth'] + df['e_depth']).mean()}")

df['Row'] = sample_list
column_order = ['Row', 'sites', 'ave depth', 'ave methylation']
df = df[column_order]
df.to_csv('result.csv')

Pick the number of which tissue sample you'd like to perform statistical analysis on: 

1. Lung 
2. Heart 
3. Liver 
4. Cortex 
5. All Samples
data/GSM2465653_M02NB_1wk_Lung.cov.txt.gz
data/GSM2465668_M04NB_1wk_Lung.cov.txt.gz
data/GSM2465656_M03NB_1wk_Lung.cov.txt.gz
data/GSM2465650_M01NB_1wk_Lung.cov.txt.gz
data/GSM2465648_M01NB_1wk_Heart.cov.txt.gz


KeyboardInterrupt: 

In [10]:
df

Unnamed: 0,sites,ave depth,ave methylation
0,4085551.0,15.241029,35.427501
1,4021292.0,14.609889,35.120765
2,4147692.0,13.342087,36.218352
3,4353634.0,14.978642,38.081742
