In [1]:
import os
import pandas as pd
import requests

In [4]:
def fetch_chrom_sizes(genome, output_file):
    '''Utility for downloading chromosome sizes from UCSC.'''
    full_file_path = os.path.join(os.getcwd(), output_file)

    url = f'http://hgdownload.soe.ucsc.edu/goldenPath/{genome}/bigZips/{genome}.chrom.sizes'
    response = requests.get(url)

    if response.status_code == 200:
        with open(full_file_path, 'w') as f:
            for line in response.text.splitlines():
                chrom, size = line.split()
                f.write(f"{chrom}\t{size}\n")
        print(f"Chromosome sizes saved to {full_file_path}")
    else:
        print(f"Failed to fetch chromosome sizes for {genome}")

output_file = 'chromosome_sizes.txt'
fetch_chrom_sizes('hg38', output_file)

Chromosome sizes saved to /Users/siyuanzhao/Documents/GitHub/CS522_Project/Scripts/chromosome_sizes.txt


In [2]:
def make_dir(d):
    '''Utility for making a directory if not existing.'''
    if not os.path.exists(d):
        os.makedirs(d)

def get_spe_inter(hic_data, alpha=0.05):
    '''Filter Hi-C data for significant interactions based on the alpha threshold.'''
    hic_spe = hic_data.loc[hic_data['fdr'] < alpha]
    return hic_spe

def get_fold_inputs(spe_df, chr_name):
    '''Prepare folding input file from the filtered significant interactions.'''
    spe_out_df = spe_df[['ibp', 'jbp', 'fq', 'i1', 'j1']]
    spe_out_df['chr'] = chr_name
    spe_out_df['w'] = [1] * len(spe_out_df)
    result = spe_out_df[['chr', 'ibp', 'jbp', 'fq', 'w']]
    return result

def process_hic_files(input_folder, output_folder, alpha=0.05):
    '''Process all Hi-C files in the input folder and save results in the output folder.'''

    # Create the output directory if it doesn't exist
    make_dir(output_folder)

    # Iterate over all subdirectories in the specified input folder
    for root, dirs, files in os.walk(input_folder):
        for file_name in files:
            if file_name == 'hic.clean.csv.gz':
                hic_file_path = os.path.join(root, file_name)

                all_hic = pd.read_csv(hic_file_path)

                spe_hic = get_spe_inter(all_hic, alpha)

                # Get the chromosome name
                chr_name = os.path.basename(os.path.dirname(root)).split('.')[0]

                # Get the folding input file with the dynamic chromosome name
                fold_hic = get_fold_inputs(spe_hic, chr_name)

                # Extract the subdirectory name for output file naming
                relative_path = os.path.basename(os.path.dirname(root))
                output_file_name = f"{relative_path}.txt"
                output_file_name = output_file_name.replace(os.sep, '_')

                fold_hic_path = os.path.join(output_folder, output_file_name)

                # Save the results to a text file
                fold_hic.to_csv(fold_hic_path, header=None, index=None, sep='\t', mode='a')

input_folder = '../Data'
output_folder = '../Folding_input'
process_hic_files(input_folder, output_folder)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spe_out_df['chr'] = chr_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spe_out_df['w'] = [1] * len(spe_out_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spe_out_df['chr'] = chr_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 