In [1]:
import os
import pandas as pd
import requests

**Generate human chromosome size data**


In [None]:
def fetch_chrom_sizes(genome, output_file):
    """Utility for downloading chromosome sizes from UCSC."""
    full_file_path = os.path.join(os.getcwd(), output_file)

    url = f"http://hgdownload.soe.ucsc.edu/goldenPath/{genome}/bigZips/{genome}.chrom.sizes"
    response = requests.get(url)

    if response.status_code == 200:
        with open(full_file_path, "w") as f:
            for line in response.text.splitlines():
                chrom, size = line.split()
                f.write(f"{chrom}\t{size}\n")
        print(f"Chromosome sizes saved to {full_file_path}")
    else:
        print(f"Failed to fetch chromosome sizes for {genome}")


output_file = "chromosome_sizes.txt"
fetch_chrom_sizes("hg38", output_file)

Chromosome sizes saved to /Users/siyuanzhao/Documents/GitHub/CS522_Project/Scripts/chromosome_sizes.txt


**Generate folding input files**


In [None]:
def make_dir(d):
    """Utility for making a directory if not existing."""
    if not os.path.exists(d):
        os.makedirs(d)

def get_spe_inter(hic_data, alpha=0.05):
    """Filter Hi-C data for significant interactions based on the alpha threshold."""
    hic_spe = hic_data.loc[hic_data['fdr'] < alpha]
    return hic_spe

def get_fold_inputs(spe_df):
    """Prepare folding input file from the filtered significant interactions."""
    spe_out_df = spe_df[['ibp', 'jbp', 'fq', 'chr', 'fdr']]
    spe_out_df['w'] = 1  # Set the weight column to 1
    result = spe_out_df[['chr', 'ibp', 'jbp', 'fq', 'w']]
    return result

def process_hic_files(input_folder, output_folder, reference_file, alpha=0.05):
    """Process all Hi-C files in the input folder based on the reference file and save results in the output folder."""

    # Create the output directory if it doesn't exist
    make_dir(output_folder)

    # read the reference file
    reference_df = pd.read_csv(reference_file, usecols=["chrID", "cell_line", "start_value", "end_value"])

    for file_name in os.listdir(input_folder):
        if file_name.endswith(".csv.gz"):
            hic_file_path = os.path.join(input_folder, file_name)
            
            all_hic = pd.read_csv(hic_file_path)

            spe_hic = get_spe_inter(all_hic, alpha)

            for _, row in reference_df.iterrows():
                chrID = row["chrID"]
                cell_line = row["cell_line"]
                start_value = row["start_value"]
                end_value = row["end_value"]

                chr_hic_data = spe_hic[
                    (spe_hic["chr"] == chrID)
                    & (spe_hic["ibp"] >= start_value)
                    & (spe_hic["ibp"] < end_value)
                ]

                if chr_hic_data.empty:
                    continue 

                fold_hic = get_fold_inputs(chr_hic_data)

                output_file_name = (
                    f"{cell_line}.{chrID}.{start_value}.{end_value}.txt"
                )
                fold_hic_path = os.path.join(output_folder, output_file_name)

                fold_hic.to_csv(
                    fold_hic_path, header=None, index=None, sep="\t", mode="a"
                )


input_folder = '../Data/refined_processed_HiC'
output_folder = '../Data/Folding_input'
reference_file = '../Data/ibp_ranges_summary.csv.gz'
process_hic_files(input_folder, output_folder, reference_file)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spe_out_df['chr'] = chr_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spe_out_df['w'] = [1] * len(spe_out_df)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spe_out_df['chr'] = chr_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 