In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm


In [4]:
# Top-level dataset directory and the sub-trees derived from it.
dataset_path = '/n/netscratch/zhuang_lab/Lab/Peter/higashi_dnamerfish'
fish_path = os.path.join(dataset_path, 'multiplexed_fish')
hic_path = os.path.join(dataset_path, 'schic_snm3cseq')
hic_ecker_2023_path = os.path.join(hic_path, 'data')

# Directory of per-cell distance tables (read below) and the directory for
# the contact tables derived from them.
dists_path = '/n/netscratch/zhuang_lab/Lab/Peter/higashi_dnamerfish/multiplexed_fish/merfish_dists'
contacts_path = '/n/netscratch/zhuang_lab/Lab/Peter/higashi_dnamerfish/multiplexed_fish/merfish_contacts'
libraries_path = '/n/home09/pren/libraries'

# One sub-directory of `fish_path` per FISH dataset identifier.
fish_datasets = ['4DNESMTNNB3N', '4DNESPE924IP']
fish_path1, fish_path2 = [os.path.join(fish_path, dataset_id) for dataset_id in fish_datasets]


In [5]:
# Load a single cell's tab-separated pairwise-distance table as a sample
# to inspect the schema before batch processing.
sample_dist_file = os.path.join(
    dists_path,
    'dist_cell270024673801033364642738458147668450508_ord2049_data.txt',
)
dist_test_df = pd.read_csv(sample_dist_file, sep='\t')
dist_test_df


Unnamed: 0,cell_id,chrom1,pos1,chrom2,pos2,distance
0,270024673801033364642738458147668450508,chr1,3751343,chr1,6252463,2.892381
1,270024673801033364642738458147668450508,chr1,3751343,chr1,8749962,
2,270024673801033364642738458147668450508,chr1,3751343,chr1,9632900,1.133079
3,270024673801033364642738458147668450508,chr1,3751343,chr1,9805415,2.707854
4,270024673801033364642738458147668450508,chr1,3751343,chr1,11252680,
...,...,...,...,...,...,...
926836,270024673801033364642738458147668450508,chrX,163754620,chrX,168751817,
926837,270024673801033364642738458147668450508,chrX,163754620,chrY,1252883,
926838,270024673801033364642738458147668450508,chrX,166253807,chrX,168751817,
926839,270024673801033364642738458147668450508,chrX,166253807,chrY,1252883,


In [6]:
# Count missing values per column of the sample distance table.
dist_test_df.isna().sum()

cell_id          0
chrom1           0
pos1             0
chrom2           0
pos2             0
distance    855588
dtype: int64

In [7]:
def convert_dist_to_contact_df(dist_df, thresh):
  """Binarize pairwise distances into contact indicators.

  Parameters
  ----------
  dist_df : pandas.DataFrame
      Table with a ``distance`` column. It is NOT modified; a new frame
      is returned.
  thresh : float
      Distance cutoff, in the same units as ``dist_df['distance']``.
      A pair counts as a contact when ``distance <= thresh``.

  Returns
  -------
  pandas.DataFrame
      Copy of ``dist_df`` without the ``distance`` column and with an
      integer ``count`` column: 1 where ``distance <= thresh``, else 0.
      Missing (NaN) distances yield 0 because the comparison is False.
  """
  # Vectorized comparison instead of a per-row Python callback; NaN <= thresh
  # evaluates to False, matching the element-wise `if distance <= thresh` test.
  is_contact = (dist_df['distance'] <= thresh).astype(int)
  # drop() returns a new frame and assign() adds the column to that new frame,
  # so the caller's DataFrame is left untouched.
  contact_df = dist_df.drop('distance', axis=1).assign(count=is_contact)
  return contact_df

In [8]:
# Binarize the sample cell's distances with a 0.75 cutoff and inspect the result.
contact_test_df = convert_dist_to_contact_df(dist_df=dist_test_df, thresh=0.75)
contact_test_df


Unnamed: 0,cell_id,chrom1,pos1,chrom2,pos2,count
0,270024673801033364642738458147668450508,chr1,3751343,chr1,6252463,0
1,270024673801033364642738458147668450508,chr1,3751343,chr1,8749962,0
2,270024673801033364642738458147668450508,chr1,3751343,chr1,9632900,0
3,270024673801033364642738458147668450508,chr1,3751343,chr1,9805415,0
4,270024673801033364642738458147668450508,chr1,3751343,chr1,11252680,0
...,...,...,...,...,...,...
926836,270024673801033364642738458147668450508,chrX,163754620,chrX,168751817,0
926837,270024673801033364642738458147668450508,chrX,163754620,chrY,1252883,0
926838,270024673801033364642738458147668450508,chrX,166253807,chrX,168751817,0
926839,270024673801033364642738458147668450508,chrX,166253807,chrY,1252883,0


In [9]:
def remove_zero_contacts(contact_df):
    """Return only the rows of ``contact_df`` whose ``count`` is non-zero.

    The original row index labels are preserved.
    """
    nonzero_mask = contact_df['count'].ne(0)
    return contact_df.loc[nonzero_mask]

# Keep only the rows of the sample cell that were flagged as contacts.
contact_test_df_nozero = remove_zero_contacts(contact_df=contact_test_df)
contact_test_df_nozero


Unnamed: 0,cell_id,chrom1,pos1,chrom2,pos2,count
297,270024673801033364642738458147668450508,chr1,3751343,chr12,21446085,1
1363,270024673801033364642738458147668450508,chr1,6252463,chr1,9805415,1
1383,270024673801033364642738458147668450508,chr1,6252463,chr1,26250909,1
1678,270024673801033364642738458147668450508,chr1,6252463,chr12,52776962,1
1688,270024673801033364642738458147668450508,chr1,6252463,chr12,66247079,1
...,...,...,...,...,...,...
822189,270024673801033364642738458147668450508,chr2,161160681,chr2,161254330,1
822191,270024673801033364642738458147668450508,chr2,161160681,chr2,163755554,1
825823,270024673801033364642738458147668450508,chr2,169633013,chr2,180443515,1
826713,270024673801033364642738458147668450508,chr2,171254187,chr2,172430212,1


In [10]:
# for cur_file in tqdm(os.listdir(dists_path)):
#     out_filename = os.path.join(contacts_path, os.path.splitext(cur_file)[0] + '_proc.txt')
#     if not os.path.exists(out_filename):
#         cur_dist_df = pd.read_csv(os.path.join(dists_path, cur_file), sep='\t')
#         cur_contact_df = convert_dist_to_contact_df(cur_dist_df, 0.75)
#         cur_contact_df = remove_zero_contacts(cur_contact_df)
#     #     print('cur_contact_df: ', cur_contact_df)
#         cur_contact_df.to_csv(out_filename, sep='\t')
    



# Test different thresholds for contact matrices

In [11]:
# Convert every per-cell distance table into a sparse contact table using a
# 0.5 distance cutoff (the directory name suggests this corresponds to
# 500 nm -- TODO confirm the units of the `distance` column).
contacts_path = '/n/netscratch/zhuang_lab/Lab/Peter/higashi_dnamerfish/multiplexed_fish/merfish_contacts_thresh500nm'
# Ensure the output directory exists so to_csv cannot fail on a missing folder.
os.makedirs(contacts_path, exist_ok=True)
for cur_file in tqdm(os.listdir(dists_path)):
    out_filename = os.path.join(contacts_path, os.path.splitext(cur_file)[0] + '_proc.txt')
    # Skip inputs whose output already exists, so the cell can be re-run to resume.
    if os.path.exists(out_filename):
        continue
    try:
        cur_dist_df = pd.read_csv(os.path.join(dists_path, cur_file), sep='\t')
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # The previous `except ParseError` named an undefined identifier, so a
        # parse failure surfaced as NameError instead of being skipped.
        # pandas raises ParserError for malformed content and EmptyDataError
        # for an empty file, both possible while a file is still being written.
        print('Skip this file, as it is loading')
        continue
    cur_contact_df = convert_dist_to_contact_df(cur_dist_df, 0.5)
    cur_contact_df = remove_zero_contacts(cur_contact_df)
    cur_contact_df.to_csv(out_filename, sep='\t', index=False)
    

In [12]:
# Convert every per-cell distance table into a sparse contact table using a
# distance cutoff of 1 (the directory name suggests this corresponds to
# 1000 nm -- TODO confirm the units of the `distance` column).
contacts_path = '/n/netscratch/zhuang_lab/Lab/Peter/higashi_dnamerfish/multiplexed_fish/merfish_contacts_thresh1000nm'
# Ensure the output directory exists so to_csv cannot fail on a missing folder.
os.makedirs(contacts_path, exist_ok=True)
for cur_file in tqdm(os.listdir(dists_path)):
    out_filename = os.path.join(contacts_path, os.path.splitext(cur_file)[0] + '_proc.txt')
    # Skip inputs whose output already exists, so the cell can be re-run to resume.
    if os.path.exists(out_filename):
        continue
    try:
        cur_dist_df = pd.read_csv(os.path.join(dists_path, cur_file), sep='\t')
    except (pd.errors.ParserError, pd.errors.EmptyDataError):
        # The previous `except ParseError` named an undefined identifier, so a
        # parse failure surfaced as NameError instead of being skipped.
        # pandas raises ParserError for malformed content and EmptyDataError
        # for an empty file, both possible while a file is still being written.
        print('Skip this file, as it is loading')
        continue
    cur_contact_df = convert_dist_to_contact_df(cur_dist_df, 1)
    cur_contact_df = remove_zero_contacts(cur_contact_df)
    cur_contact_df.to_csv(out_filename, sep='\t', index=False)


00%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2857/2857 [50:32<00:00,  1.06s/it]