In [34]:
import pandas as pd
import os

# Define input files (modify these for each species comparison)
anchors_file = r"C:\Users\ibirc\OneDrive\Documents\Projects\CD_300\Cetaceans\anchors\blue_whale_parsed.camel_parsed.anchors"
bed1_file = r"C:\Users\ibirc\OneDrive\Documents\Projects\CD_300\Cetaceans\bed\blue_whale_parsed.bed"
bed2_file = r"C:\Users\ibirc\OneDrive\Documents\Projects\CD_300\Cetaceans\bed\camel_parsed.bed"

# Load BED files for each species as DataFrames
bed1_df = pd.read_csv(bed1_file, sep="\t", header=None, names=["chrom", "start", "end", "gene", "score", "strand"])
bed2_df = pd.read_csv(bed2_file, sep="\t", header=None, names=["chrom", "start", "end", "gene", "score", "strand"])

# Clean up bed1_df and bed2_df to only include relevant columns
bed1_df = bed1_df[["chrom", "start", "end", "gene"]]  # Remove unnecessary columns like 'score' and 'strand'
bed2_df = bed2_df[["chrom", "start", "end", "gene"]]  # Keep only chrom, start, end, and gene

# Load anchors file (no headers, so we define column names)
anchors_df = pd.read_csv(anchors_file, sep="\t", header=None, names=["gene1", "gene2", "score"])

# Optional: Strip any leading/trailing spaces in gene names (ensure they match between files)
anchors_df['gene1'] = anchors_df['gene1'].str.strip()
anchors_df['gene2'] = anchors_df['gene2'].str.strip()

bed1_df['gene'] = bed1_df['gene'].str.strip()  # Clean gene names in BED file 1
bed2_df['gene'] = bed2_df['gene'].str.strip()  # Clean gene names in BED file 2

# Now, let's create separate DataFrames for gene1 and gene2 with their corresponding BED data
# Merge gene1 with BED file of species 1
merged_df1 = pd.merge(anchors_df[['gene1', 'gene2', 'score']], bed1_df[['gene', 'chrom', 'start', 'end']], left_on="gene1", right_on="gene", how="left")
merged_df1 = merged_df1.rename(columns={"chrom": "chrom1", "start": "start1", "end": "end1"}).drop(columns=["gene"])

# Print the columns of merged_df1 to ensure 'gene2' is still present
print("Columns in merged_df1:", merged_df1.columns)

# Merge gene2 with BED file of species 2
merged_df2 = pd.merge(merged_df1[['gene1', 'gene2', 'score', 'chrom1', 'start1', 'end1']], bed2_df[['gene', 'chrom', 'start', 'end']], left_on="gene2", right_on="gene", how="left")
merged_df2 = merged_df2.rename(columns={"chrom": "chrom2", "start": "start2", "end": "end2"}).drop(columns=["gene"])

# Print the columns of merged_df2 to ensure it's correct
print("Columns in merged_df2:", merged_df2.columns)

# Remove rows where any of the chrom, start, or end columns are NaN (indicating no match)
merged_df2 = merged_df2.dropna(subset=["chrom1", "chrom2"])

# Check a sample of the merged data to ensure it's correct
print(merged_df2.head())

# Define output directory
output_dir = r"C:\Users\ibirc\OneDrive\Documents\Projects\CD_300\Cetaceans\synteny_results"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save the merged output to the specified directory
output_file = os.path.join(output_dir, f"synteny_{os.path.basename(anchors_file).replace('.anchors', '.csv')}")
merged_df2.to_csv(output_file, sep="\t", index=False)

print(f"Saved synteny file: {output_file}")



Columns in merged_df1: Index(['gene1', 'gene2', 'score', 'chrom1', 'start1', 'end1'], dtype='object')
Columns in merged_df2: Index(['gene1', 'gene2', 'score', 'chrom1', 'start1', 'end1', 'chrom2',
       'start2', 'end2'],
      dtype='object')
        gene1        gene2   score chrom1    start1      end1 chrom2  \
1  SAMD11-203   SAMD11-206   576.0      1   40638.0   58208.0     13   
2    mrna_313   KLHL17-201  2560.0      1   72329.0   77435.0     13   
3    mrna_340  PLEKHN1-201  1970.0      1   73931.0   85653.0     13   
4    mrna_501    PERM1-201   220.0      1   87364.0   87790.0     13   
5    AGRN-204     AGRN-201  7620.0      1  139327.0  158568.0     13   

     start2      end2  
1  185471.0  196293.0  
2  219380.0  224778.0  
3  225647.0  232556.0  
4  233044.0  237502.0  
5  275074.0  295082.0  
Saved synteny file: C:\Users\ibirc\OneDrive\Documents\Projects\CD_300\Cetaceans\synteny_results\synteny_blue_whale_parsed.camel_parsed.csv


In [30]:
bed1_df

Unnamed: 0,chrom,start,end,gene
0,1,40638,58208,SAMD11-201
1,1,40638,58208,SAMD11-202
2,1,40638,58208,SAMD11-203
3,1,58294,70780,NOC2L-201
4,1,72329,77435,mrna_313
...,...,...,...,...
31349,Y,2171515,2217554,mrna_1104684
31350,Y,2230472,2238562,KDM5D-201
31351,Y,2230472,2237321,KDM5D-202
31352,Y,2281401,2282843,mrna_1104777


In [31]:
bed2_df

Unnamed: 0,chrom,start,end,gene
0,1,13089,13368,mrna_23448
1,1,411688,413855,mrna_23527
2,1,556876,947852,CPNE4-201
3,1,962356,1004082,MRPL3-201
4,1,1063116,1065626,NUDT16-201
...,...,...,...,...
19374,X,119451881,119473208,CLIC2-201
19375,X,119485177,119565763,TMLHE-201
19376,X,119671643,119672510,SPRY3-201
19377,X,119714998,119778089,VAMP7-201


In [32]:
anchors_df

Unnamed: 0,gene1,gene2,score
0,###,,
1,SAMD11-203,SAMD11-206,576.0
2,mrna_313,KLHL17-201,2560.0
3,mrna_340,PLEKHN1-201,1970.0
4,mrna_501,PERM1-201,220.0
...,...,...,...
7487,CCM2-202,mrna_910527,2090.0
7488,NACAD-201,mrna_910491,2140.0
7489,TBRG4-202,TBRG4-201,2470.0
7490,mrna_1059891,mrna_910414,324.0


In [33]:
merged_df1

Unnamed: 0,gene1,score,chrom1,start1,end1
0,###,,,,
1,SAMD11-203,576.0,1,40638.0,58208.0
2,mrna_313,2560.0,1,72329.0,77435.0
3,mrna_340,1970.0,1,73931.0,85653.0
4,mrna_501,220.0,1,87364.0,87790.0
...,...,...,...,...,...
7487,CCM2-202,2090.0,9,98649530.0,98709992.0
7488,NACAD-201,2140.0,9,98711624.0,98722020.0
7489,TBRG4-202,2470.0,9,98728404.0,98738412.0
7490,mrna_1059891,324.0,9,98747677.0,98749498.0
