# <b>Reference File Processing</b>

In [5]:
import pandas as pd
from pathlib import Path

In [6]:
# Data locations, all relative to the notebook's working directory.
REF_DIR = Path("..", "Data", "Reference", "GRCh38")
PASCALX_DIR = Path("..", "Data", "PascalX")
MAGMA_DIR = Path("..", "Data", "MAGMA")

# Limix gene annotation table that the sections below read.
limix_file = REF_DIR.joinpath("LimixAnnotationFile.txt")

#### <b>1. Formatting the Limix gene location file to MAGMA format</b>
The gene location file must contain at least four columns, in this order: (1) **gene ID**, (2) **chromosome**, (3) **start site**, (4) **stop site**.<br> 

In [None]:
# Read the tab-separated Limix annotation table and preview its first five rows.
limix_file_data = pd.read_csv(limix_file, delimiter="\t")
print(limix_file_data.head(n=5))

In [None]:
# List the distinct chromosome labels present in the annotation table.
print(limix_file_data.loc[:, "chromosome"].unique())

In [None]:
# Format and write to file: keep only the four columns listed above
# (gene ID, chromosome, start, stop), in that order, without header or index.
file_loc = MAGMA_DIR / "gene_locations" / "GRCh38_GENELOC.tsv"

# to_csv does not create missing folders, so make sure the target exists first.
file_loc.parent.mkdir(parents=True, exist_ok=True)

limix_file_data[["feature_id", "chromosome", "start", "end"]].to_csv(
    file_loc, sep="\t", header=False, index=False
)
print(f"File written to {file_loc}")

#### <b>2. Adding strand information to the Limix file</b>

In [7]:
# Read the Limix annotation table (tab-separated) so this section can run
# without the cells of section 1.
limix_file_data = pd.read_csv(filepath_or_buffer=limix_file, sep="\t")
print(limix_file_data.head(5))

    feature_id chromosome  start     end             ENSG         biotype
0  MIR1302-2HG          1  29554   31109  ENSG00000243485         lincRNA
1      FAM138A          1  34554   36081  ENSG00000237613         lincRNA
2        OR4F5          1  65419   71585  ENSG00000186092  protein_coding
3   AL627309.1          1  89295  133723  ENSG00000238009         lincRNA
4   AL627309.3          1  89551   91105  ENSG00000239945         lincRNA


In [10]:
# Read the strand table and rename "Gene stable ID" to "ENSG" so it shares
# a key column name with the Limix annotation table.
strand_file = PASCALX_DIR.joinpath("resources", "strand_info.txt")
strand_data = pd.read_csv(strand_file, sep="\t")
strand_data = strand_data.rename(columns={"Gene stable ID": "ENSG"})
print(strand_data.head())

              ENSG  Strand Gene name
0  ENSG00000210049       1     MT-TF
1  ENSG00000211459       1   MT-RNR1
2  ENSG00000210077       1     MT-TV
3  ENSG00000210082       1   MT-RNR2
4  ENSG00000209082       1    MT-TL1


In [16]:
# Left-join the strand onto the Limix annotation by ENSG, keeping every gene
# of limix_file_data (genes without a match get Strand = NaN).
# The strand lookup is first reduced to one row per ENSG (first occurrence
# kept) so that a repeated ID in the strand table cannot duplicate annotation
# rows; validate="many_to_one" makes pandas enforce that guarantee.
strand_lookup = strand_data[["ENSG", "Strand"]].drop_duplicates(subset="ENSG")
df_merged = limix_file_data.merge(
    strand_lookup, on="ENSG", how="left", validate="many_to_one"
)
print(f'Missing strand values: {df_merged["Strand"].isnull().sum()}\n')
print(df_merged.head())

Missing strand values: 1145

    feature_id chromosome  start     end             ENSG         biotype  \
0  MIR1302-2HG          1  29554   31109  ENSG00000243485         lincRNA   
1      FAM138A          1  34554   36081  ENSG00000237613         lincRNA   
2        OR4F5          1  65419   71585  ENSG00000186092  protein_coding   
3   AL627309.1          1  89295  133723  ENSG00000238009         lincRNA   
4   AL627309.3          1  89551   91105  ENSG00000239945         lincRNA   

   Strand  
0     1.0  
1    -1.0  
2     1.0  
3     NaN  
4    -1.0  


In [17]:
# Keep only the genes that got no strand from the merge (Strand is NaN).
df_missing_strand = df_merged.loc[df_merged["Strand"].isna()]
print(df_missing_strand.head())

     feature_id chromosome     start       end             ENSG  \
3    AL627309.1          1     89295    133723  ENSG00000238009   
17   AL645608.3          1    911435    914948  ENSG00000230699   
18   AL645608.5          1    914171    914971  ENSG00000241180   
150  AL365255.1          1   5561709   5668295  ENSG00000236948   
241  AL109811.4          1  11012662  11030528  ENSG00000277726   

            biotype  Strand  
3           lincRNA     NaN  
17          lincRNA     NaN  
18          lincRNA     NaN  
150         lincRNA     NaN  
241  protein_coding     NaN  


In [24]:
# Save the genes lacking strand information as a tab-separated file (no index column).
missing_strand_file = PASCALX_DIR.joinpath("resources", "limix_missing_strand.tsv")
df_missing_strand.to_csv(missing_strand_file, sep="\t", index=False)
print(f"File saved: {missing_strand_file}")

File saved: ../Data/PascalX/resources/limix_missing_strand.tsv


In [23]:
# Genes without a strand annotation default to 1; the column is then cast to int.
# NOTE(review): confirm that 1 is an acceptable default for the downstream tool.
df_merged["Strand"] = df_merged["Strand"].fillna(value=1).astype(int)

# Write the complete annotation (with the filled Strand column) as a TSV without index.
merged_strand_file = PASCALX_DIR.joinpath("resources", "limix_annotation_strand.tsv")
df_merged.to_csv(merged_strand_file, sep="\t", index=False)

print(f"File saved: {merged_strand_file}")

File saved: ../Data/PascalX/resources/limix_annotation_strand.tsv
