In [3]:
import pandas as pd

# File path
file_path = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/1.sorted'

# Read the file into a DataFrame
df = pd.read_csv(file_path, sep='\t', header=None)

# Assuming the columns correspond to: 
# chr, start, end, feature, chromosome_2, start_2, end_2, dot, value_1, dot_2, score, minus, value_2, score_2
# You can modify these column names as needed based on the actual structure of your file
df.columns = ['chromosome', 'start', 'end', 'feature', 'chromosome_2', 'start_2', 'end_2', 'dot', 
              'value_1', 'dot_2', 'score', 'minus', 'value_2', 'score_2']

# Get the unique features
unique_features = df['feature'].unique()

# Output the number of unique features
print(f"Number of unique features: {len(unique_features)}")


Number of unique features: 1


In [None]:
import os
import re
import csv

# Directory where the metadata file is located
metadata_file = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/downloads/wgEncodeAwgTfbsUniform/files.txt'  # Update this path

# Lists to store extracted TF and cell type
tf_celltype_list = []
unique_tfs = set()
unique_celltypes = set()
filename_tf_celltype_map = {}

# Read the metadata file and extract TF and cell type
def parse_metadata(file_path):
    if not os.path.exists(file_path):
        print(f"Metadata file not found: {file_path}")
        return {}
    
    result = {}
    with open(file_path, 'r') as f:
        for line in f:
            # Skip empty lines
            if not line.strip():
                continue
                
            # Parse the line
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue
                
            filename = parts[0]
            metadata = parts[1]
            
            # Extract antibody (TF) and cell from metadata
            antibody_match = re.search(r'antibody=([^;]+)', metadata)
            cell_match = re.search(r'cell=([^;]+)', metadata)
            
            if antibody_match and cell_match:
                antibody = antibody_match.group(1)
                cell = cell_match.group(1)
                
                # Clean up antibody name
                # Remove any part after underscore or parentheses (like "_(39875)")
                tf = re.sub(r'_\([^)]+\)|\([^)]+\)', '', antibody)
                
                # Store in result dictionary
                result[filename] = (tf, cell)
    
    return result

# Parse the metadata file
metadata_dict = parse_metadata(metadata_file)

# Extract the TF and cell type information
for filename, (tf, cell) in metadata_dict.items():
    # Add to the list and sets
    tf_celltype_list.append((tf, cell, filename))
    unique_tfs.add(tf)
    unique_celltypes.add(cell)

# Sort by TF name alphabetically
tf_celltype_list.sort(key=lambda x: x[0])

# Path to save the CSV
output_csv = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv'

# Write to CSV
with open(output_csv, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Transcription Factor', 'Cell Type', 'Filename'])  # Header
    for tf, cell, filename in tf_celltype_list:
        writer.writerow([tf, cell, filename])

# Print counts and debug information
print(f"Saved the TF and cell type list to {output_csv}")
print(f"Number of unique transcription factors: {len(unique_tfs)}")
print(f"Number of unique cell types: {len(unique_celltypes)}")
print("\nUnique transcription factors (sorted alphabetically):")
for tf in sorted(unique_tfs):
    print(f"- {tf}")
print("\nUnique cell types (sorted alphabetically):")
for cell in sorted(unique_celltypes):
    print(f"- {cell}")

Saved the TF and cell type list to /bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv
Number of unique transcription factors: 171
Number of unique cell types: 91

Unique transcription factors (sorted alphabetically):
- AP-2alpha
- AP-2gamma
- ARID3A
- ATF1
- ATF2
- ATF3
- BAF155
- BAF170
- BATF
- BCL11A
- BCL3
- BCLAF1
- BDP1
- BHLHE40
- BRCA1
- BRF1
- BRF2
- Bach1
- Brg1
- CBX3
- CCNT2
- CEBPB
- CEBPD
- CHD1
- CHD2
- COREST
- CREB1
- CTCF
- CTCFL
- CtBP2
- E2F1
- E2F4
- E2F6
- EBF1
- ELF1
- ELK1
- ELK4
- ERRA
- ERalpha_a
- ETS1
- EZH2
- Egr-1
- FOSL1
- FOSL2
- FOXA1
- FOXA2
- FOXM1
- FOXP2
- GABP
- GATA-1
- GATA-2
- GATA2
- GATA3
- GR
- GRp20
- GTF2B
- GTF2F1
- HA-E2F1
- HDAC1
- HDAC2
- HDAC6
- HMGN3
- HNF4A
- HNF4G
- HSF1
- IKZF1
- IRF1
- IRF3
- IRF4
- Ini1
- JARID1A
- JunD
- KAP1
- MAZ
- MBD4
- MEF2A
- MEF2C
- MTA3
- MYBL2
- MafF
- MafK
- Max
- Mxi1
- NANOG
- NELFe
- NF-E2
- NF-YA
- NF-YB
- NFATC1
- NFIC
- NFKB
- NR2F2
- NRSF
- Nrf1
- PAX5-C20


In [None]:
import os
import csv
import requests
import time

# Input file with TF names
input_csv = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv'

# Output file for TF to UniProt mapping
output_csv = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_uniprot_mapping.csv'

# Extract unique TF names from the CSV
unique_tfs = set()
with open(input_csv, 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header
    for row in reader:
        if row:  # Make sure the row is not empty
            unique_tfs.add(row[0])  # TF is in the first column

# Function to query UniProt API
def get_uniprot_id(tf_name):
    # Base URL for UniProt API
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    
    # Construct query
    # Searching for human proteins with the name and filtering for reviewed entries
    params = {
        'query': f'({tf_name}) AND (organism_id:9606) AND (reviewed:true)',
        'format': 'tsv',
        'fields': 'accession,protein_name,gene_names'
    }
    
    try:
        response = requests.get(base_url, params=params)
        if response.status_code == 200:
            lines = response.text.strip().split('\n')
            if len(lines) > 1:  # Header + at least one result
                # Return first result (accession, protein name, gene names)
                parts = lines[1].split('\t')
                if len(parts) >= 3:
                    return parts[0], parts[1], parts[2]
        
        # If no results or error, return None
        return None, None, None
    
    except Exception as e:
        print(f"Error querying UniProt for {tf_name}: {str(e)}")
        return None, None, None

# Create a mapping of TF names to UniProt IDs
tf_uniprot_map = {}
count = 0
total = len(unique_tfs)

print(f"Querying UniProt for {total} transcription factors...")

for tf in sorted(unique_tfs):
    count += 1
    print(f"Processing {count}/{total}: {tf}")
    
    # Query UniProt
    uniprot_id, protein_name, gene_names = get_uniprot_id(tf)
    
    # Store in map
    tf_uniprot_map[tf] = {
        'uniprot_id': uniprot_id,
        'protein_name': protein_name,
        'gene_names': gene_names
    }
    
    # Add a small delay to avoid hitting rate limits
    time.sleep(0.5)

# Write the mapping to CSV
with open(output_csv, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Transcription Factor', 'UniProt ID', 'Protein Name', 'Gene Names'])
    
    for tf in sorted(tf_uniprot_map.keys()):
        writer.writerow([
            tf,
            tf_uniprot_map[tf]['uniprot_id'] or 'Not found',
            tf_uniprot_map[tf]['protein_name'] or 'Not found',
            tf_uniprot_map[tf]['gene_names'] or 'Not found'
        ])

print(f"Mapping saved to {output_csv}")

Querying UniProt for 171 transcription factors...
Processing 1/171: AP-2alpha
Processing 2/171: AP-2gamma
Processing 3/171: ARID3A
Processing 4/171: ATF1
Processing 5/171: ATF2
Processing 6/171: ATF3
Processing 7/171: BAF155
Processing 8/171: BAF170
Processing 9/171: BATF
Processing 10/171: BCL11A
Processing 11/171: BCL3
Processing 12/171: BCLAF1
Processing 13/171: BDP1
Processing 14/171: BHLHE40
Processing 15/171: BRCA1
Processing 16/171: BRF1
Processing 17/171: BRF2
Processing 18/171: Bach1
Processing 19/171: Brg1
Processing 20/171: CBX3
Processing 21/171: CCNT2
Processing 22/171: CEBPB
Processing 23/171: CEBPD
Processing 24/171: CHD1
Processing 25/171: CHD2
Processing 26/171: COREST
Processing 27/171: CREB1
Processing 28/171: CTCF
Processing 29/171: CTCFL
Processing 30/171: CtBP2
Processing 31/171: E2F1
Processing 32/171: E2F4
Processing 33/171: E2F6
Processing 34/171: EBF1
Processing 35/171: ELF1
Processing 36/171: ELK1
Processing 37/171: ELK4
Processing 38/171: ERRA
Processing 39/

In [None]:
import pandas as pd
import numpy as np

# Path to the UniProt mapping CSV file
uniprot_mapping_file = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_uniprot_mapping.csv'

# Read the CSV file
df = pd.read_csv(uniprot_mapping_file)

# Count unique UniProt IDs (excluding 'Not found')
unique_uniprot_ids = df[df['UniProt ID'] != 'Not found']['UniProt ID'].nunique()

# Count NaN (entries where UniProt ID is 'Not found')
nan_count = df[df['UniProt ID'] == 'Not found'].shape[0]

# Print the results
print(f"Number of unique UniProt IDs: {unique_uniprot_ids}")
print(f"Number of transcription factors without UniProt IDs: {nan_count}")
print(f"Total number of transcription factors: {df.shape[0]}")

# Optional: Print TFs without UniProt IDs
print("\nTranscription factors without UniProt IDs:")
for tf in df[df['UniProt ID'] == 'Not found']['Transcription Factor']:
    print(f"- {tf}")

# Optional: Calculate percentage of TFs with UniProt IDs
percentage_with_uniprot = (unique_uniprot_ids / df.shape[0]) * 100
print(f"\nPercentage of TFs with UniProt IDs: {percentage_with_uniprot:.2f}%")

Number of unique UniProt IDs: 161
Number of transcription factors without UniProt IDs: 0
Total number of transcription factors: 171

Transcription factors without UniProt IDs:

Percentage of TFs with UniProt IDs: 94.15%


In [None]:
# Import required library
import pandas as pd

# Load both datasets
cell_type_df = pd.read_csv("/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv")  # Contains: Transcription Factor, Cell Type, Filename
uniprot_df = pd.read_csv("/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_uniprot_mapping.csv")      # Contains: Transcription Factor, UniProt ID, Protein Name, Gene Names

# Display the first few rows of each dataset to verify
print("Cell Type Data:")
display(cell_type_df.head())

print("\nUniProt Data:")
display(uniprot_df.head())

# Merge the datasets on 'Transcription Factor'
merged_df = pd.merge(
    cell_type_df,
    uniprot_df[['Transcription Factor', 'UniProt ID']],  # Only take needed columns from uniprot data
    on='Transcription Factor',
    how='left'  # Keep all rows from cell_type_df even if no match
)

# Select only the columns we want in the final output
final_df = merged_df[['Transcription Factor', 'Cell Type', 'Filename', 'UniProt ID']]

# Display the merged result
print("\nMerged Data:")
display(final_df.head())

# Save to file if needed
final_df.to_csv("/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list_uniprotID.csv", index=False)


Cell Type Data:


Unnamed: 0,Transcription Factor,Cell Type,Filename
0,AP-2alpha,HeLa-S3,wgEncodeAwgTfbsSydhHelas3Ap2alphaUniPk.narrowP...
1,AP-2gamma,HeLa-S3,wgEncodeAwgTfbsSydhHelas3Ap2gammaUniPk.narrowP...
2,ARID3A,HepG2,wgEncodeAwgTfbsSydhHepg2Arid3anb100279IggrabUn...
3,ARID3A,K562,wgEncodeAwgTfbsSydhK562Arid3asc8821IggrabUniPk...
4,ATF1,K562,wgEncodeAwgTfbsSydhK562Atf106325UniPk.narrowPe...



UniProt Data:


Unnamed: 0,Transcription Factor,UniProt ID,Protein Name,Gene Names
0,AP-2alpha,P05549,Transcription factor AP-2-alpha (AP2-alpha) (A...,TFAP2A AP2TF TFAP2
1,AP-2gamma,Q92754,Transcription factor AP-2 gamma (AP2-gamma) (A...,TFAP2C
2,ARID3A,Q99856,AT-rich interactive domain-containing protein ...,ARID3A DRIL1 DRIL3 DRX E2FBP1
3,ATF1,P18846,Cyclic AMP-dependent transcription factor ATF-...,ATF1
4,ATF2,P15336,Cyclic AMP-dependent transcription factor ATF-...,ATF2 CREB2 CREBP1



Merged Data:


Unnamed: 0,Transcription Factor,Cell Type,Filename,UniProt ID
0,AP-2alpha,HeLa-S3,wgEncodeAwgTfbsSydhHelas3Ap2alphaUniPk.narrowP...,P05549
1,AP-2gamma,HeLa-S3,wgEncodeAwgTfbsSydhHelas3Ap2gammaUniPk.narrowP...,Q92754
2,ARID3A,HepG2,wgEncodeAwgTfbsSydhHepg2Arid3anb100279IggrabUn...,Q99856
3,ARID3A,K562,wgEncodeAwgTfbsSydhK562Arid3asc8821IggrabUniPk...,Q99856
4,ATF1,K562,wgEncodeAwgTfbsSydhK562Atf106325UniPk.narrowPe...,P18846



Saved merged data to 'merged_transcription_factors.csv'


In [None]:
# Import required library
import pandas as pd

# Load both datasets
cell_type_df = pd.read_csv("/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv")  # Contains: Transcription Factor, Cell Type, Filename
uniprot_df = pd.read_csv("/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_uniprot_mapping.csv")      # Contains: Transcription Factor, UniProt ID, Protein Name, Gene Names


# Merge the datasets
merged_df = pd.merge(
    cell_type_df,
    uniprot_df[['Transcription Factor', 'UniProt ID']],
    on='Transcription Factor',
    how='left'
)

# Create final dataframe with selected columns
final_df = merged_df[['Transcription Factor', 'Cell Type', 'Filename', 'UniProt ID']]

# Check for missing UniProt IDs
missing_uniprot = final_df[final_df['UniProt ID'].isna()]

print(f"Total entries in final data: {len(final_df)}")
print(f"Entries with UniProt IDs: {len(final_df) - len(missing_uniprot)}")
print(f"Entries missing UniProt IDs: {len(missing_uniprot)}")

if not missing_uniprot.empty:
    print("\nEntries missing UniProt IDs:")
    display(missing_uniprot)
    
    print("\nSummary of missing IDs by Cell Type:")
    display(missing_uniprot['Cell Type'].value_counts())
else:
    print("\nAll entries have UniProt IDs!")

# Display final merged data sample
print("\nSample of final merged data:")
display(final_df.head())

# Save to file
final_df.to_csv("/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list_uniprotID.csv", index=False)


Total entries in final data: 690
Entries with UniProt IDs: 690
Entries missing UniProt IDs: 0

All entries have UniProt IDs!

Sample of final merged data:


Unnamed: 0,Transcription Factor,Cell Type,Filename,UniProt ID
0,AP-2alpha,HeLa-S3,wgEncodeAwgTfbsSydhHelas3Ap2alphaUniPk.narrowP...,P05549
1,AP-2gamma,HeLa-S3,wgEncodeAwgTfbsSydhHelas3Ap2gammaUniPk.narrowP...,Q92754
2,ARID3A,HepG2,wgEncodeAwgTfbsSydhHepg2Arid3anb100279IggrabUn...,Q99856
3,ARID3A,K562,wgEncodeAwgTfbsSydhK562Arid3asc8821IggrabUniPk...,Q99856
4,ATF1,K562,wgEncodeAwgTfbsSydhK562Atf106325UniPk.narrowPe...,P18846



Saved merged data to 'merged_transcription_factors.csv'


In [3]:
import requests

uniprot_ids = ["P00750", "P12345"]  # Your UniProt IDs
base_url = "http://jaspar.genereg.net/api/v1/matrix/"

for uid in uniprot_ids:
    response = requests.get(f"{base_url}?uniprot={uid}&format=json")
    if response.status_code == 200:
        with open(f"{uid}_pwm.json", "w") as f:
            f.write(response.text)
    else:
        print(f"Failed for {uid}")

In [2]:
#unique cell type 
import pandas as pd

# Path to your CSV file
csv_file = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv'

# Read the CSV
df = pd.read_csv(csv_file)

# Get unique cell types
unique_cell_types = sorted(df['CellType'].unique())

# Print the unique cell types
print("Unique cell types:")
for cell_type in unique_cell_types:
    print(cell_type)


Unique cell types:
A549
AG04449
AG04450
AG09309
AG09319
AG10803
AoAF
BE2_C
BJ
Caco-2
Dnd41
ECC-1
Fibrobl
GM06990
GM08714
GM10847
GM12801
GM12864
GM12865
GM12872
GM12873
GM12874
GM12875
GM12878
GM12891
GM12892
GM15510
GM18505
GM18526
GM18951
GM19099
GM19193
GM19238
GM19239
GM19240
Gliobla
H1-hESC
HA-sp
HAc
HBMEC
HCFaa
HCM
HCPEpiC
HCT-116
HEEpiC
HEK293
HEK293-T-REx
HFF
HFF-Myc
HL-60
HMEC
HMF
HPAF
HPF
HRE
HRPEpiC
HSMM
HSMMtube
HUVEC
HVMF
HeLa-S3
HepG2
IMR90
K562
MCF-7
MCF10A-Er-Src
NB4
NH-A
NHDF-Ad
NHDF-neo
NHEK
NHLF
NT2-D1
Osteobl
PANC-1
PBDE
PBDEFetal
PFSK-1
ProgFib
RPTEC
Raji
SAEC
SH-SY5Y
SK-N-MC
SK-N-SH
SK-N-SH_RA
T-47D
U2OS
U87
WERI-Rb-1
WI-38


In [1]:
import pandas as pd

# Load the file (choose .csv or .xlsx)
input_file = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/tf_celltype_list.csv'  # or 'tf_celltype_list.xlsx'
df = pd.read_csv(input_file) if input_file.endswith('.csv') else pd.read_excel(input_file)

# Extract unique cell types from the 'CellType' column
unique_cell_types = sorted(df['CellType'].dropna().unique())

# Save to a new file
output_file = '/bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/unique_cell_types.txt'
with open(output_file, 'w') as f:
    for cell_type in unique_cell_types:
        f.write(f"{cell_type}\n")

print(f"Saved {len(unique_cell_types)} unique cell types to {output_file}")


Saved 91 unique cell types to /bml/shreya/TF_binding_site/dataset_test/DEEPSEA_dataextraction/data/unique_cell_types.txt
