In [None]:
# create hgnc_symbols.csv

import json
import csv

symbols = []

with open('hgnc_all.json', 'r') as f:
    data = json.load(f)  # Load the entire file as a list

for obj in data:
    symbols.append(obj['approvedSymbol'])

with open('hgnc_symbols_all.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['approvedSymbol'])  # header
    for symbol in symbols:
        writer.writerow([symbol])

In [None]:
# filter out olfaction + taste hgnc symbols

import pandas as pd

# Read the original CSV
df = pd.read_csv('hgnc_symbols.csv')

# Filter out rows where 'approvedSymbol' starts with 'OR' or 'TAS'
df = df[~df['approvedSymbol'].str.startswith(('OR', 'TAS'))]

# Overwrite the original CSV with the filtered data
df.to_csv('hgnc_symbols.csv', index=False)

In [None]:
# write human orthos txt to csv file

import pandas as pd

rows = []
with open('human_orthos_2025.05.16.txt') as f:
    for i, line in enumerate(f, 1):
        if i < 3:
            continue  # skip the first line
        fields = line.rstrip('\n').split('\t')
        if len(fields) == 13:
            rows.append(fields) 

columns = [
    'ZFIN ID', 'ZFIN Symbol', 'ZFIN Name', 'Human Symbol', 'Human Name', 'OMIM ID', 'Gene ID', 'HGNC ID',
    'Evidence', 'Pub ID', 'ZFIN Abbreviation Name', 'ECO ID', 'ECO Term Name'
]

df = pd.DataFrame(rows, columns=columns)
df.to_csv('human_orthos.csv', index=False)

In [5]:
# create list of ALL human matches (including duplicates)

import pandas as pd

# Read the list of symbols
hgnc_df = pd.read_csv('hgnc_symbols.csv')
symbols = hgnc_df['approvedSymbol'].tolist()

# Read the human orthologs table
orthos_df = pd.read_csv('human_orthos.csv')

# Filter rows where 'Human Symbol' matches any symbol in the list
filtered = orthos_df[orthos_df['Human Symbol'].isin(symbols)]

# Select only the desired columns
result = filtered[['Human Symbol', 'ZFIN ID', 'ZFIN Symbol', 'Pub ID']]

# Remove results with identical ZFIN IDs
result_unique = result.drop_duplicates(subset=['ZFIN Symbol'])

# Save to CSV
result_unique.to_csv('matched_human_orthos.csv', index=False)

In [7]:
# sort matched_human_orthos.csv into matched_human_orthos_unique.csv and matched_human_orthos_dupes.csv

import pandas as pd

# Read the matched human orthos file
df = pd.read_csv('matched_human_orthos.csv')

# Count occurrences of each ZFIN Symbol
counts = df['Human Symbol'].value_counts()
idxs_unique = counts[counts == 1].index
idxs_dupes = counts[counts > 1].index

# Filter entries
unique = df[df['Human Symbol'].isin(idxs_unique)]
dupes = df[df['Human Symbol'].isin(idxs_dupes)]


# Save to CSV
unique.to_csv('matched_human_orthos_unique.csv', index=False)
dupes.to_csv('matched_human_orthos_dupes.csv', index=False)

In [None]:
# create unmatched_human_orthos.csv, which contains any HGNC symbols that were not matched
import pandas as pd

# Read the list of symbols
hgnc_df = pd.read_csv('hgnc_symbols.csv')
symbols = hgnc_df['approvedSymbol'].tolist()

# Read the human orthologs table
orthos_df = pd.read_csv('human_orthos.csv')

# Find symbols not present in the Human Symbol column
not_matched = [s for s in symbols if s not in orthos_df['Human Symbol'].values]

pd.DataFrame({'HGNC_symbol': not_matched}).to_csv('unmatched_human_orthos.csv', index=False)

In [16]:
# calculate numbers of distinct genes (to be added manually to csvs)

import pandas as pd

df_unique = pd.read_csv('matched_human_orthos_unique.csv')
unique_count = len(df_unique['Human Symbol'])
print(f"Number of unique HGNC symbols: {unique_count}")

df_dupes = pd.read_csv('matched_human_orthos_dupes.csv')
dupes_count = df_dupes['Human Symbol'].nunique()
print(f"Number of duplicate HGNC symbols: {dupes_count}")

df_unmatched = pd.read_csv('unmatched_human_orthos.csv')
unmatched_count = len(df_unmatched['HGNC_symbol'])
print(f"Number of unmatched HGNC symbols: {unmatched_count}")

Number of unique HGNC symbols: 158
Number of duplicate HGNC symbols: 115
Number of unmatched HGNC symbols: 225
