# Ridge regression to discover sex-associated genes

In this notebook, we download the GTEX whole-blood dataset and look at which genes are most closely associated with sex.  We also include age in our feature set, but don't do anything with it yet (besides include it in the inferece).  We will first just take a look a the sex specific genes to make sure our results make sense.

In [5]:
import pandas as pd
from pathlib import Path
import requests
import gzip
import shutil

download_directory_path = Path.cwd().parent / "downloads"


def download_if_not_exists(url: str, download_dir: Path = download_directory_path) -> Path:
    """
    Download a file from a URL to the specified directory if it doesn't exist.
    
    Args:
        url: URL to download from
        download_dir: Directory to save the file in
    
    Returns:
        Path to the downloaded file
    """
    download_dir.mkdir(exist_ok=True)
    file_path = download_dir / Path(url).name
    
    if not file_path.exists():
        print(f"Downloading {file_path.name}...")
        response = requests.get(url, stream=True)
        response.raise_for_status()
        
        with open(file_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print("Download complete!")
    else:
        print(f"File already exists at {file_path}")
    
    return file_path

def ungzip_if_not_exists(gzip_path: Path) -> Path:
    """
    Ungzip a file if the unzipped version doesn't exist.
    
    Args:
        gzip_path: Path to the gzipped file
    
    Returns:
        Path to the unzipped file
    """
    unzipped_path = gzip_path.parent / gzip_path.stem
    
    if not unzipped_path.exists():
        print(f"Unzipping to {unzipped_path}...")
        with gzip.open(gzip_path, 'rb') as f_in:
            with open(unzipped_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        print("Unzipping complete!")
    else:
        print(f"Unzipped file already exists at {unzipped_path}")
    
    return unzipped_path

def load_data(url: str, download_dir: Path = download_directory_path) -> pd.DataFrame:
    """
    Download (if needed), unzip (if needed), and load data into a pandas DataFrame.
    Handles special case for GCT files by reading their headers first.
    Uses existing download_if_not_exists and ungzip_if_not_exists functions.
    
    Args:
        url: URL to download from
        download_dir: Directory to save the file in
    
    Returns:
        pandas DataFrame with the loaded data
    """
    # Use existing function to download
    file_path = download_if_not_exists(url, download_dir)
    
    # Use existing function to unzip if needed
    if file_path.suffix == '.gz':
        file_path = ungzip_if_not_exists(file_path)
    
    # Handle GCT files specially
    if file_path.suffix == '.gct':
        print("Reading GCT file header...")
        with open(file_path, 'r') as f:
            version = f.readline().strip()
            dims = f.readline().strip().split('\t')
            print(f"GCT version: {version}")
            print(f"Dimensions: {dims}")
        
        df = pd.read_csv(file_path,
                        sep='\t',
                        skiprows=2,
                        index_col=0)
    else:
        # For regular TSV files
        df = pd.read_csv(file_path,
                        sep='\t',
                        index_col=0)
    
    print(f"\nLoaded dataset shape: {df.shape}")
    return df

In [None]:
gtex_whole_blood_url = "https://storage.googleapis.com/adult-gtex/bulk-gex/v8/rna-seq/tpms-by-tissue/gene_tpm_2017-06-05_v8_whole_blood.gct.gz"
gtex_sample_attributes_url = "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"
gtex_subject_phenotypes_url = "https://storage.googleapis.com/adult-gtex/annotations/v8/metadata-files/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt"

gtex_whole_blood_pdf = load_data(gtex_whole_blood_url)
gtex_sample_attributes_pdf = load_data(gtex_sample_attributes_url)
gtex_subject_phenotypes_pdf = load_data(gtex_subject_phenotypes_url)


File already exists at /Users/rj/personal/ucr-aiml-multiomics/downloads/gene_tpm_2017-06-05_v8_whole_blood.gct.gz
Unzipped file already exists at /Users/rj/personal/ucr-aiml-multiomics/downloads/gene_tpm_2017-06-05_v8_whole_blood.gct
Reading GCT file header...
GCT version: #1.3
Dimensions: ['56200', '757', '0', '0']

Loaded dataset shape: (56200, 757)
File already exists at /Users/rj/personal/ucr-aiml-multiomics/downloads/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt

Loaded dataset shape: (22951, 62)
File already exists at /Users/rj/personal/ucr-aiml-multiomics/downloads/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt

Loaded dataset shape: (980, 3)


## Lets take a peek

In [11]:
gtex_whole_blood_pdf.describe()

Unnamed: 0,GTEX-111YS-0006-SM-5NQBE,GTEX-1122O-0005-SM-5O99J,GTEX-1128S-0005-SM-5P9HI,GTEX-113IC-0006-SM-5NQ9C,GTEX-113JC-0006-SM-5O997,GTEX-117XS-0005-SM-5PNU6,GTEX-117YW-0005-SM-5NQ8Z,GTEX-1192W-0005-SM-5NQBQ,GTEX-1192X-0005-SM-5NQC3,GTEX-11DXW-0006-SM-5NQ7Y,...,GTEX-ZVE2-0006-SM-51MRW,GTEX-ZVP2-0005-SM-51MRK,GTEX-ZVT2-0005-SM-57WBW,GTEX-ZVT3-0006-SM-51MT9,GTEX-ZVT4-0006-SM-57WB8,GTEX-ZVTK-0006-SM-57WBK,GTEX-ZVZP-0006-SM-51MSW,GTEX-ZVZQ-0006-SM-51MR8,GTEX-ZXES-0005-SM-57WCB,GTEX-ZXG5-0005-SM-57WCN
count,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,...,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0,56200.0
mean,17.794537,17.793619,17.792364,17.79349,17.793386,17.79264,17.794828,17.794302,17.792765,17.794426,...,17.794108,17.793754,17.794705,17.793275,17.793327,17.793359,17.793472,17.793396,17.793937,17.793941
std,1081.161119,587.092745,1056.083201,654.524008,1889.558379,1524.129507,1056.38786,1440.107773,3033.979748,734.351225,...,2370.234571,763.125824,1400.430075,1634.083758,892.602992,1256.701664,541.250303,1422.299039,1075.194903,657.646352
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00395,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.4718,0.7519,0.869075,1.62125,0.3711,0.2696,0.543,0.471475,0.08165,0.896525,...,0.323,0.73115,0.4876,0.509025,0.63975,0.737125,0.4912,0.523125,0.7075,0.8295
max,234900.0,112900.0,192600.0,136200.0,415100.0,321400.0,197300.0,287200.0,704500.0,155000.0,...,554200.0,156000.0,312300.0,357000.0,190000.0,242900.0,92390.0,307300.0,232400.0,137700.0


In [12]:
gtex_whole_blood_pdf.transpose().describe()

id,0,1,2,3,4,5,6,7,8,9,...,56190,56191,56192,56193,56194,56195,56196,56197,56198,56199
count,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0,...,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0,757.0
unique,291.0,702.0,26.0,119.0,93.0,164.0,223.0,231.0,616.0,725.0,...,727.0,288.0,165.0,264.0,726.0,710.0,687.0,704.0,255.0,306.0
top,0.0,2.251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4027.0,0.0,0.0,0.0,280.8,1162.0,0.0,1758.0,0.0,0.0
freq,396.0,3.0,732.0,630.0,645.0,573.0,481.0,476.0,86.0,3.0,...,2.0,463.0,592.0,492.0,3.0,3.0,49.0,3.0,498.0,438.0


In [14]:
gtex_whole_blood_pdf.eq(0).sum()

Name                            0
Description                     0
GTEX-111YS-0006-SM-5NQBE    31729
GTEX-1122O-0005-SM-5O99J    30553
GTEX-1128S-0005-SM-5P9HI    28836
                            ...  
GTEX-ZVTK-0006-SM-57WBK     30504
GTEX-ZVZP-0006-SM-51MSW     31113
GTEX-ZVZQ-0006-SM-51MR8     29839
GTEX-ZXES-0005-SM-57WCB     29687
GTEX-ZXG5-0005-SM-57WCN     29762
Length: 757, dtype: int64

In [7]:
gtex_sample_attributes_pdf.SMTSD.value_counts()

SMTSD
Whole Blood                                  3288
Muscle - Skeletal                            1132
Lung                                          867
Skin - Sun Exposed (Lower leg)                849
Thyroid                                       812
Artery - Tibial                               770
Adipose - Subcutaneous                        763
Nerve - Tibial                                722
Heart - Left Ventricle                        689
Skin - Not Sun Exposed (Suprapubic)           638
Esophagus - Mucosa                            622
Adipose - Visceral (Omentum)                  564
Esophagus - Muscularis                        559
Cells - Cultured fibroblasts                  527
Breast - Mammary Tissue                       480
Heart - Atrial Appendage                      452
Artery - Aorta                                450
Colon - Transverse                            432
Brain - Frontal Cortex (BA9)                  425
Testis                                      

 gtex_whole_blood_pdf

In [9]:
import plotly.express as px

# Define the correct order of age categories
age_order = ["20-29", "30-39", "40-49", "50-59", "60-69", "70-79"]

# Create the histogram with category ordering
fig = px.histogram(gtex_subject_phenotypes_pdf.AGE, 
                  category_orders={"value": age_order})

# Update layout if needed
fig.update_layout(
    xaxis_title="Age Range",
    yaxis_title="Count"
)

fig.show()

We don't actually need the sample attributes right now

In [10]:
with pd.option_context('display.max_columns', None, 'display.width', None, 'display.max_colwidth', None):
  display(gtex_sample_attributes_pdf.head(2))

Unnamed: 0_level_0,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,SMNABTCH,SMNABTCHT,SMNABTCHD,SMGEBTCH,SMGEBTCHD,SMGEBTCHT,SMAFRZE,SMGTC,SME2MPRT,SMCHMPRS,SMNTRART,SMNUMGPS,SMMAPRT,SMEXNCRT,SM550NRM,SMGNSDTC,SMUNMPRT,SM350NRM,SMRDLGTH,SMMNCPB,SME1MMRT,SMSFLGTH,SMESTLBS,SMMPPD,SMNTERRT,SMRRNANM,SMRDTTL,SMVQCFL,SMMNCV,SMTRSCPT,SMMPPDPR,SMCGLGTH,SMGAPPCT,SMUNPDRD,SMNTRNRT,SMMPUNRT,SMEXPEFF,SMMPPDUN,SME2MMRT,SME2ANTI,SMALTALG,SME2SNSE,SMMFLGTH,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,DNA isolation_Whole Blood_QIAGEN Puregene (Manual),05/02/2013,LCSET-4574,01/15/2014,Standard Exome Sequencing v3 (ICE),WES,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188.0,,BP-38516,DNA isolation_Whole Blood_QIAGEN Puregene (Manual),05/02/2013,GTEx_OM25_Dec_01,01/28/2014,Illumina OMNI SNP Array,OMNI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Ok, lets build our dataframe

In [15]:
import numpy as np

gwbt_pdf = (
    gtex_whole_blood_pdf
    .drop(columns=["Name", "Description"])
    .apply(lambda x: np.log10(x + 1))
    .transpose()
)
gwbt_pdf.columns = gtex_whole_blood_pdf.Name
gwbt_pdf['SUBJID'] = gwbt_pdf.index.map(lambda x: '-'.join(x.split('-')[:2]))
gwbt_pdf = (
    gwbt_pdf
    .join(gtex_subject_phenotypes_pdf, on='SUBJID', how='left')
    .drop(columns=["DTHHRDY", "SUBJID"])
    
    # .join(gtex_sample_attributes_pdf, how='left')
)
# Create a mapping dictionary for age ranges to midpoints
age_to_midpoint = {
    '20-29': 25,
    '30-39': 35,
    '40-49': 45,
    '50-59': 55,
    '60-69': 65,
    '70-79': 75
}

# Map the age ranges to midpoints
gwbt_pdf['AGE_midpoint'] = gwbt_pdf['AGE'].map(age_to_midpoint)
gwbt_pdf.drop(columns=["AGE"], inplace=True)


In [16]:
gwbt_pdf

Unnamed: 0,ENSG00000223972.5,ENSG00000227232.5,ENSG00000278267.1,ENSG00000243485.5,ENSG00000237613.2,ENSG00000268020.3,ENSG00000240361.1,ENSG00000186092.4,ENSG00000238009.6,ENSG00000233750.3,...,ENSG00000210184.1,ENSG00000210191.1,ENSG00000198786.2,ENSG00000198695.2,ENSG00000210194.1,ENSG00000198727.2,ENSG00000210195.2,ENSG00000210196.2,SEX,AGE_midpoint
GTEX-111YS-0006-SM-5NQBE,0.009323,0.407391,0.000000,0.000000,0.000000,0.000000,0.016992,0.000000,0.015569,0.330211,...,0.000000,0.000000,3.028571,3.295127,0.876737,3.487138,0.000000,0.00000,1,65
GTEX-1122O-0005-SM-5O99J,0.012879,0.573684,0.000000,0.000000,0.000000,0.050805,0.000000,0.023993,0.080157,0.584896,...,0.274943,0.239074,2.955207,3.060320,0.679155,3.394101,0.000000,0.00000,2,65
GTEX-1128S-0005-SM-5P9HI,0.008728,0.735679,0.000000,0.000000,0.024321,0.000000,0.000000,0.000000,0.021685,0.219873,...,0.340444,0.000000,3.133539,3.410946,1.286007,3.320146,0.000000,0.00000,2,65
GTEX-113IC-0006-SM-5NQ9C,0.067257,1.059563,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.012078,0.097223,...,0.000000,0.000000,2.884739,3.125156,1.009961,3.269513,0.000000,0.00000,1,65
GTEX-113JC-0006-SM-5O997,0.009026,0.359266,0.000000,0.000000,0.000000,0.000000,0.047703,0.000000,0.007620,0.079036,...,0.000000,0.179638,2.801061,2.845718,0.560504,3.241795,0.000000,0.18608,2,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GTEX-ZVTK-0006-SM-57WBK,0.047080,0.591732,0.000000,0.047002,0.017284,0.000000,0.022263,0.000000,0.020403,0.132676,...,0.638888,0.229631,3.464042,3.573800,1.013722,3.653213,0.511750,0.00000,1,25
GTEX-ZVZP-0006-SM-51MSW,0.010385,0.646404,0.000000,0.000000,0.000000,0.021107,0.018908,0.019324,0.108700,0.856850,...,0.000000,0.200987,3.043755,3.372175,1.028978,3.726238,0.000000,0.00000,1,55
GTEX-ZVZQ-0006-SM-51MR8,0.000000,0.713407,0.000000,0.000000,0.000000,0.000000,0.000000,0.015066,0.013469,0.042536,...,0.000000,0.281511,3.201397,3.202488,0.581495,3.697055,0.173361,0.00000,2,65
GTEX-ZXES-0005-SM-57WCB,0.009153,0.573220,0.000000,0.000000,0.000000,0.000000,0.016657,0.033464,0.022717,0.403121,...,0.000000,0.181329,2.833020,2.999392,0.801541,3.265054,0.325310,0.00000,2,35


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(gwbt_pdf.drop(columns="SEX"))  # X: samples × genes
y = gwbt_pdf.SEX - 1

In [18]:
X_scaled.shape

(755, 56201)

# Now lets build our model

We can use sklearn's built in model and optimizer

In [19]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import numpy as np
import timeit

# Start timing
start_time = timeit.default_timer()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Create and train the Ridge regression model
# alpha=1e-3 corresponds to the l2_lambda in your implementation
ridge = Ridge(alpha=1e-3, max_iter=50)
ridge.fit(X_train, y_train)

# Make predictions and compute loss on both training and test sets
train_pred = ridge.predict(X_train)
test_pred = ridge.predict(X_test)

train_loss = np.mean((train_pred - y_train) ** 2)
test_loss = np.mean((test_pred - y_test) ** 2)

# End timing
end_time = timeit.default_timer()

print(f"Training completed in {end_time - start_time:.2f} seconds")
print(f"Training Loss: {train_loss:.4f}")
print(f"Test Loss: {test_loss:.4f}")

Training completed in 0.49 seconds
Training Loss: 0.0000
Test Loss: 0.1071


Nice 0.49 second!

Now lets figure out the feature contributions

In [20]:
# Get the weights from the Ridge model
weights = ridge.coef_  # shape (n_features,)

# Calculate mean of features
X_mean = X_scaled.mean(axis=0)

# Calculate feature contributions
feature_contributions = (X_scaled - X_mean) * weights  # shape (n_samples, n_features)

# Get average absolute contribution per feature
feature_importance = np.abs(feature_contributions).mean(axis=0)

# Create a DataFrame with gene names and their importance scores
gene_importance = pd.DataFrame({
    'gene': gwbt_pdf.drop(columns=['SEX']).columns,
    'importance': feature_importance
})

# Get gene descriptions from original data
gene_descriptions = gtex_whole_blood_pdf[['Name', 'Description']].reset_index(drop=True)

# Join the importance scores with descriptions
gene_importance_with_desc = (
    gene_importance
    .merge(gene_descriptions, left_on='gene', right_on='Name')
    # .drop(columns=['Name'])
    .sort_values('importance', ascending=False)
)

# Show top 10 genes with descriptions
print("Top 10 genes by feature importance:")
print(gene_importance_with_desc[['gene', 'Description', 'importance']].head(10))

Top 10 genes by feature importance:
                     gene Description  importance
55641  ENSG00000129824.15      RPS4Y1    0.006658
55946   ENSG00000198692.9      EIF1AY    0.006369
55939  ENSG00000012817.15       KDM5D    0.005881
55825  ENSG00000067048.16       DDX3Y    0.005435
55830  ENSG00000183878.15         UTY    0.005303
55929   ENSG00000176728.7      TTTY14    0.004523
55648   ENSG00000231535.5   LINC00278    0.004449
55695  ENSG00000099725.14        PRKY    0.004420
55832   ENSG00000154620.5      TMSB4Y    0.004012
55936  ENSG00000131002.11      TXLNGY    0.003946


We can use the `ensembl` ids to download descriptions of these top gene to check whether they are associated with sex

In [21]:
import requests
import time

def fetch_ensembl_info(gene_id):
    """
    Fetch gene information from Ensembl REST API
    
    Args:
        gene_id: Ensembl gene ID (e.g., 'ENSG00000129824')
    
    Returns:
        dict containing gene information or None if not found
    """
    # Remove version number from gene id (e.g., ENSG00000129824.15 -> ENSG00000129824)
    base_gene_id = gene_id.split('.')[0]
    
    # Ensembl REST API endpoint
    url = f"https://rest.ensembl.org/lookup/id/{base_gene_id}"
    
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json"
    }
    
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Error fetching info for {gene_id}: {str(e)}")
        return None
    
# Get top 20 genes (you can adjust this number)
top_genes = gene_importance_with_desc.head(20).copy()

# Add Ensembl information
ensembl_info = []
for gene_id in top_genes['gene']:
    info = fetch_ensembl_info(gene_id)
    ensembl_info.append(info)
    # Be nice to the Ensembl API - don't overwhelm it
    time.sleep(0.1)

# Extract relevant information from the API response
def extract_gene_info(info):
    if info is None:
        return {
            'chromosome': None,
            'start': None,
            'end': None,
            'biotype': None,
            'description': None
        }
    
    return {
        'chromosome': info.get('seq_region_name'),
        'start': info.get('start'),
        'end': info.get('end'),
        'biotype': info.get('biotype'),
        'description': info.get('description')
    }

# Add the information to our DataFrame
ensembl_data = pd.DataFrame([extract_gene_info(info) for info in ensembl_info])
top_genes_with_ensembl = pd.concat([top_genes.reset_index(drop=True), ensembl_data], axis=1)


Error fetching info for ENSG00000233864.7: 400 Client Error: Bad Request for url: https://rest.ensembl.org/lookup/id/ENSG00000233864


In [22]:
for description in top_genes_with_ensembl.description:
  print(description)


ribosomal protein S4 Y-linked 1 [Source:HGNC Symbol;Acc:HGNC:10425]
eukaryotic translation initiation factor 1A Y-linked [Source:HGNC Symbol;Acc:HGNC:3252]
lysine demethylase 5D [Source:HGNC Symbol;Acc:HGNC:11115]
DEAD-box helicase 3 Y-linked [Source:HGNC Symbol;Acc:HGNC:2699]
ubiquitously transcribed tetratricopeptide repeat containing, Y-linked [Source:HGNC Symbol;Acc:HGNC:12638]
testis expressed transcript, Y-linked 14 [Source:HGNC Symbol;Acc:HGNC:18495]
long intergenic non-protein coding RNA 278 [Source:HGNC Symbol;Acc:HGNC:38712]
protein kinase Y-linked (pseudogene) [Source:HGNC Symbol;Acc:HGNC:9444]
thymosin beta 4 Y-linked [Source:HGNC Symbol;Acc:HGNC:11882]
taxilin gamma Y-linked (pseudogene) [Source:HGNC Symbol;Acc:HGNC:18473]
X inactive specific transcript [Source:HGNC Symbol;Acc:HGNC:12810]
anosmin 2, pseudogene [Source:HGNC Symbol;Acc:HGNC:6214]
zinc finger protein Y-linked [Source:HGNC Symbol;Acc:HGNC:12870]
None
ubiquitin specific peptidase 9 Y-linked [Source:HGNC Symbol;

Nice, these are mostly "Y-linked"