In [47]:
## Filtering_script
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

In [48]:
# downloading and preprocessing metadata
meta_df = pd.read_csv('/metadata.csv')
meta_df['tissue'] = meta_df['source name'].str.replace(r'_\d+$', '', regex=True)
meta_df = meta_df[~meta_df["tissue"].str.startswith('NA')]
meta_df.head()

Unnamed: 0,Sample name,title,source name,organism,characteristics: age,characteristics: developmental stage,characteristics: sex,molecule,description,processed data file,raw file,BioSample,Instrument Model
0,A1_384Bulk_Plate1_S1,Tabula Muris Senis (bulk RNA seq),BAT_24,Mus musculus C57/BL6,6,months postnatal,m,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127205,SAMN11854566,Illumina NovaSeq 6000
1,A1_384Bulk_Plate3_S1,Tabula Muris Senis (bulk RNA seq),SCAT_43,Mus musculus C57/BL6,3,months postnatal,m,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127206,SAMN11854567,Illumina NovaSeq 6000
2,A10_384Bulk_Plate1_S10,Tabula Muris Senis (bulk RNA seq),Brain_47,Mus musculus C57/BL6,18,months postnatal,m,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127203,SAMN11854568,Illumina NovaSeq 6000
3,A10_384Bulk_Plate2_S10,Tabula Muris Senis (bulk RNA seq),GAT_39,Mus musculus C57/BL6,1,months postnatal,f,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127204,SAMN11854569,Illumina NovaSeq 6000
4,A10_384Bulk_Plate3_S10,Tabula Muris Senis (bulk RNA seq),Lung_6,Mus musculus C57/BL6,12,months postnatal,f,total RNA,,190214_A00111_0269_AHH3J3DSXX__190214_A00111_0...,SRR9127371,SAMN11854570,Illumina NovaSeq 6000


In [176]:
# downloading tpm, tmm 
# tmm_df: genes x samples 
# tpm_df: genes x samples 

path_to_TMM = '/home/veteer/scripts/tmm_filtered_salmon_genes.csv'
path_to_TPM = '/home/veteer/scripts/tpm_salmon_gene.csv'


tmm_df = pd.read_csv(path_to_TMM, index_col=0)
tpm_df = pd.read_csv(path_to_TPM, index_col=0)


In [1]:
# remove outliers
samples = [
    "P7_384Bulk_Plate2_S367",
    "I2_384Bulk_Plate3_S194",
    "C3_384Bulk_Plate2_S51",
    "I7_384Bulk_Plate1_S199",
    "L14_384Bulk_Plate1_S278",
    "F4_384Bulk_Plate1_S124",
    "K6_384Bulk_Plate1_S246",
    "J4_384Bulk_Plate3_S220",
    "D7_384Bulk_Plate1_S79",
    "H5_384Bulk_Plate1_S173",
    "M14_384Bulk_Plate1_S302",
    "E8_384Bulk_Plate3_S104",
    "P6_384Bulk_Plate1_S366"
]
tpm_df = tpm_df.drop(columns = samples)

tmm_df = tmm_df.drop(columns = samples)

NameError: name 'tpm_df' is not defined

In [188]:
# -------------------------
# HELPER FUNCTIONS
# -------------------------

#For each gene, non-zero expression in all samples. 
def filter_continuous_expression(df):
    """Remove genes with any 0 or NaN values."""
    mask = (df > 0).all(axis=1)
    return df[mask]

#For each gene, the SD of the log2 normalized gene (x) expression for all samples (i) is less than 1.
def filter_low_variance(df_log2):
    """Remove genes with SD < 1."""
    sd = df_log2.std(axis=1)
    mask = sd <= 1
    return df_log2[mask]

#For each gene, log2 normalized values are within two units of the gene’s mean
def filter_no_outliers(df_log2):
    """Remove genes with any sample value outside mean ± 2."""
    mean_vals = df_log2.mean(axis=1)
    lower = mean_vals - 2
    upper = mean_vals + 2
    mask = ~((df_log2.lt(lower, axis=0)) | (df_log2.gt(upper, axis=0))).any(axis=1)
    return df_log2[mask]

#For each gene, the log2 normalized expression mean is above the mean of all the genes expressed in the particular tissue
def filter_medium_high_expression(df_log2):
    """Keep genes with mean > mean of all genes in tissue."""
    global_mean = df_log2.mean().mean()  # mean of all gene means
    mask = df_log2.mean(axis=1) > global_mean
    return df_log2[mask]

#For each gene, the percent coefficient of variation (%CV) is lower than 20%
def filter_low_cv(df_log2):
    """CV = SD / mean, keep CV% ≤ 20%."""
    mean_vals = df_log2.mean(axis=1)
    sd_vals = df_log2.std(axis=1)
    cv_percent = (sd_vals / mean_vals) * 100
    mask = cv_percent <= 20
    return df_log2[mask]

In [222]:
# -------------------------
# PIPELINE
# -------------------------

def run_filters_for_tissue(df_norm, tissue, norm_type, meta_df):
    """
    Apply filters 1–5 sequentially for one normalization type (TMM or TPM)
    in one tissue. Returns filtered log2 dataframe.
    """
    samples = meta_df.loc[meta_df["tissue"] == tissue, "Sample name"]
    samples = samples[samples.isin(df_norm.columns)]            
    df = df_norm[samples].copy()

    print('Total genes:', df.shape[0])
    
    # Filter 1: continuous expression
    df1 = filter_continuous_expression(df)
    print("1 step. Remained: ", df1.shape[0])

    # log2 transform
    df_log2 = np.log2(df1 + 1)

    # Filter 2: low variance
    df2 = filter_low_variance(df_log2)
    print("2 step. Remained: ", df2.shape[0])

    # Filter 3: no outliers
    df3 = filter_no_outliers(df2)
    print("3 step. Remained: ", df3.shape[0])
    
    # Filter 4: medium-high expression
    df4 = filter_medium_high_expression(df3)
    print("4 step. Remained: ", df4.shape[0])

    for gene in df4.index:
         n_tissues_passed_TPM[gene] = n_tissues_passed_TPM.get(gene, 0) + 1
    
    # Filter 5: low CV
    df5 = filter_low_cv(df4)
    print("5 step. Remained: ", df5.shape[0])
    
    print('Finished first 5 filters')
    return df5




In [223]:
# -------------------------
# RUN FOR ALL TISSUES. FILTERS 1-5
# -------------------------

tissues = meta_df["tissue"].unique()
results_tmm = {}
results_tpm = {}
n_tissues_passed_TPM = {}  # track counts for filter 6

for tissue in tissues:
    print('Analysis of tissue:', tissue)
    
    print('TMM')
    df_tmm_filt = run_filters_for_tissue(tmm_df, tissue, "TMM", meta_df)

    print('TPM')
    df_tpm_filt = run_filters_for_tissue(tpm_df, tissue, "TPM", meta_df)

    results_tmm[tissue] = set(df_tmm_filt.index)
    results_tpm[tissue] = set(df_tpm_filt.index)

    # #For filter 6 count: only TPM genes that passed filters 1–4
    # for gene in df_tpm_filt.index:
    #      n_tissues_passed_TPM[gene] = n_tissues_passed_TPM.get(gene, 0) + 1
        
    print('-'*100)



Analysis of tissue: BAT
TMM
Total genes: 31637
1 step. Remained:  12816
2 step. Remained:  12342
3 step. Remained:  10434
4 step. Remained:  5158
5 step. Remained:  5158
Finished first 5 filters
TPM
Total genes: 51454
1 step. Remained:  12745
2 step. Remained:  12703
3 step. Remained:  12543
4 step. Remained:  5518
5 step. Remained:  5508
Finished first 5 filters
----------------------------------------------------------------------------------------------------
Analysis of tissue: SCAT
TMM
Total genes: 31637
1 step. Remained:  17395
2 step. Remained:  15414
3 step. Remained:  11710
4 step. Remained:  6255
5 step. Remained:  6255
Finished first 5 filters
TPM
Total genes: 51454
1 step. Remained:  17274
2 step. Remained:  16893
3 step. Remained:  16227
4 step. Remained:  7717
5 step. Remained:  7692
Finished first 5 filters
----------------------------------------------------------------------------------------------------
Analysis of tissue: Brain
TMM
Total genes: 31637
1 step. Remained

In [224]:
# -------------------------
# HELPER FUNCTION TO COUNT CORRELATION
# -------------------------
def filter_no_age_correlation(df_log2, ages, n_tissues_passed):
    """
    df_log2: DataFrame (genes x samples) — log2 TPMafter filters 1–5
    ages: age in months from meta_df
    n_tissues_passed: dict {gene: n}
    """
    keep_genes = []

    for gene, row in df_log2.iterrows():
        if gene not in n_tissues_passed:
            continue 
        n = n_tissues_passed[gene]

        p_thresh = 0.05 / n
        expr_values = row.values

        corr, pval = pearsonr(expr_values, ages)

        # оставляем ген, если p-value > порога
        if pval > p_thresh:
            keep_genes.append(gene)

    return df_log2.loc[keep_genes]


In [225]:
# -------------------------
# FILTER 6: correlation with age (TPM only)
# -------------------------
results_tpm_corr = {}
for tissue in tissues:
    genes = results_tpm[tissue]
    df_tpm_filt = tpm_df.loc[list(genes)]
    ages = meta_df[meta_df['Sample name'].isin(df_tpm_filt.columns)]['characteristics: age'].astype('int').values

    df_tpm_corr = filter_no_age_correlation(df_tpm_filt, ages, n_tissues_passed_TPM)
    results_tpm_corr[tissue] = set(df_tpm_corr.index)


In [2]:
# convert genes_id to one format
genes_id = pd.read_csv('gencode_vM22.gene_data.tsv', sep='\t')
genes_id['gene_id'] = genes_id['gene_id'].str.split('.').str[0]
mapping = dict(zip(genes_id['gene_name'], genes_id['gene_id']))
mapping_back = dict(zip(genes_id['gene_id'], genes_id['gene_name']))

def convert_symbols_to_ensembl(gene_symbols, mapping_dict):
    ensembl_ids = set()
    for gene in gene_symbols:
        ensembl_id = mapping_dict.get(gene)
        if ensembl_id and ensembl_id != "N/A":
            ensembl_ids.add(ensembl_id)
    return ensembl_ids

# Transform results_tpm_corr to Ensembl ID
results_tpm_corr_ensembl = {}
for tissue, genes in results_tpm_corr.items():
    results_tpm_corr_ensembl[tissue] = convert_symbols_to_ensembl(genes, mapping)



NameError: name 'pd' is not defined

In [280]:
# -------------------------
# genes are required to pass the filter for both TMM and TPM.
# FINAL INTERSECTION TMM & TPM
# -------------------------
final_results = {}
pan_tissue_sets = []
for tissue in tissues:
    common_genes = results_tmm[tissue] & results_tpm_corr_ensembl[tissue]
    final_results[tissue] = common_genes
    print(f"Tissue: {tissue}, Final genes: {len(common_genes)}")

    if len(common_genes)> 50:
        pan_tissue_sets.append(common_genes)

if pan_tissue_sets:
    final_results['pan_tissue'] = set.intersection(*pan_tissue_sets)
else:
    final_results['pan_tissue'] = set()



Tissue: BAT, Final genes: 4323
Tissue: SCAT, Final genes: 5520
Tissue: Brain, Final genes: 6480
Tissue: GAT, Final genes: 5608
Tissue: Lung, Final genes: 6876
Tissue: Limb_Muscle, Final genes: 2972
Tissue: WBC, Final genes: 501
Tissue: Small_Intestine, Final genes: 4167
Tissue: Liver, Final genes: 3852
Tissue: Spleen, Final genes: 40
Tissue: Pancreas, Final genes: 131
Tissue: Bone, Final genes: 4668
Tissue: MAT, Final genes: 5393
Tissue: Heart, Final genes: 5249
Tissue: Skin, Final genes: 4671
Tissue: Marrow, Final genes: 5176
Tissue: Kidney, Final genes: 5160


In [286]:
## Change the gene names in tpm_df to ensembl_ids

idx_series = tpm_df.index.to_series()
new_index = idx_series.map(mapping).fillna(idx_series)
tpm_df.index = pd.Index(new_index, name='feature')

In [288]:
# -------------------------
# SAVE
# -------------------------

ensembl_to_symbol = {}
for symbol, ensembl_id in mapping.items():
    if ensembl_id and ensembl_id != "N/A":
        ensembl_to_symbol[ensembl_id] = symbol


rows = []
for tissue, genes in final_results.items():
    variant_genes = sorted(set(genes_list) - genes)
    genes_sorted = sorted(genes)
    symbols = [ensembl_to_symbol.get(gene, "N/A") for gene in genes_sorted]
 
    genes_str = ",".join(genes_sorted)
    variant_genes_str = ",".join(variant_genes)
    symbols_str = ",".join(symbols)
    
    rows.append({
        "Tissue": tissue,
        "Invariant_genes": genes_str,
        "Variant_genes": variant_genes_str
    })

    invariant_df = tpm_df.loc[list(genes)]
    invariant_df.to_csv(f"tpm_invariant_genes_{tissue}.csv")


    

df_all = pd.DataFrame(rows)
df_all.to_csv("tissue_group_genes.csv", index=False)
