In [1]:
# Imports
%matplotlib inline
import csv
import glob
import json
import pandas as pd

In [2]:
# User-supplied inputs
# Cluster output file that the gene sets are compared against
cluster_path = '/home/jovyan/work/single_cell/week_13_no_batch_correction/gene_for_gsea_list.tsv'

# Gene set files (.gmt) to load
gene_set_path = [
    'go_heart.gmt',
    'KEGG_CELL_Cycle.gmt',
    'ex.gmt',
]

In [3]:
def load_gene_set(gene_set_path):
    """
    Load gene sets from one or more GMT files.

    Every non-blank line of a file is split on tabs and read as
    ``set_name <TAB> description <TAB> gene_1 <TAB> gene_2 ...``.

    Parameters
    ----------
    gene_set_path : str or iterable of str
        File path(s) or glob pattern(s). Every file matching a pattern is
        loaded; a pattern that matches nothing is reported and skipped.

    Returns
    -------
    gene_sets : dict
        Maps each set name to ``[description, set_of_genes]``. When the same
        set name occurs more than once, the last occurrence wins.
    total : set
        Union of the genes of every line that was loaded.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(gene_set_path, str):
        gene_set_path = [gene_set_path]

    gene_sets = dict()
    total = set()
    for pattern in gene_set_path:
        # Sorted so that the load order (and therefore which duplicate set
        # name wins) does not depend on directory listing order.
        matches = sorted(glob.glob(pattern))
        if not matches:
            print("The path provided does not contain geneset: {}".format(pattern))
            continue

        for path in matches:
            with open(path) as handle:
                for line_number, raw_line in enumerate(handle, start=1):
                    fields = raw_line.strip().split('\t')
                    if fields == ['']:
                        # Blank line (e.g. trailing newline at end of file).
                        continue
                    if len(fields) < 2:
                        # A name without a description cannot be unpacked.
                        print("Skipping malformed line {} in {}".format(line_number, path))
                        continue

                    set_name = fields[0]
                    set_description = fields[1]
                    # Drop empty tokens produced by doubled tabs.
                    gene_subset = {gene.strip() for gene in fields[2:] if gene.strip()}
                    gene_sets[set_name] = [set_description, gene_subset]
                    total.update(gene_subset)

    print("Loaded %d gene sets" % len(gene_sets))
    print("Total number of genes: {}".format(len(total)))
    return gene_sets, total

In [4]:
def load_cluster(cluster_path):
    """
    Read the tab-separated cluster output file into a pandas DataFrame.

    The returned frame is indexed by its 'cluster_number' column. When
    nothing matches ``cluster_path`` a notice is printed and None is returned.
    """
    found = glob.glob(cluster_path)
    if not found:
        print("The path provided does not contain cluster output file!")
        return None

    return pd.read_csv(cluster_path, sep='\t').set_index('cluster_number')

In [5]:
def intersection(cluster_df, total):
    """
    Print the genes of one cluster that also occur in the loaded gene sets.

    Only a single cluster is inspected: the one with the lowest label in the
    index of ``cluster_df``.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Frame indexed by cluster label with a 'gene' column.
    total : iterable
        Genes collected from the gene sets.

    Returns
    -------
    None
        The overlap size and the overlapping genes are printed. Note that the
        first printed line is labelled "unique genes in genesets" but reports
        the size of the overlap.
    """
    if len(cluster_df.index) == 0:
        # No clusters at all: report an empty overlap instead of failing.
        cluster_genes = set()
    else:
        # Pick the cluster deterministically rather than relying on set order.
        first_cluster = cluster_df.index.min()
        # Selecting with a list keeps the result a Series even when the
        # cluster has a single row; a scalar gene name passed to set() would
        # otherwise be split into individual characters.
        cluster_genes = set(cluster_df.loc[[first_cluster], 'gene'])

    intersecting_genes = cluster_genes.intersection(total)
    print("Total number of unique genes in genesets: {}".format(len(intersecting_genes)))
    print("Intersecting genes:")
    print(intersecting_genes)

In [6]:
def output_combined_gmt_file(gene_dict, output_path='combined_genesets.gmt'):
    """
    Write all gene sets to a single tab-separated GMT file.

    Each entry becomes one line:
    ``set_name <TAB> description <TAB> gene_1 <TAB> gene_2 ...``.

    Parameters
    ----------
    gene_dict : dict
        Maps set name to a sequence whose first item is the description and
        whose second item is an iterable of genes.
    output_path : str, optional
        Destination file; defaults to 'combined_genesets.gmt' in the current
        working directory.
    """
    # Overwrite rather than append: re-running the notebook must not
    # duplicate every gene set in the output file.
    with open(output_path, "w") as f:
        for set_name, value in gene_dict.items():
            # Sorted so the file content is reproducible between runs
            # (set iteration order is not).
            genes = sorted(map(str, value[1]))
            f.write("{}\t{}\t{}\n".format(set_name, value[0], '\t'.join(genes)))

In [7]:
def main(cluster_path, gene_set_path):
    """
    Run the whole workflow: load gene sets, report the overlap with the
    cluster genes, and write the combined GMT file.

    Parameters
    ----------
    cluster_path : str
        Path of the cluster output file passed to ``load_cluster``.
    gene_set_path : list of str
        Gene set file paths passed to ``load_gene_set``.

    Returns
    -------
    dict
        The gene sets returned by ``load_gene_set``.
    """
    gene_sets, total = load_gene_set(gene_set_path)
    cluster_df = load_cluster(cluster_path)

    # load_cluster returns None when the cluster file is missing; skip the
    # overlap report in that case so the combined file is still written.
    if cluster_df is not None:
        intersection(cluster_df, total)

    output_combined_gmt_file(gene_sets)
    return gene_sets

In [8]:
# Run the workflow with the configured cluster file and gene set paths,
# keeping the returned gene-set dictionary in `gene_sets`.
gene_sets = main(cluster_path, gene_set_path)

The path provided does not contain geneset: ex.gmt
Loaded 46 gene sets
Total number of genes: 821
Total number of unique genes in genesets: 209
Intersecting genes:
{'SCN5A', 'GADD45G', 'CASQ2', 'ADAMTS1', 'TEK', 'COL5A1', 'GADD45B', 'CDKN2D', 'CDC20', 'SMAD6', 'TTN', 'HSPB7', 'MYLK', 'HEY2', 'HPGD', 'CCNA2', 'ADAM15', 'IRX4', 'PDE4B', 'AVPR1A', 'SEMA3C', 'SLC8A3', 'PTGER4', 'FLRT3', 'ACE', 'ALPK3', 'MYH6', 'BMP2', 'VEGFA', 'SMYD1', 'HEY1', 'LOXL1', 'SOX18', 'HOPX', 'ITPR2', 'CHRM2', 'CSRP3', 'RDH10', 'FREM2', 'PDLIM5', 'LEFTY1', 'HHEX', 'IRX5', 'FOXC2', 'PITX2', 'CDKN1A', 'BASP1', 'PLK1', 'NRG1', 'ERBB4', 'XIRP2', 'MAD2L1', 'PAM', 'RYR2', 'MYBPC3', 'ADAMTS6', 'FHOD3', 'RBM20', 'MEF2C', 'SEMA3A', 'NKX2-5', 'ABCC9', 'CORIN', 'KCND3', 'S1PR1', 'AKAP6', 'MYL2', 'CACNA1G', 'PHOX2B', 'SHOX2', 'CALCRL', 'DKK1', 'CLDN5', 'SNAI2', 'JAG1', 'MYL3', 'PLXNA4', 'ATP1A2', 'CPE', 'TAC1', 'CRIP1', 'HTR2B', 'DES', 'TBX5', 'SNTA1', 'ATP1B1', 'CCND1', 'HOXA3', 'ERG', 'ECE1', 'SORBS2', 'AGT', 'CCNB1', 'XIR