In [1]:
# Imports
%matplotlib inline
import csv
import glob
import json
import pandas as pd

In [2]:
# User input
# Tab-separated cluster output file; it is read with a 'cluster_number'
# column as the index and a 'gene' column.
cluster_path = '/home/jovyan/work/single_cell/week_13_no_batch_correction/gene_for_gsea_list.tsv'

# GMT gene-set files to load. Entries that cannot be found are reported
# and skipped by the loader.
gene_set_path = [
    'go_heart.gmt',
    'KEGG_CELL_Cycle.gmt',
    'ex.gmt',
]

In [3]:
def load_gene_set(gene_set_path):
    """
    Load gene sets from one or more GMT files.

    Every record line is tab-separated:
    ``set_name <TAB> description <TAB> gene_1 <TAB> gene_2 ...``

    Parameters
    ----------
    gene_set_path : str or iterable of str
        Path(s) or glob pattern(s) of the GMT files to read. A single string
        is treated as one path instead of being iterated character by
        character. Patterns that match nothing are reported and skipped.

    Returns
    -------
    gene_sets : dict
        Maps set name -> ``[description, set_of_genes]``. When a set name
        occurs more than once, the last occurrence wins.
    total : set
        Union of the genes from every record that was read.
    """
    if isinstance(gene_set_path, str):
        gene_set_path = [gene_set_path]

    gene_sets = dict()
    total = set()
    for pattern in gene_set_path:
        # Open the files the pattern actually resolves to, so wildcard entries
        # work instead of passing the check and then failing in open().
        matches = sorted(glob.glob(pattern))
        if not matches:
            print("The path provided does not contain geneset: {}".format(pattern))
            continue
        for path in matches:
            with open(path) as file:
                for line in file:
                    fields = line.strip().split('\t')
                    # Blank lines (e.g. a trailing newline) and lines without a
                    # description column are not usable records; skip them
                    # rather than raising IndexError.
                    if len(fields) < 2:
                        continue
                    set_name, set_description = fields[0], fields[1]
                    # Drop empty tokens produced by consecutive tabs.
                    gene_subset = {gene for gene in fields[2:] if gene}
                    gene_sets[set_name] = [set_description, gene_subset]
                    total.update(gene_subset)

    print("Loaded %d gene sets" % len(gene_sets))
    print("Total number of genes: {}".format(len(total)))
    return gene_sets, total

In [4]:
def load_cluster(cluster_path):
    """
    Read the tab-separated cluster output file into a pandas DataFrame.

    The frame is indexed by its 'cluster_number' column. If nothing is found
    at `cluster_path`, a message is printed and None is returned.
    """
    if not glob.glob(cluster_path):
        print("The path provided does not contain cluster output file!")
        return None
    return pd.read_csv(cluster_path, sep='\t').set_index('cluster_number')

In [5]:
def intersection(cluster_df, total):
    """
    Report which genes of the lowest-labelled cluster occur in the gene sets.

    Parameters
    ----------
    cluster_df : pandas.DataFrame or None
        Frame indexed by cluster label with a 'gene' column. None or an empty
        frame is treated as having no genes.
    total : set
        Genes to intersect with (any iterable of gene names works).

    Returns
    -------
    set
        Genes present both in the selected cluster and in `total`. The same
        set and its size are also printed.
    """
    if cluster_df is None or cluster_df.empty:
        intersecting_genes = set()
    else:
        # Pick the cluster deterministically: the smallest label (cluster 0 for
        # zero-based integer labels) instead of whatever a set yields first.
        first_cluster = cluster_df.index.min()
        # A list selector always yields a Series of genes, even when the
        # cluster has a single row; a scalar selector would return one string
        # there and set() would split it into characters.
        cluster_genes = set(cluster_df.loc[[first_cluster], 'gene'])
        intersecting_genes = cluster_genes.intersection(total)

    # NOTE(review): the count printed below is the size of the intersection,
    # not of the gene sets; the label is kept unchanged for output
    # compatibility.
    print("Total number of unique genes in genesets: {}".format(len(intersecting_genes)))
    print("Intersecting genes:")
    print(intersecting_genes)
    return intersecting_genes

In [27]:
def output_combined_gmt_file(gene_dict, output_path='combined_genesets.gmt'):
    """
    Write gene sets to a single GMT file.

    Parameters
    ----------
    gene_dict : dict
        Maps set name -> ``[description, set_of_genes]``.
    output_path : str, optional
        Destination file; defaults to 'combined_genesets.gmt' in the working
        directory.

    Notes
    -----
    One tab-separated line is written per set: name, description, then the
    genes in sorted order so the output is reproducible. The file is
    overwritten on each call, so re-running does not duplicate records.
    """
    # Use the argument that was passed in; relying on a notebook-global
    # `gene_sets` fails on a fresh kernel, where it is not yet defined.
    with open(output_path, "w") as f:
        for set_name, (set_description, genes) in gene_dict.items():
            fields = [str(set_name), str(set_description)]
            fields.extend(sorted(str(gene) for gene in genes))
            f.write("\t".join(fields) + "\n")

In [25]:
def main(cluster_path, gene_set_path):
    """
    Run the whole workflow and return the loaded gene sets.

    Steps: load the gene sets, load the cluster table, print the overlap
    between the cluster genes and the gene-set genes, then write the combined
    gene-set file.
    """
    loaded_sets, all_genes = load_gene_set(gene_set_path)
    clusters = load_cluster(cluster_path)
    intersection(clusters, all_genes)
    output_combined_gmt_file(loaded_sets)
    return loaded_sets

In [14]:
# for items, values in gene_sets.items():
#     print(items, values[0])
#     for items in values[1]:
#         print(items)
#     break

In [28]:
# Run the workflow with the user inputs and keep the loaded gene sets.
gene_sets = main(cluster_path=cluster_path, gene_set_path=gene_set_path)

The path provided does not contain geneset: ex.gmt
Loaded 46 gene sets
Total number of genes: 821
Total number of unique genes in genesets: 209
Intersecting genes:
{'PTGER4', 'CXADR', 'GJA1', 'AGT', 'COL5A1', 'HEG1', 'BVES', 'ADM', 'DLL1', 'NKX2-5', 'EDN1', 'PAM', 'SLC8A3', 'KCNH2', 'PLK1', 'HEY2', 'FHOD3', 'ADAMTS1', 'STC1', 'RDH10', 'CCND1', 'PLN', 'CLIC2', 'SOX11', 'MYOCD', 'ERBB3', 'ADAP2', 'GFRA3', 'TBX2', 'FOXC2', 'CDKN1A', 'SCN5A', 'MYH7', 'PDGFB', 'NRG1', 'CXCR4', 'DSP', 'MYLK', 'NPPC', 'ITPR2', 'S1PR1', 'GADD45B', 'MSX1', 'COL11A1', 'XIRP1', 'ATP2A2', 'GADD45A', 'CCNA2', 'CORIN', 'SOX17', 'TRDN', 'CLDN5', 'AVPR1A', 'NRP1', 'BMP2', 'MYL2', 'NRP2', 'FGF9', 'ACE', 'MEF2C', 'FGF12', 'CHRM2', 'MAD2L1', 'ZC3H12A', 'KCNJ2', 'MYC', 'CDKN2C', 'KCNA5', 'SHOX2', 'ALPK3', 'VEGFA', 'PCNA', 'CTNNA3', 'NOTCH3', 'FHL2', 'SMYD1', 'HPGD', 'PHOX2B', 'CALCRL', 'AKAP6', 'IRX5', 'PDLIM3', 'TNNI1', 'PDGFRB', 'RNF207', 'TCAP', 'PLXNA4', 'BUB1', 'AGTR2', 'NTRK3', 'SOX18', 'DUSP6', 'HES1', 'MYLK3', 'MS