# Dataframe

This notebook merges all the BEL files contained in the curation folder and produces a dataframe summarizing all entity and interaction types in HemeMap.

In [None]:
import os
import pybel

from pybel.struct import union 
import pandas as pd
from pandas.plotting import table
import matplotlib.pyplot as plt

from pybel.struct.summary import count_functions, edge_summary

import heme_knowledge

In [None]:
# The quoted literal '__file__' has no directory component, so dirname()
# yields '' and HERE resolves to the current working directory
# (presumably chosen because notebooks do not define __file__ -- confirm).
HERE = os.path.abspath(os.path.dirname('__file__'))

# HERE is already absolute, so normalising the joined path is all that is
# needed to collapse the parent-directory step into a clean absolute path.
BEL_DIRECTORY = os.path.normpath(os.path.join(HERE, os.pardir, 'hememap'))

def get_bel_files():
    """Return the full path of every entry in the BEL directory.

    No filtering by extension is applied: each name reported by
    ``os.listdir(BEL_DIRECTORY)`` is joined onto ``BEL_DIRECTORY``.

    :return: paths of the entries found in ``BEL_DIRECTORY``
    :rtype: list
    """
    bel_paths = []
    for entry_name in os.listdir(BEL_DIRECTORY):
        bel_paths.append(os.path.join(BEL_DIRECTORY, entry_name))
    return bel_paths

In [None]:
def get_files_in_folder(path):
    """List the entries of a folder.

    :param path: folder path
    :return: names of the entries found directly inside the folder
    """
    # os.listdir already hands back a fresh list of names, so it can be
    # returned directly.
    return os.listdir(path)

In [None]:
def get_bel_types(bel_path):
    """Collect node, edge, function and relation counts for one BEL file.

    The graph size is printed as soon as it is known, before the
    per-function and per-relation counts are added.

    :param str bel_path: path to the BEL file to load
    :return: counts keyed by 'nodes', 'edges', and by each BEL function
        and relation reported for the graph
    :rtype: dict
    """
    graph = pybel.from_path(bel_path)

    stats = {
        'nodes': graph.number_of_nodes(),
        'edges': graph.number_of_edges(),
    }
    print(stats)

    # Fold in the BEL function counts first, then the relation counts.
    stats.update(count_functions(graph))
    stats.update(edge_summary.count_relations(graph))

    return stats

In [None]:
# Maps the raw keys of a per-graph statistics dict to the column names used
# in the summary table. The insertion order defines the column order.
BEL_STATS_COLUMN_NAMES = {
    # Graph size
    'nodes': 'Nodes',
    'edges': 'Edges',
    # Node types (presumably the keys produced by count_functions)
    'Protein': 'Proteins',
    'Gene': 'Genes',
    'RNA': 'RNA Entities',
    'Complex': 'Complexes',
    'Abundance': 'Compounds',
    'BiologicalProcess': 'Biological Processes',
    'Pathology': 'Pathology',
    'Reaction': 'Reactions',
    # Edge types (presumably the keys produced by count_relations)
    'increases': 'Increase Relations',
    'decreases': 'Decrease Relations',
    'association': 'Association Relations',
    'hasComponent': 'Component Edges',
    'hasVariant': 'Variant Edges',
    'hasReactant': 'Reactants Edges',
    'hasProduct': 'Products Edges',
    'positiveCorrelation': 'Positive Correlation',
    'negativeCorrelation': 'Negative Correlation',
    'causesNoChange': 'CauseNoChange Relations',
    'directlyIncreases': 'Directly Increase Relations',
}

In [None]:
def get_bel_stats(resource_folder):
    """Build a table of BEL node and edge type counts, one row per file.

    Every entry of ``resource_folder`` is loaded with ``get_bel_types`` and
    its statistics are renamed through ``BEL_STATS_COLUMN_NAMES``. Statistics
    that a file does not report are filled with 0.

    :param str resource_folder: path to BEL folder
    :return: counts of all node and edge types, indexed by file name, with
        one column per value of ``BEL_STATS_COLUMN_NAMES``
    :rtype: pandas.DataFrame
    :raises KeyError: if a file reports a statistic that has no entry in
        ``BEL_STATS_COLUMN_NAMES``
    """
    column_names = list(BEL_STATS_COLUMN_NAMES.values())

    pathway_names = []
    rows = []
    for file in get_files_in_folder(resource_folder):
        bel_statistics_dict = get_bel_types(os.path.join(resource_folder, file))

        all_bel_statistics = {
            BEL_STATS_COLUMN_NAMES[key]: value
            for key, value in bel_statistics_dict.items()
        }
        print(all_bel_statistics)

        pathway_names.append(file)
        rows.append(all_bel_statistics)

    # Build the frame once from all rows: DataFrame.append was removed in
    # pandas 2.0, and growing a frame row by row copies it on every step.
    df = pd.DataFrame(rows, index=pathway_names, columns=column_names)

    # Columns missing from a given file come out as NaN; turn them into 0
    # before casting so that the whole table holds integers.
    return df.fillna(0).astype(int)

In [None]:
# Build the statistics table from every entry found in BEL_DIRECTORY.
heme_statistics_df = get_bel_stats(BEL_DIRECTORY)

In [None]:
# Bare expression: shows the table as the notebook cell output.
heme_statistics_df

In [None]:
#heme_statistics_df.to_csv("heme_statistics.csv", sep='\t', encoding='utf-8')