# Data Visualization - Sankey Diagram

Author: Bruno Conde Costa da Silva.

### Sankey Diagram 02
Mirnas and Species.

In [2]:
# Libraries and paths

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Get current work directory and input files directory
dir_cwd = os.getcwd()
dir_input = os.path.join(dir_cwd, 'input')

# Get miRDB predictions directory
dir_predictions = os.path.join(dir_input, 'miRDB_prediction_files')
dir_predictions

# Get miRDB prediction files
list_prediction_files = os.listdir(dir_predictions)
list_prediction_files

['Plasmodium falciparum.xlsx',
 'Plasmodium vivax P01.xlsx',
 'Plasmodium ovale.xlsx',
 'Plasmodium knowlesi.xlsx',
 'Plasmodium reichenowi CDC.xlsx',
 'Plasmodium inui San Antonio 1.xlsx',
 'Plasmodium malariae.xlsx',
 'Plasmodium cynomolgi strain B.xlsx']

## Getting Data

In [3]:
# storage mirnas 100 names for all species, removing duplicates mirnas,
# and count the number of ocorrences of each mirna

# iterate over predictions
list_df = []
for index, prediction in enumerate(list_prediction_files):
    df = pd.read_excel(open(os.path.join(dir_predictions, prediction), 'rb'), sheet_name='Planilha1')
    list_df.append(df)

labels = []

list_source = []
list_target = []
list_value = []

for elem in list_prediction_files:
    specie = elem.split(sep='.')[0]
    if not specie in labels:
        labels.append(specie)

    
i = 0
for df in list_df:
    df_mirnas_100 = df.groupby('nomes 100').size()
    specie = df['Organismo'][0]        

    for index in df_mirnas_100.index:
        mirna = index
        num_ocorrences = df_mirnas_100.loc[index]

        if not mirna in labels:
            labels.append(mirna)

#         if num_ocorrences > 10:

#         list_source.append(labels.index(mirna))
#         list_target.append(labels.index(specie))
#         list_value.append(num_ocorrences)        
        list_source.append(mirna)
        list_target.append(specie)
        list_value.append(num_ocorrences)
        
        list_index_source.append(labels.index(mirna))
        list_index_target.append(labels.index(target))        
        
        with open(os.path.join(os.getcwd(), 'output', 'sankey_data.txt'), 'a') as f:
            f.write(f"['{mirna}', '{specie}', {num_ocorrences}],")
            f.close()
        
        
        labels
        i += 1                

In [4]:
"""
dataframe 01: source, target, value
"""    

dictionary = {"source":list_source,
              "target":list_target,
              "value":list_value}

df_final = pd.DataFrame(dictionary)
df_final.head()
path = os.path.join(os.getcwd(), 'output', 'df_google_sheets_sankey_formated.xlsx')
df_final.to_excel(path, sheet_name='Sheet01', index = False)

In [1]:
source = 'x'
target = 'y'
value = 10
f"['{source}', '{target}', '{value}',]"

"['x', 'y', '10',]"

In [19]:
import pandas as pd
import holoviews as hv
from holoviews import opts, dim
from bokeh.sampledata.les_mis import data

hv.extension('bokeh')
hv.output(size=200)

# links = pd.DataFrame(data['links'])
# print(links.head(3))
print(df_final.head(3))

hv.Chord(df_final)

            source                 target  value
0    hsa-let-7c-3p  Plasmodium falciparum      2
1     hsa-miR-1-5p  Plasmodium falciparum      1
2  hsa-miR-1229-3p  Plasmodium falciparum      1


In [18]:
# nodes = hv.Dataset(df_final[['source', 'target', 'value']], 'index')
# nodes.data.head()

# df1 = pd.DataFrame(list_source)
# df2 = pd.DataFrame(list_target)

# nodes = hv.Dataset(df_final[['source', 'target', 'value']], 'index')

nodes = hv.Dataset(df_final[['source'], ])

['hsa-let-7c-3p',
 'hsa-miR-1-5p',
 'hsa-miR-1229-3p',
 'hsa-miR-1250-3p',
 'hsa-miR-126-5p',
 'hsa-miR-1261',
 'hsa-miR-1266-3p',
 'hsa-miR-1267',
 'hsa-miR-1273g-5p',
 'hsa-miR-1277-5p',
 'hsa-miR-1279',
 'hsa-miR-130a-5p',
 'hsa-miR-136-5p',
 'hsa-miR-137',
 'hsa-miR-141-5p',
 'hsa-miR-153-5p',
 'hsa-miR-154-3p',
 'hsa-miR-16-2-3p',
 'hsa-miR-190a-3p',
 'hsa-miR-195-3p',
 'hsa-miR-197-3p',
 'hsa-miR-205-5p',
 'hsa-miR-2053',
 'hsa-miR-210-3p',
 'hsa-miR-215-3p',
 'hsa-miR-216b-3p',
 'hsa-miR-218-2-3p',
 'hsa-miR-22-5p',
 'hsa-miR-223-5p',
 'hsa-miR-224-3p',
 'hsa-miR-2355-3p',
 'hsa-miR-23a-3p',
 'hsa-miR-23b-3p',
 'hsa-miR-23c',
 'hsa-miR-296-3p',
 'hsa-miR-297',
 'hsa-miR-3065-5p',
 'hsa-miR-3074-5p',
 'hsa-miR-3115',
 'hsa-miR-3121-3p',
 'hsa-miR-3123',
 'hsa-miR-3133',
 'hsa-miR-3149',
 'hsa-miR-3161',
 'hsa-miR-3163',
 'hsa-miR-3182',
 'hsa-miR-335-3p',
 'hsa-miR-338-5p',
 'hsa-miR-33a-3p',
 'hsa-miR-340-5p',
 'hsa-miR-34c-3p',
 'hsa-miR-3529-3p',
 'hsa-miR-3613-3p',
 'hsa-miR-

In [17]:
chord = hv.Chord((df_final, nodes)).select(value=(5, None))
chord.opts(
    opts.Chord(cmap='Category20', edge_cmap='Category20', edge_color=dim('source').str(), 
               labels='name', node_color=dim('index').str()))

TypeError: '<' not supported between instances of 'int' and 'str'