In [35]:
from tqdm import tqdm
from pyrdf2vec import RDF2VecTransformer
from pyrdf2vec.graphs import KG, Vertex
from pyrdf2vec.embedders import FastText,Word2Vec
from pyrdf2vec.walkers import RandomWalker
from pyvis.network import Network
from sklearn.manifold import TSNE
from umap import UMAP
from SPARQLWrapper import SPARQLWrapper
from jcopml.plot import plot_missing_value

import plotly.express as px
import pandas as pd
import numpy as np
import requests
import os
import networkx as nx
import matplotlib.pyplot as plt

from modul.vectorReferenced import get_taxon_vector,cek_ncbi_id_by_wiki_id_via_string
from modul.filterNodeEdge import removeNodeAndEdgeByFilter,removeEdgesNotInNodes
from modul.helper_umum import contains_string_entire_column,contains_string_entire_column_boolean, minmax, std_scale
#from process import cek_bfs, nx_to_pyviz
from modul.grafHelper import _set_networkx_graph, _plot_nx_by_matplotlib
from modul.visualisasiHelper import embeddingPlot,plotly_graph
from modul.embeddingHelper import df_serangga_to_rdf, rdf_KG_to_embeddings, df_to_dictionary_taxon
from modul.custom_degree_centrality import degree_centrality_custom

In [36]:
# Evaluation cases. Each tuple is
#   (dataset file prefix, virus name, reference insect family, expected vector species).
# NOTE(review): species capitalisation is inconsistent ('Bemisia Tabaci' vs
# 'Bemisia tabaci', 'Thrips Palmi' vs 'Thrips palmi') — confirm the taxon lookup
# is case-insensitive before normalising these strings.
data=[
    # ('begomovirus_contoh_hasil','Pepper yellow leaf curl virus','Aleyrodidae','Bemisia Tabaci'),
    ('1cucu','Cucumber mosaic virus','Aphididae','Myzus persicae'),
    ('2cri','Tomato chlorosis virus','Aleyrodidae','Bemisia Tabaci'),
    ('3wai','Maize chlorotic dwarf virus','Cicadellidae','Graminella nigrifrons'),
    ('4beg','Tomato yellow leaf curl China virus','Aleyrodidae','Bemisia Tabaci'),
    ('5pol','Cereal yellow dwarf virus','Aphididae','Schizaphis graminum'),
    ('6pea','Pea enation mosaic virus 1','Aphididae','Acyrthosiphon pisum'),
    ('7cucur','Cucurbit yellow stunting disorder virus','Aleyrodidae','Bemisia Tabaci'),
    ('8ten','Rice stripe tenuivirus','Delphacidae','Laodelphax striatellus'),
    ('9fiji','Southern rice black-streaked dwarf virus','Delphacidae','Sogatella furcifera'),
    ('10capchlo','Capsicum chlorosis orthotospovirus','Thripidae','Thrips Palmi'),
    ('11barley','Barley yellow dwarf virus GAV','Aphididae','Sitobion avenae'),
    ('12tospot','Tomato spotted wilt orthotospovirus','Thripidae','Frankliniella occidentalis'),
    ('13svyv','squash vein yellowing virus','Aleyrodidae','Bemisia Tabaci'),
    ('14sbmv','soybean mosaic virus','Aphididae','Aphis glycines'),
    ('15blv','bean leafroll virus','Aphididae','Acyrthosiphon pisum'),
    ('16rgdv','rice gall dwarf virus','Cicadellidae','Recilia dorsalis'), # little data
    ('17srbsdv','southern rice black-streaked dwarf virus','Delphacidae','Sogatella furcifera'),
    ('18tsrv','tomato severe rugose virus','Aleyrodidae','Bemisia tabaci'),
    ('19gbnv','groundnut bud necrosis virus','Thripidae','Thrips palmi'),
    ('20wbnv','Watermelon bud necrosis virus','Thripidae','Thrips palmi'),
    # the entries below raise errors
    # ('+13Poty','Potyvirus','Aphididae','Myzus'),
    # ('+11tung','Tungrovirus','Nilaparvata','Nilaparvata'),
]

# Default case (first entry), unpacked for interactive single-case runs.
data_,nama_virus,acuan_,ujian_=data[0]

# SPARQL endpoint of the NCBI ontology. Defaults to the local server; set the
# NCBI_ONTOLOGY_URL environment variable to point at another endpoint.
ncbi_ontology_url = os.environ.get('NCBI_ONTOLOGY_URL', 'http://localhost:3030/mydataset/query')

In [37]:
def all_proses(data_,nama_virus,acuan_,ujian_,ncbi_ontology_url,top_n=10):
    """Run the whole vector-prediction pipeline for one virus dataset and score it.

    Steps:
      1. Read ``dari_praproses/<data_>_node.csv`` and ``..._edge.csv``.
      2. Drop insect nodes that have neither genus nor species, fill the
         remaining missing taxonomy ranks with placeholders, connect the main
         virus to every node matching ``acuan_`` and drop all other viruses.
      3. Build the graph and compute the custom degree centrality (DC) of the
         insects relative to the main virus.
      4. Embed the insects plus one synthetic reference entity and compute the
         Euclidean distance (ED) of every insect to that reference.
      5. Rank insects by ``(1 + DC_scaled) / (1 + ED_scaled)`` and compare the
         taxonomy of the ``top_n`` best candidates with the expected vector.

    Parameters
    ----------
    data_ : str
        File prefix of the pre-processed node/edge CSV files.
    nama_virus : str
        Virus name; not used by the computation, kept for the caller's
        tuple layout.
    acuan_ : str
        Reference insect taxon (a family name in the notebook's cases); used
        to select reference nodes and to fetch the reference taxon vector.
    ujian_ : str
        Expected vector taxon the prediction is checked against.
    ncbi_ontology_url : str
        SPARQL endpoint passed to ``get_taxon_vector``.
    top_n : int, default 10
        How many top-ranked candidates are checked.

    Returns
    -------
    float
        Best fraction of matching taxonomy ranks among the checked candidates
        (``0.0`` when there is no candidate at all).

    Raises
    ------
    ValueError
        If reference insects exist but the node table has no main virus, or
        if the reference entity is missing from the embedded entities.
    """
    entitas_acuan = "SERANGGA_ACUAN"

    # 1. Read the pre-processed data.
    df_node=pd.read_csv('dari_praproses/'+data_+'_node.csv',index_col=0)
    df_edge=pd.read_csv('dari_praproses/'+data_+'_edge.csv',index_col=0)

    # Drop insects known only down to family level (same depth as the
    # reference): a family-only prediction carries no information.
    filter_genus_sampai_species_null=(
        df_node.genus.isnull()
        & df_node.species.isnull()
        & (df_node['class']=='NCBI:50557_Insecta')
    )
    df_node,df_edge = removeNodeAndEdgeByFilter(df_node[filter_genus_sampai_species_null], df_node,df_edge)

    # Patch missing ranks with a placeholder "<neighbouring rank value>^<rank>".
    # Every rank borrows from the rank above it; 'superkingdom' has none above,
    # so it borrows from 'kingdom'.
    kolom_takson=[
        'superkingdom','kingdom','phylum','class','order','family','genus','species'
    ]
    for pos,rank in enumerate(kolom_takson):
        sumber = kolom_takson[pos-1] if pos>0 else kolom_takson[pos+1]
        kosong = df_node[rank].isnull()
        if kosong.any():
            # .to_numpy() makes the assignment positional, so a non-unique
            # index cannot break the alignment.
            df_node.loc[kosong,rank] = (df_node.loc[kosong,sumber]+'^'+rank).to_numpy()

    # Link the (single) main virus to every reference insect. Needed because
    # otherwise some datasets (e.g. 2cri) score zero.
    # `== True` is deliberate: it is kept from the original in case the column
    # holds NaN or non-boolean values.
    virus_utama=df_node[df_node.virus_utama==True].taxon_id.to_list()
    serangga_acuan=contains_string_entire_column(df_node,acuan_).taxon_id.to_list()
    print(len(df_edge))
    if serangga_acuan:
        if not virus_utama:
            raise ValueError(f"no main virus (virus_utama == True) found in dataset '{data_}'")
        # Built in one go instead of one concat per row; reversed() keeps the
        # row order the former prepend-in-a-loop produced.
        edge_baru = pd.DataFrame([
            {'source_taxon_id':virus_utama[0],'target_taxon_id':taxon_id,'interaction_type':'pathogenOf'}
            for taxon_id in reversed(serangga_acuan)
        ])
        df_edge = pd.concat([edge_baru, df_edge], ignore_index = True)
    print(len(df_edge))

    # NOTE: this arguably belongs in pre-processing; whether to keep it is
    # still undecided. Removes every virus that is not the main virus. Last
    # measured accuracy with this step: 0.85 — drop it if accuracy decreases.
    bukan_virus_utama=(df_node['group']=="virus") & (df_node.virus_utama!=True)
    df_node,df_edge = removeNodeAndEdgeByFilter(df_node[bukan_virus_utama], df_node,df_edge)

    if(len(df_node[df_node['group']=="serangga"])<=2):
        print("cuma dua serangga")

    # 3. Convert to a graph.
    gnx = _set_networkx_graph(df_node, df_edge)

    # 4. Custom degree centrality of the insects relative to the main virus.
    virus_utama_ids=list(df_node[df_node['virus_utama']==True].taxon_id)
    serangga_ids=list(df_node[df_node['group']=="serangga"].taxon_id)
    results_dc = degree_centrality_custom(gnx,virus_utama_ids,serangga_ids, print_relasi=False)
    allnodes = gnx.nodes

    # 5. Fetch the NCBI taxon vectors of the reference and of the expected vector.
    data_acuan=get_taxon_vector(acuan_,ncbi_ontology_url,False)
    print(data_acuan)
    data_ujian=get_taxon_vector(ujian_,ncbi_ontology_url)
    print(data_ujian)

    # 6. Convert the insect nodes (plus the reference) to an RDF knowledge graph.
    URL = "http://pyRDF2Vec"
    df_serangga = df_node[df_node['group']=="serangga"]
    CUSTOM_KG = df_serangga_to_rdf(df_serangga, URL, data_acuan)

    # 7. Embed every insect; the synthetic reference entity goes last.
    list_serangga=df_node[df_node['group']=="serangga"].taxon_id.to_list()
    list_of_entities = [ URL+"#"+taxon_id for taxon_id in list_serangga ]
    list_of_entities.append(f"{URL}#{entitas_acuan}")
    transformer, embeddings, _ = rdf_KG_to_embeddings(CUSTOM_KG, list_of_entities)

    # NOTE(review): result is unused in this function; call kept in case the
    # helper has side effects — confirm and remove.
    dictionary_serangga = df_to_dictionary_taxon(df_serangga)

    # 8. Euclidean distance of every insect to the reference entity.
    # The embedding width is taken from the data instead of assuming 100, and
    # the reference row is located by entity id rather than by row position.
    vektor = np.asarray(embeddings, dtype=float)
    entitas = [e.replace(f"{URL}#","") for e in transformer._entities]
    vektor_acuan = vektor[entitas.index(entitas_acuan)]
    data_to_count = pd.DataFrame({
        'entity': entitas,
        'ed_result': np.linalg.norm(vektor - vektor_acuan, axis=1),
    })

    # The reference itself is not a candidate.
    data_to_count = data_to_count[data_to_count['entity']!=entitas_acuan].copy()
    data_to_count['dc_result'] = [float(results_dc[e]) for e in data_to_count['entity']]

    # Bring both measures onto a comparable scale.
    data_to_count['ed_result_scaled'] = minmax(data_to_count['ed_result'])
    data_to_count['dc_result_scaled'] = minmax(data_to_count['dc_result'])

    # 9. Combined score: high centrality and small distance rank first. The +1
    # on both sides avoids division by zero; the score is then normalised by
    # its maximum.
    data_to_count['result'] = (1+data_to_count['dc_result_scaled']) / (1+data_to_count['ed_result_scaled'])
    data_to_count['result'] = data_to_count['result'] / data_to_count['result'].max()

    # 10. Evaluation: best candidates first.
    data_to_count=data_to_count.sort_values('result',ascending=False).reset_index(drop=True)

    jumlah_kandidat = min(top_n, len(data_to_count))
    if len(data_to_count) < top_n:
        print(f"data kurang dari {top_n}")

    # Only the ranks below class level are compared.
    takson_uji=[i[0] for i in data_ujian if i[0] not in ["superkingdom","kingdom","filum","kelas"]]
    cek_ujian= { k:v for k,v in data_ujian if k in takson_uji }

    kembalian=[]
    for posisi in range(jumlah_kandidat):
        id_hasil=data_to_count['entity'].iloc[posisi]
        cek_hasil= { k:v for k,v in reversed(allnodes[id_hasil].items()) if k in takson_uji }
        print('hasil ',cek_hasil)
        # Fraction of compared ranks on which candidate and expectation agree.
        cocok = sum(cek_hasil[rank]==cek_ujian[rank] for rank in takson_uji)
        skor = cocok/len(takson_uji)
        print(skor)
        kembalian.append(skor)

    # Best match among the checked candidates; 0.0 when nothing could be checked.
    return max(kembalian, default=0.0)

In [38]:
# Run the pipeline once per evaluation case and keep the score under the
# dataset's file prefix.
hasil_pengujian = {}
for data_, nama_virus, acuan_, ujian_ in data:
    print("untuk data : ", data_)
    hasil_pengujian[data_] = all_proses(
        data_, nama_virus, acuan_, ujian_, ncbi_ontology_url
    )

untuk data :  1cucu
removeNodeAndEdgeByFilter
sebelum : 1819 3164
sesudah : 1805 3119
3119
3133
removeNodeAndEdgeByFilter
sebelum : 1805 3133
sesudah : 987 1698
[('family', 'NCBI:27482_Aphididae'), ('order', 'NCBI:7524_Hemiptera'), ('class', 'NCBI:50557_Insecta'), ('phylum', 'NCBI:6656_Arthropoda'), ('kingdom', 'NCBI:33208_Metazoa'), ('superkingdom', 'NCBI:2759_Eukaryota')]
[('spesies', 'NCBI:13164_Myzus persicae'), ('genus', 'NCBI:13163_Myzus'), ('famili', 'NCBI:27482_Aphididae'), ('ordo', 'NCBI:7524_Hemiptera'), ('kelas', 'NCBI:50557_Insecta'), ('filum', 'NCBI:6656_Arthropoda'), ('kingdom', 'NCBI:33208_Metazoa'), ('superkingdom', 'NCBI:2759_Eukaryota')]
hasil  {'spesies': 'EOL:3689195_Aphisglycines', 'genus': 'NCBI:80764_Aphis <genus>', 'famili': 'NCBI:27482_Aphididae', 'ordo': 'NCBI:7524_Hemiptera'}
0.5
hasil  {'spesies': 'NCBI:13131_Macrosiphum euphorbiae', 'genus': 'NCBI:13130_Macrosiphum', 'famili': 'NCBI:27482_Aphididae', 'ordo': 'NCBI:7524_Hemiptera'}
0.5
hasil  {'spesies': 'NC

In [39]:
hasil_pengujian

{'1cucu': 0.5,
 '2cri': 1.0,
 '3wai': 0.25,
 '4beg': 1.0,
 '5pol': 1.0,
 '6pea': 1.0,
 '7cucur': 1.0,
 '8ten': 1.0,
 '9fiji': 0.5,
 '10capchlo': 1.0,
 '11barley': 1.0,
 '12tospot': 1.0,
 '13svyv': 1.0,
 '14sbmv': 0.75,
 '15blv': 1.0,
 '16rgdv': 0.5,
 '17srbsdv': 0.5,
 '18tsrv': 1.0,
 '19gbnv': 1.0,
 '20wbnv': 1.0}

In [40]:
# Mean score over all evaluated datasets.
nilai = [skor for skor in hasil_pengujian.values()]
sum(nilai) / len(nilai)

0.85

In [20]:
import plotly.graph_objects as go

# Average precision per tolerance range.
# NOTE(review): values are pasted in by hand; presumably measured with the
# top-N cut-off set to 1, 3, 5 and 10 — confirm their provenance.
labels = [1, 3, 5, 10]
values = [0.7142857142857143, 0.7857142857142857, 0.7976190476190477, 0.8809523809523809]

# One line trace with a marker on every measured point.
garis_presisi = go.Scatter(
    x=labels,
    y=values,
    mode='lines+markers',
    line={'color': '#636cfb', 'width': 2},
)
fig = go.Figure(data=garis_presisi)

# Treat the x values as categories so the ticks are evenly spaced.
fig.update_xaxes(
    title_text='Tolerance Range of Predicted Insect',
    type='category',
    tickvals=labels,
    ticktext=labels,
)
fig.update_yaxes(title_text='Average Precision')
fig.update_layout(title_text='Diagram Presisi pada tiap Range Prediksi')

# Render the figure.
fig.show()
