In [1]:
import pandas as pd

from Utils.Data_retrieval import load_dataset
from Utils.Utils import normalize_meshId
from Utils.NodeRank import GraphRanker
from Metrics.Coccurrences_class import Co_occurrencesGraph


ModuleNotFoundError: No module named 'Graph_class'

# Phase 1: Retrieving relevant documents

In [None]:
# Load the two frames used throughout the notebook: the document texts and the
# entities tagged in them (both are described in the sections that follow).
df_texts, df_entities = load_dataset()

### df_texts

Contains the PubMed documents, represented in 2 columns:
- pmid: PubMed document id.
- text: text of the document (or only the abstract).

In [None]:
# Preview the first five documents.
df_texts.head(n=5)

### df_entities
Contains the entities found in the PubMed documents, represented in 8 columns, including:
- id: entity id.
- pmid: pubmed document id.
- mention: mention of entity inside the text.
- obj: tag assigned to the entity.
- prob: probability of tag assignment.
- span_begin: index of first character in the entity inside the document.
- span_end: index of the last character in the entity inside the document.

In [None]:
# Preview the first five entity rows.
df_entities.head(n=5)

In [None]:
# How many entities carry each tag, most frequent tag first.
df_entities.loc[:, 'obj'].value_counts()

## Test set creation

The SCMFDD dataset is used to test the knowledge graph. It contains three kinds of files: diseases, drugs, and disease-drug associations.

The disease-drug associations dataframe (`scmff_dda`) contains:
- drug_id: id of drug
- drug_name: plain name of drug
- disease_id: id of disease
- disease_name: plain name of disease

In [None]:
# Folder holding the SCMFDD-L files, relative to the notebook's working directory.
SCMFF_DIR = 'dataset/SCMFDD/SCMFDD-L/'

# One frame per SCMFDD file: the drug-disease associations, the drugs and the
# diseases.
scmff_dda = pd.read_csv(SCMFF_DIR + 'drug-disease association.csv')
scmff_drugs = pd.read_csv(SCMFF_DIR + 'drug.csv')
scmff_diseases = pd.read_csv(SCMFF_DIR + 'disease.csv')

# Display the association table.
scmff_dda

# Phase 2: Apply preprocessing

We drop the rows of `df_entities` whose probability is null or not greater than 0.8, and then discard the `prob` column.

In [None]:
# Tag probability an entity must exceed to be kept.
MIN_ENTITY_PROB = 0.8

# Guarded so the cell can be re-run: once 'prob' has been dropped below, the
# filter has already been applied and a second pass would raise KeyError.
if 'prob' in df_entities.columns:
    # Keep only rows with a known probability strictly above the threshold.
    is_confident = df_entities['prob'].notna() & (df_entities['prob'] > MIN_ENTITY_PROB)
    # .copy() yields an independent frame, so adding columns to it later does
    # not trigger pandas' SettingWithCopyWarning.
    df_entities = df_entities.loc[
        is_confident,
        ['id', 'pmid', 'mention', 'obj', 'span_begin', 'span_end'],
    ].copy()

df_entities.head()

Add a boolean column `source` that is `True` for each disease entity in `df_entities` whose id is also present in the test set.

In [None]:
# Disease ids present in the SCMFDD disease file. The array keeps its original
# name; the set gives constant-time membership checks in the filter below
# instead of a linear scan of the array for every entity id.
scmff_diseases_labels = scmff_diseases['disease_id'].unique()
scmff_disease_id_set = set(scmff_diseases_labels)

# Raw (un-normalized) ids of the disease entities whose normalized id also
# appears among the SCMFDD diseases.
# NOTE(review): the SCMFDD ids are compared as loaded, without passing through
# normalize_meshId — confirm they are already in normalized form.
is_disease = df_entities['obj'] == 'disease'
union_diseases = [
    entity_id
    for entity_id in df_entities.loc[is_disease, 'id'].unique()
    if normalize_meshId(entity_id) in scmff_disease_id_set
]

# assign() builds a new frame rather than writing a column into a filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df_entities = df_entities.assign(
    source=is_disease & df_entities['id'].isin(union_diseases)
)

# Number of entity rows flagged as test-set diseases.
df_entities['source'].sum()

In [None]:
# Bring both id columns of the association table to the normalized form.
# assign() returns a new frame, so re-running the cell never writes into the
# filtered slice produced at the bottom of this cell.
scmff_dda = scmff_dda.assign(
    drug_id=scmff_dda['drug_id'].apply(normalize_meshId),
    disease_id=scmff_dda['disease_id'].apply(normalize_meshId),
)

# Materialized as a list: a bare map() object is a one-shot iterator that is
# left exhausted (and therefore unusable) after the isin() call below.
normalized_union_diseases = [normalize_meshId(disease_id) for disease_id in union_diseases]

# Keep only the associations whose disease also occurs among the entities.
scmff_dda = scmff_dda[scmff_dda['disease_id'].isin(normalized_union_diseases)]

In [None]:
# Lookup table of the known associations: every (drug_id, disease_id) pair in
# scmff_dda maps to True.
test_set = dict.fromkeys(
    zip(scmff_dda['drug_id'], scmff_dda['disease_id']),
    True,
)

In [None]:
# Preview the first five entity rows as they stand before the graph is built.
df_entities.head(n=5)

In [None]:
# Build the co-occurrence graph from the entity frame, then fill its adjacency
# matrix.
# NOTE(review): k=20 is presumably the co-occurrence window/threshold — confirm
# against Co_occurrencesGraph.populate_adj_matrix.
occurrences_k_graph = Co_occurrencesGraph(df_entities)
occurrences_k_graph.populate_adj_matrix(k=20)

In [None]:
# Report the graph's statistics; what is reported is defined by
# Co_occurrencesGraph.statistics.
occurrences_k_graph.statistics()

In [None]:
# Draw an example of the graph.
# NOTE(review): which part of the graph is drawn is decided inside
# draw_example — confirm there.
occurrences_k_graph.draw_example()

# Phase 4.1: Ranking

In [None]:

# Wrap the co-occurrence graph in a GraphRanker; the cells below use it to
# print the node and edge rankings.
occur_graph_ranker = GraphRanker(occurrences_k_graph)

Top 10 Nodes

In [None]:
# Print the node ranking, limited to 10 entries via max_.
occur_graph_ranker.print_nodes_rank(max_=10)

Top 10 Edges

In [None]:
# Print the edge ranking, limited to 10 entries via max_.
occur_graph_ranker.print_edges_rank(max_=10)