In [None]:
from Data_retrieval import load_dataset
from NodeRank import GraphRanker
from Metrics.Coccurrences_class import Co_occurrencesGraph
from Metrics.Word2Vec_class import Word2VecGraph
from Utils import preprocess_text

from tqdm.notebook import tqdm
# Register tqdm's pandas integration so that `progress_apply` is available
# on DataFrames (it is used later when preprocessing the texts).
tqdm.pandas()

# Phase 1: Retrieving relevant documents

In [None]:
# Load the two tables used throughout the notebook: the document texts and
# the entities annotated in them.
df_texts, df_entities = load_dataset()

### df_texts

Contains PubMed documents, represented in 2 columns:
- pmid: pubmed document id.
- text: text of the document (or only the abstract).

In [None]:
# Preview the first rows of the documents table.
df_texts.head()

### df_entities
Contains the entities found in the PubMed documents. The columns used in this notebook are:
- id: entity id.
- pmid: pubmed document id.
- mention: mention of entity inside the text.
- obj: tag assigned to the entity.
- prob: probability of tag assignment.
- span_begin: index of first character in the entity inside the document.
- span_end: index of the last character in the entity inside the document.

In [None]:
# Preview the first rows of the entities table (before any filtering).
df_entities.head()

In [None]:
# Count how many entity rows carry each tag (column 'obj').
df_entities['obj'].value_counts()

# Phase 2: Apply preprocessing

We delete rows in 'df_entities' whose probability is null or not strictly greater than 0.8 (only rows with prob > 0.8 are kept), and then drop the 'prob' column.

In [None]:
# Keep only confidently tagged entities: a row survives when its probability
# is present and strictly greater than 0.8. Only the six columns listed below
# are kept, which drops 'prob' now that it has served as the filter.
has_confident_tag = df_entities['prob'].notna() & df_entities['prob'].gt(0.8)
df_entities = df_entities.loc[
    has_confident_tag,
    ['id', 'pmid', 'mention', 'obj', 'span_begin', 'span_end'],
]

df_entities.head()

In [None]:
# Preview the filtered entities table.
# NOTE(review): the previous cell already ends with this same preview, so
# this cell looks redundant.
df_entities.head()

In [None]:
# Build the co-occurrence graph from the filtered entities, then populate
# its adjacency matrix.
# NOTE(review): k=20 is presumably the co-occurrence window size — confirm
# against Co_occurrencesGraph.populate_adj_matrix.
occurrences_k_graph = Co_occurrencesGraph(df_entities)
occurrences_k_graph.populate_adj_matrix(k=20)

In [None]:
# Report statistics for the co-occurrence graph (the exact output is defined
# by Co_occurrencesGraph.statistics).
occurrences_k_graph.statistics()

In [None]:
# Draw an example from the co-occurrence graph (what gets drawn is decided
# by Co_occurrencesGraph.draw_example).
occurrences_k_graph.draw_example()

# Phase 4.1: Ranking

In [None]:
# Wrap the co-occurrence graph in a ranker; it is used below to print the
# node and edge rankings.
occur_graph_ranker = GraphRanker(occurrences_k_graph)

Top 10 Nodes

In [None]:
# Print the node ranking of the co-occurrence graph; max_=10 presumably
# limits the output to the top 10 entries.
occur_graph_ranker.print_nodes_rank(max_=10)

Top 10 Edges

In [None]:
# Print the edge ranking of the co-occurrence graph; max_=10 presumably
# limits the output to the top 10 entries.
occur_graph_ranker.print_edges_rank(max_=10)

# Variant: Word2Vec

In [None]:
# Map every distinct entity id to a placeholder token 'unique_<i>', where <i>
# is the position of that id in df_entities['id'].unique().
# The loop variable is named `entity_id` rather than `id` so that the builtin
# `id()` is not shadowed.
id_to_wuid = {
    entity_id: f'unique_{i}'
    for i, entity_id in enumerate(df_entities['id'].unique())
}

# Show the first 10 (entity id, token) pairs as a quick sanity check.
list(id_to_wuid.items())[:10]

In [None]:
# preprocess_text(...) is called once here; the object it returns is what
# gets applied to every row of df_texts (axis=1), with a progress bar.
row_preprocessor = preprocess_text(df_entities, id_to_wuid)
processed_rows = df_texts.progress_apply(row_preprocessor, axis=1)

# Collect the per-row results into a plain Python list.
texts = processed_rows.tolist()

In [None]:
# Build the Word2Vec-based graph from the filtered entities and the
# preprocessed texts, then populate its adjacency matrix with the
# hyper-parameters below.
# NOTE(review): the keyword names resemble gensim's Word2Vec options
# (min_count, vector_size, window, sg, epochs); presumably they are forwarded
# to it — confirm in Word2VecGraph.populate_adj_matrix.
w2v_graph = Word2VecGraph(df_entities, texts)
w2v_graph.populate_adj_matrix(
    min_count=2,
    vector_size=100,
    window=10,
    sg=True,  # presumably selects skip-gram rather than CBOW — TODO confirm
    learning_rate=0.1,
    epochs=50,
)

In [None]:
# Report statistics for the Word2Vec graph (the exact output is defined by
# Word2VecGraph.statistics).
w2v_graph.statistics()

In [None]:
# Draw an example from the Word2Vec graph (what gets drawn is decided by
# Word2VecGraph.draw_example).
w2v_graph.draw_example()

In [None]:
# Wrap the Word2Vec graph in a ranker; it is used below to print the node
# and edge rankings.
w2v_graph_ranker = GraphRanker(w2v_graph)

Top 10 Nodes

In [None]:
# Print the node ranking of the Word2Vec graph, limited to 10 entries.
# The keyword is spelled `max_`, as in the call made on the co-occurrence
# ranker earlier in this notebook; with a regular signature, `max=` would be
# rejected as an unexpected keyword argument.
w2v_graph_ranker.print_nodes_rank(max_=10)

Top 10 Edges

In [None]:
# Print the edge ranking of the Word2Vec graph, limited to 10 entries.
# The keyword is spelled `max_`, as in the call made on the co-occurrence
# ranker earlier in this notebook; with a regular signature, `max=` would be
# rejected as an unexpected keyword argument.
w2v_graph_ranker.print_edges_rank(max_=10)