In [7]:
import pydgraph
import pandas as pd
import numpy as np
import json
from scipy.sparse import coo_matrix
import networkx as nx

In [10]:


def create_client():
    """Create and return a Dgraph client."""
    client_stub = pydgraph.DgraphClientStub('localhost:9080')  # Update with your Dgraph server address
    return pydgraph.DgraphClient(client_stub)

def fetch_graph_edges(client, offset=0, limit=10000):
    """Fetch graph edges from Dgraph in batches."""
    query = f"""
    {{
      edges(func: has(edge), first: {limit}, offset: {offset}) {{
        uid
        edge {{ uid }}
      }}
    }}
    """
    txn = client.txn(read_only=True)
    try:
        response = txn.query(query)
        return json.loads(response.json)
    finally:
        txn.discard()

def compute_pagerank(num_nodes, edges, alpha=0.85, max_iter=100, tol=1e-6):
    """Compute PageRank using a sparse adjacency matrix."""
    rows, cols = zip(*edges)
    values = np.ones(len(edges))
    # use COO (Coordinate List) to build sparce matrix 
    # COO is often used for constructing sparse matrices, but it is less memory efficient than CSR (Compressed Sparse Row) 
    # 
    adj_matrix = coo_matrix((values, (rows, cols)), shape=(num_nodes, num_nodes))
    out_degree = np.array(adj_matrix.sum(axis=1)).flatten()
    out_degree[out_degree == 0] = 1  # Avoid division by zero
    stochastic_matrix = adj_matrix.multiply(1 / out_degree[:, None])

    # Initialize PageRank values
    pagerank = np.ones(num_nodes) / num_nodes
    for _ in range(max_iter):
        new_pagerank = alpha * stochastic_matrix.dot(pagerank) + (1 - alpha) / num_nodes
        if np.linalg.norm(new_pagerank - pagerank, ord=1) < tol:
            break
        pagerank = new_pagerank
    return pagerank

def main():
    client = create_client()
    
    # Fetch data in batches and process
    offset = 0
    limit = 10000
    edges = []
    while True:
        data = fetch_graph_edges(client, offset=offset, limit=limit)
        if not data['edges']:
            break
        for edge in data['edges']:
            from_uid = edge['uid']
            to_uids = [to['uid'] for to in edge['edge']]
            edges.extend((from_uid, to_uid) for to_uid in to_uids)
        offset += limit
    
    # Extract unique nodes and map them to indices
    unique_nodes = list(set([e[0] for e in edges] + [e[1] for e in edges]))
    node_to_index = {node: i for i, node in enumerate(unique_nodes)}
    num_nodes = len(unique_nodes)

    # Map edges to indices
    indexed_edges = [(node_to_index[from_uid], node_to_index[to_uid]) for from_uid, to_uid in edges]
    print(indexed_edges)
    G = nx.from_edgelist(indexed_edges)
    pagerank_scores = nx.pagerank(G, alpha=0.85, max_iter=100, tol=1e-6)
    print(pagerank_scores)
    # Compute PageRank
    pagerank_scores = compute_pagerank(num_nodes, indexed_edges)
    
    # Print or store results
    for node, score in zip(unique_nodes, pagerank_scores):
        print(f"Node {node}: PageRank Score {score}")

main()

[(6, 3), (6, 8), (6, 2), (5, 7), (5, 8), (5, 2), (9, 3), (9, 8), (1, 8), (1, 2), (0, 3), (0, 6), (0, 5), (0, 9), (0, 1), (0, 7), (0, 4), (0, 8), (0, 2), (4, 3), (4, 7), (8, 7), (8, 2), (2, 3), (2, 7)]
{6: 0.08131032931520808, 3: 0.10178603295551947, 8: 0.13553515444121514, 2: 0.13455018408126865, 5: 0.08096615556059335, 7: 0.09976244603340291, 9: 0.06497208456270827, 1: 0.06400653297713094, 0: 0.1716373607031675, 4: 0.06547371936978569}
Node 0x2755: PageRank Score 0.040315980902777784
Node 0x2753: PageRank Score 0.040890468750000006
Node 0x2758: PageRank Score 0.027750000000000004
Node 0x274f: PageRank Score 0.015000000000000003
Node 0x2756: PageRank Score 0.027750000000000004
Node 0x2751: PageRank Score 0.0365103125
Node 0x2750: PageRank Score 0.0365103125
Node 0x2754: PageRank Score 0.015000000000000003
Node 0x2757: PageRank Score 0.03316875000000001
Node 0x2752: PageRank Score 0.035471718750000006


In [10]:
!pip install birankpy
import birankpy



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
df = pd.read_csv('/Users/raphaelderbier/GitHub/proj-devolver-twitch/data/data-1K/game_played.csv')
df

Unnamed: 0,avg_ccv,count_times_streamed,last_streamed,minutes_streamed,minutes_watched,twitch_game_id,twitch_id
0,12,5,2024-12-19 14:48:13.940 -0800,197,2352,27284,44494543
1,11,1,2024-12-04 14:39:20.167 -0800,14,154,80607,44494543
2,10,1,2023-10-24 13:16:06.000 -0700,35,361,2692,44494543
3,15,1,2024-03-04 17:56:03.621 -0800,36,529,512710,48207490
4,16,1,2024-04-25 21:53:01.554 -0700,12,187,27471,48207490
...,...,...,...,...,...,...,...
8117,11,2,2024-11-11 00:05:01.638 -0800,58,622,456845141,277014659
8118,11,1,2024-05-27 08:11:02.327 -0700,5,55,31376,277014659
8119,12,16,2024-11-29 13:37:03.425 -0800,2980,34570,509660,59741163
8120,9,1,2024-01-31 09:39:03.151 -0800,100,936,512923,59741163


In [13]:
bn = birankpy.BipartiteNetwork()

bn.set_edgelist(df,  top_col='twitch_id', bottom_col='twitch_game_id')

top_birank_df, bottom_birank_df = bn.generate_birank()
# top sorted by birank score

top_birank_df.sort_values('twitch_id_birank', ascending=False)

Unnamed: 0,twitch_id,twitch_id_birank
0,44494543,0.000056
1,48207490,0.000591
2,161122945,0.000500
3,237322139,0.000104
4,97315700,0.000101
...,...,...
995,495592766,0.000608
996,759677939,0.000103
997,277014659,0.002908
998,59741163,0.000296
