In [1]:
import random
import networkx as nx

In [2]:
# Load the graph from a GraphML file (note the file uses a ".nx" extension) and
# collect each node's "bipartite" attribute into a {node: value} dict.
# NOTE(review): the path is relative to the notebook's working directory.
# NOTE(review): `b` is not referenced by any other cell visible here; the
# WeightedBipartiteGraph class reads the attribute again for itself.
G = nx.read_graphml("export/post2tag.nx")
b = nx.get_node_attributes(G, "bipartite")

In [30]:
from typing import List, Tuple, Mapping

class WeightedBipartiteGraph:
    """Weighted neighbour lookups on a bipartite networkx graph.

    Every edge is expected to carry a "weight" attribute. Nodes are split into
    two partitions by their "bipartite" node attribute: 1 marks a "top" node
    and 0 marks a "bottom" node.
    """

    def __init__(self, G):
        """Wrap ``G`` and cache its per-node "bipartite" attribute.

        :param G: the networkx graph to wrap
        :raises AssertionError: if ``G`` is not bipartite
        """
        self.G = G
        self.bipartite = nx.get_node_attributes(G, "bipartite")

        # check that graph is bipartite so that the algorithm will work as expected
        assert nx.bipartite.is_bipartite(self.G), "Graph is not bipartite"

    def get_weight(self, a, b) -> int:
        """Return the "weight" attribute of the edge between ``a`` and ``b``.

        :raises KeyError: if the edge does not exist or has no "weight"
        """
        return self.G[a][b]["weight"]

    def weighted_hop(self, node: str, k: int = 5) -> List[Tuple[str, int]]:
        """Return up to ``k`` direct neighbours of ``node``, heaviest edge first.

        :param node: node to start from
        :param k: maximum number of neighbours to return
        :return: ``(neighbour, weight)`` pairs sorted by descending weight;
            an empty list when ``node`` is not in the graph
        """
        if node not in self.G:
            return []

        weighted_neighbors = [
            (neighbor, self.get_weight(node, neighbor))
            for neighbor in self.G[node]
        ]
        # The sort is stable, so equally weighted neighbours keep the graph's
        # adjacency order.
        weighted_neighbors.sort(key=lambda pair: pair[1], reverse=True)
        return weighted_neighbors[:k]

    def weighted_two_hop(self, node: str, k: int = 5) -> List[Tuple[str, int]]:
        """Return up to ``k`` nodes reachable in two hops, highest score first.

        Only the ``k`` heaviest neighbours are followed at each hop. A two-hop
        node's score is the sum, over every followed path that reaches it, of
        the two edge weights on that path.

        :param node: node to start from
        :param k: fan-out at each hop and maximum number of results
        :return: ``(node, score)`` pairs sorted by descending score
        """
        # Plain dict: the scores are accumulated in place.
        neighbors_and_weights: dict = {}
        for n, w in self.weighted_hop(node, k):
            for nn, ww in self.weighted_hop(n, k):
                # The start node is dropped only after the second hop has been
                # truncated to k, so it may occupy one of those k slots.
                if nn == node:
                    continue
                neighbors_and_weights[nn] = neighbors_and_weights.get(nn, 0) + (w + ww)

        return sorted(
            neighbors_and_weights.items(),
            key=lambda x: x[1],
            reverse=True
        )[:k]

    def get_top_nodes(self) -> List[str]:
        """Return the nodes whose "bipartite" attribute equals 1.

        Nodes without a "bipartite" attribute belong to neither partition and
        are skipped.
        """
        return [node for node in self.G if self.bipartite.get(node) == 1]

    def get_bottom_nodes(self) -> List[str]:
        """Return the nodes whose "bipartite" attribute equals 0.

        Nodes without a "bipartite" attribute belong to neither partition and
        are skipped.
        """
        return [node for node in self.G if self.bipartite.get(node) == 0]

In [31]:
# Wrap the loaded graph; the constructor asserts that G is bipartite.
g = WeightedBipartiteGraph(G)

In [40]:
# Cache the two-hop neighbours (default k=5) of every node whose "bipartite"
# attribute is 1, keyed by that node.
precomputed = dict()
for tag in g.get_top_nodes():
    precomputed[tag] = g.weighted_two_hop(tag)



In [42]:
# Spot-check the cached two-hop result for a single node.
precomputed['obi-wan kenobi']

[('star wars', 6),
 ('anakin skywalker', 4),
 ('darth vader', 2),
 ('art', 2),
 ('clone trooper army', 2)]

In [10]:
# Build the dataset object from a DataFrame.
# NOTE(review): `Dataset`, `tags_by_user_df` and `reader` are not defined in any
# cell visible here — confirm they are created earlier on a fresh kernel.
data = Dataset.load_from_df(tags_by_user_df, reader=reader)

In [11]:
# Configure a KNN model with cosine similarity computed between items.
# NOTE(review): the saved output of this cell is
# "TypeError: __init__() got an unexpected keyword argument 'sim_options'".
# `KNNBasic` and `cross_validate` are not imported in any cell visible here —
# confirm which class `KNNBasic` resolves to and re-run on a fresh kernel.
sim_options = {
    "name": "cosine",
    "user_based": False,  # compute similarities between items
}
algo = KNNBasic(sim_options=sim_options)

# Run 5-fold cross-validation and print results.
# NOTE(review): later cells read `algo.trainset`; presumably that is whatever
# split the model was last fitted on — confirm a fit on the full data happens
# before neighbour lookups.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

TypeError: __init__() got an unexpected keyword argument 'sim_options'

In [33]:
def get_neighbors(algo, x, k=5):
    """Return the raw ids of the ``k`` nearest neighbours of item ``x``.

    :param algo: object exposing ``trainset`` (with ``to_inner_iid`` and
        ``to_raw_iid``) and ``get_neighbors(inner_id, k)``
    :param x: raw item id to look up
    :param k: number of neighbours to request
    :return: list of raw item ids, in the order ``algo.get_neighbors`` yields them
    """
    trainset = algo.trainset
    inner_id = trainset.to_inner_iid(x)
    # Forward the caller's k; a hard-coded 5 here would silently ignore it.
    return [
        trainset.to_raw_iid(neighbor_inner_id)
        for neighbor_inner_id in algo.get_neighbors(inner_id, k)
    ]

In [35]:
get_neighbors(algo, "jolyne cujoh")

['giorno giovanna',
 'josuke higashikata',
 'feng xin',
 'so true',
 'kitten sized']

In [55]:
import math

def post_df_to_dataset(post_df: pd.DataFrame, min_tag_count: int, verbose=True):
    """
    Build the (user, item, rating) frame that the Surprise library expects.

    In our case:
     - "users" are the blog_url values
     - "items" are the tags (each row's "tags" and "root_tags" combined)
     - "ratings" are the number of times a tag occurs across a blog_url's rows

    A tag listed in both "tags" and "root_tags" of the same row counts twice.

    :param post_df: frame with "blog_url", "tags" and "root_tags" columns; the
        two tag columns must hold values that can be joined with ``+`` and
        iterated (e.g. lists)
    :param min_tag_count: keep only (blog_url, tag) pairs counted at least
        this many times
    :param verbose: currently unused; kept so existing callers keep working
    :return: DataFrame with the columns "user", "item", "rating"
    """
    tag_counts_by_blog_url = defaultdict(int)
    # Walk the three columns directly; iterrows() would build a Series per row.
    for blog_url, tags, root_tags in zip(
        post_df["blog_url"], post_df["tags"], post_df["root_tags"]
    ):
        for tag in tags + root_tags:
            tag_counts_by_blog_url[(blog_url, tag)] += 1

    records = [
        {
            "user": blog_url,
            "item": tag,
            "rating": count
        }
        for (blog_url, tag), count in tag_counts_by_blog_url.items()
        if count >= min_tag_count
    ]

    # this column order is assumed by the Surprise library
    return pd.DataFrame(records, columns=["user", "item", "rating"])

In [57]:
# Build the raw (un-normalised) count table, keeping every pair (min_tag_count=0),
# ordered from the highest count to the lowest.
# NOTE(review): `prep_df` is not defined in any cell visible here — confirm.
unnorm_df = post_df_to_dataset(prep_df, min_tag_count=0).sort_values("rating", ascending=False)

In [79]:
# Min-max scale the ratings to [0, 5], round up, and count rows per bucket.
# Only rows equal to the minimum rating land in bucket 0; all others fall in 1..5.
# NOTE(review): the scaling divides by (max - min), which is 0 when every rating
# is identical.
# NOTE(review): `np` is not imported in any cell visible here — confirm.
rating = unnorm_df.rating
from collections import Counter
Counter(np.ceil((rating - rating.min()) / (rating.max() - rating.min()) * 5))

Counter({5.0: 3, 4.0: 5, 3.0: 36, 2.0: 108, 1.0: 109446, 0.0: 867877})