In [1]:
import pandas as pd
import math
import numpy as np
import stellargraph as sg
import xgboost as xgb
import category_encoders as ce
import pprint
import pickle
import os
from collections import OrderedDict, defaultdict
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
from tensorflow import keras
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_regression
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, link_classification
from stellargraph.data import UniformRandomWalk
from stellargraph.data import UnsupervisedSampler
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.utils import plot_history

  import pandas.util.testing as tm


In [2]:
renew_df= pd.read_csv('../data/renewal_scoring_raw.csv')

In [3]:
account_df = renew_df[['SALES_ACCOUNT_UUID']].rename(columns={'SALES_ACCOUNT_UUID': 'uuid'})
location_df = renew_df[['LOCATION_UUID']].rename(columns={'LOCATION_UUID': 'uuid'})
account_df['type'] = 0
location_df['type'] = 1
node_df = pd.concat([account_df, location_df], axis=0).reset_index(drop=True)
node_df = node_df[~node_df.duplicated('uuid')].set_index('uuid')
                                                    

In [4]:
edge_df = renew_df[['SALES_ACCOUNT_UUID', 'LOCATION_UUID', 'TOTAL_GROSS_PRICE']]
edge_df = edge_df.groupby(['SALES_ACCOUNT_UUID', 'LOCATION_UUID']).agg('mean').reset_index(drop=False)
edge_df = edge_df.rename(columns={'TOTAL_GROSS_PRICE': 'weight',
                                'SALES_ACCOUNT_UUID': 'source',
                                 'LOCATION_UUID': 'target'})
edge_df['weight'] = edge_df.apply(lambda row: math.log(row['weight']), axis=1)


In [5]:
batch_size = 64
epochs = 5
num_samples = [10, 10, 5]
number_of_walks = 1
length = 3
layer_sizes = [64, 64, 32]

graph = sg.StellarGraph(node_df, edge_df)
nodes = list(graph.nodes())


unsupervised_samples = UnsupervisedSampler(
    graph, nodes=nodes, length=length, number_of_walks=number_of_walks
)


generator = GraphSAGELinkGenerator(graph, batch_size, num_samples, weighted=True)
train_gen = generator.flow(unsupervised_samples)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0, normalize="l2"
)
x_inp, x_out = graphsage.in_out_tensors()
prediction = link_classification(
    output_dim=1, output_act="sigmoid", edge_embedding_method="ip"
)(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)
history = model.fit(
    train_gen,
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)

x_inp_src = x_inp[0::2]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)
node_embeddings = graph_predict(nodes_feat.index, graph, embedding_model)

link_classification: using 'ip' method to combine node embeddings into edge embeddings
  ...
    to  
  ['...']
Train for 2639 steps
Epoch 1/5
  70/2639 [..............................] - ETA: 34:20 - loss: 0.8119 - binary_accuracy: 0.4973

KeyboardInterrupt: 

In [32]:
class UF:
    """An implementation of union find data structure.
    It uses weighted quick union by rank with path compression.
    """

    def __init__(self, N):
        """Initialize an empty union find object with N items.

        Args:
            N: Number of items in the union find object.
        """

        self._id = list(range(N))
        self._count = N
        self._rank = [0] * N

    def find(self, p):
        """Find the set identifier for the item p."""

        id = self._id
        while p != id[p]:
            p = id[p] = id[id[p]]   # Path compression using halving.
        return p

    def count(self):
        """Return the number of items."""

        return self._count

    def connected(self, p, q):
        """Check if the items p and q are on the same set or not."""

        return self.find(p) == self.find(q)

    def union(self, p, q):
        """Combine sets containing p and q into a single set."""

        id = self._id
        rank = self._rank

        i = self.find(p)
        j = self.find(q)
        if i == j:
            return

        self._count -= 1
        if rank[i] < rank[j]:
            id[i] = j
        elif rank[i] > rank[j]:
            id[j] = i
        else:
            id[j] = i
            rank[i] += 1

    def __str__(self):
        """String representation of the union find object."""
        return " ".join([str(x) for x in self._id])

    def __repr__(self):
        """Representation of the union find object."""
        return "UF(" + str(self) + ")"


N = renew_df.SALES_ACCOUNT_UUID.nunique() + renew_df.LOCATION_UUID.nunique()
uf = UF(N)
vocab = {}
for acc_uid, loc_uid in zip(renew_df['SALES_ACCOUNT_UUID'].tolist(), renew_df['LOCATION_UUID'].tolist()):
    if not acc_uid in vocab:
        vocab[acc_uid] = len(vocab)
    if not loc_uid in vocab:
        vocab[loc_uid] = len(vocab)
    aid = vocab[acc_uid]
    lid = vocab[loc_uid]
    if not uf.connected(aid, lid):
        uf.union(aid, lid)
    

In [33]:
len([i for i, root in enumerate(uf._id) if i == root])

85