In [4]:
import pyspark
from pyspark.sql import *
from pyspark import SparkContext, SparkConf, StorageLevel
from pyspark.sql.types import *
from graphframes import *
from pyspark.sql.functions import collect_set, expr
from tqdm import tqdm
import random
import community
import numpy as np
from typing import Dict
%load_ext autotime

time: 295 µs (started: 2022-05-25 12:20:30 +02:00)


In [5]:
sc = pyspark.SparkContext("local[*]")
spark = SparkSession \
    .builder\
   .appName('BigClam-ISEBEL')\
    .getOrCreate()
SparkContext.setSystemProperty('spark.executor.memory', '8g')
SparkContext.setSystemProperty('spark.driver.memory', '8g')
SparkContext.setSystemProperty('spark.sql.shuffle.partitions', '4')
ss = SparkSession(sc)

time: 178 ms (started: 2022-05-25 12:20:31 +02:00)


In [6]:
def create_graph_frames():
    """
    Import nodes and edge csv files and create graphframes with them
    """
    vertices_fields = [
        StructField("id",IntegerType(), True),
        StructField("label",StringType(), True),   
        StructField("type",StringType(), True)
    ]
    vertices = (spark.read.option("mode","DROPMALFORMED")
                .option("columnNameOfCorruptRecord","corrupt_record")
                .csv("/data/wossidia-nodes.csv",
                     header=True,schema=StructType(vertices_fields)))

    edge_fields = [
        StructField("id",IntegerType(), True),
        StructField("src",StringType(), True),   
        StructField("dst",StringType(), True),
        StructField("label",StringType(), True)
    ]
    edges = (spark.read.option("mode","DROPMALFORMED")
                .option("columnNameOfCorruptRecord","corrupt_record")
                .csv("/data/wossidia-edges.csv",
                     header=True,schema=StructType(edge_fields)))

    return GraphFrame(vertices, edges)

time: 1.56 ms (started: 2022-05-25 12:20:32 +02:00)


In [7]:
K = create_graph_frames().vertices.select('type').distinct().count()

29

time: 4.97 s (started: 2022-05-25 12:20:34 +02:00)


In [8]:
def process(entry):
    revisedEntries= entry[0].split(',')
    return (revisedEntries[0], revisedEntries[1])

time: 719 µs (started: 2022-05-25 12:20:39 +02:00)


In [9]:
def generate_community_users(user_business_map, filter_threshold):
    nearby_users_map = {}
    users = user_business_map.keys()

    for u1 in users:
        related_users = set()
        for u2 in users:
            if u1 != u2:
                u1_businesses = set(user_business_map.get(u1))
                u2_businesses = set(user_business_map.get(u2))
                common_businesses = u1_businesses.intersection(u2_businesses)

                if len(common_businesses) >= filter_threshold:
                    related_users.add(u2)
        if len(related_users) > 0:
            nearby_users_map.update({u1:related_users})

    return nearby_users_map

time: 2.08 ms (started: 2022-05-25 12:20:39 +02:00)


In [10]:
def get_neighbors():
    input_file_path = "/data/wossidia-edges.csv"
    filter_threshold = 1
    user_businessRdd = sc.textFile(input_file_path)\
                    .map(lambda entry: entry.split('\n'))\
                    .map(lambda entry: process(entry))
    headers = user_businessRdd.take(1)
    finalRdd = user_businessRdd.filter(lambda entry: entry[0] != headers[0][0]).persist()
    user_business_map = finalRdd\
            .groupByKey()\
            .mapValues(lambda entry: list(set(entry)))\
            .collectAsMap()
    nearby_users_map = generate_community_users(user_business_map, filter_threshold)
    return nearby_users_map

time: 1.83 ms (started: 2022-05-25 12:20:41 +02:00)


In [11]:
neighbors = get_neighbors()

time: 1.94 s (started: 2022-05-25 12:20:43 +02:00)


In [12]:
def bfs(graph, node):
    
    visited = []
    
    queue = [node]
 
    while queue:
        
        node = queue.pop(0)
        if node not in visited:
            
            visited.append(node)
            try:
                neighbours = graph[node]
            except KeyError as e:
                continue
            
            for neighbour in neighbours:
                queue.append(neighbour)
    return visited

time: 1.05 ms (started: 2022-05-25 12:20:45 +02:00)


In [13]:
class BigClamISEBEL(object):
    
    def __init__(
        self,
        dimensions: int = K,
        iterations: int = 50,
        learning_rate: int = 0.005,
        seed: int = 42,
    ):
        self.dimensions = dimensions
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.seed = seed
    
    def _initialize_features(self, number_of_nodes):
        
        self._embedding = np.random.uniform(0, 1, (number_of_nodes, self.dimensions))
        self._global_features = np.sum(self._embedding, axis=0)
        
    def _calculate_gradient(self, node_feature, neb_features):
        
        raw_scores = node_feature.dot(neb_features.T)
        raw_scores = np.clip(raw_scores, -15, 15)
        scores = np.exp(-raw_scores) / (1 - np.exp(-raw_scores))
        scores = scores.reshape(-1, 1)
        neb_grad = np.sum(scores * neb_features, axis=0)
        without_grad = (
            self._global_features - node_feature - np.sum(neb_features, axis=0)
        )
        grad = neb_grad - without_grad
        
        return grad
    
    def _do_updates(self, node, gradient, node_feature):
        
        self._embedding[node] = self._embedding[node] + self.learning_rate * gradient
        self._embedding[node] = np.clip(self._embedding[node], 0.00001, 10)
        self._global_features = (
            self._global_features - node_feature + self._embedding[node]
        )
        
    def get_memberships(self) -> Dict[int, int]:
        
        indices = np.argmax(self._embedding, axis=1)
        memberships = {i: membership for i, membership in enumerate(indices)}
        
        return memberships
    
    def get_embedding(self) -> np.array:
        
        embedding = self._embedding
        return embedding
    
    def get_graph_nodes(self,graph):
        nodes = graph.vertices.rdd.map(lambda x: x.id).collect()
        self.graph_nodes = nodes
    
    def neighbours(self,graph):
        connected = graph.connectedComponents()
        connected.persist(StorageLevel.DISK_ONLY)
        group = connected.select("*").groupby("component").agg(
                collect_set('id').alias('nodes')).sort("component")
        self.group = group
        
    def get_neighbours(self,nodeid):
        group = self.group
        node_1 = group.select('nodes').where("component="''+nodeid+'')
        node_2 = node_1.rdd.map(lambda x: x.nodes).collect()
        return node_2[0]

    def fit(self, graph):
        
        self.graph = graph
        #self.neighbours(graph)
        self.get_graph_nodes(graph)
        
        number_of_nodes = graph.vertices.count()
        self._initialize_features(number_of_nodes)
        nodes = [node for node in self.graph_nodes]
        for i in range(self.iterations):
            random.shuffle(nodes)
            for node in nodes:
                try:
                    nebs = [neb for neb in (bfs(neighbors,str(node)))]
                    neb_features = self._embedding[nebs, :]
                    node_feature = self._embedding[node, :]
                except IndexError as e:
                    continue
                gradient = self._calculate_gradient(node_feature, neb_features)
                self._do_updates(node, gradient, node_feature)

time: 6.13 ms (started: 2022-05-25 12:20:47 +02:00)


In [14]:
big = BigClamISEBEL()
graph = create_graph_frames()
big.fit(graph)

time: 1.25 s (started: 2022-05-25 12:20:50 +02:00)


In [15]:
membership = big.get_memberships()

time: 969 µs (started: 2022-05-25 12:20:53 +02:00)


In [16]:
print(membership)

{0: 0, 1: 2, 2: 1, 3: 2, 4: 1, 5: 4, 6: 5, 7: 4, 8: 2, 9: 3, 10: 5, 11: 4, 12: 5, 13: 3, 14: 1, 15: 6, 16: 6, 17: 7, 18: 0, 19: 2, 20: 7, 21: 7, 22: 4, 23: 3, 24: 0, 25: 3, 26: 1, 27: 4, 28: 4, 29: 3, 30: 5, 31: 7, 32: 2, 33: 6, 34: 4, 35: 2, 36: 7, 37: 5, 38: 3, 39: 6, 40: 6, 41: 4, 42: 6, 43: 2, 44: 1, 45: 5, 46: 7, 47: 2, 48: 2, 49: 2, 50: 7, 51: 0, 52: 6, 53: 0, 54: 7, 55: 6, 56: 1, 57: 4, 58: 0, 59: 5, 60: 0, 61: 3, 62: 6, 63: 7, 64: 6, 65: 5, 66: 3, 67: 3, 68: 2, 69: 2, 70: 0, 71: 6, 72: 5, 73: 0, 74: 1, 75: 3, 76: 1, 77: 3, 78: 4, 79: 3, 80: 1, 81: 0, 82: 6, 83: 7, 84: 4, 85: 5, 86: 3, 87: 7, 88: 7, 89: 0, 90: 0, 91: 2, 92: 4, 93: 7, 94: 2, 95: 1, 96: 1, 97: 7, 98: 7, 99: 2, 100: 7, 101: 4, 102: 0, 103: 4, 104: 6, 105: 5, 106: 0, 107: 1, 108: 2, 109: 5, 110: 4, 111: 7, 112: 5, 113: 7, 114: 0, 115: 2, 116: 5, 117: 7, 118: 5, 119: 6, 120: 7, 121: 1, 122: 4, 123: 0, 124: 6, 125: 7, 126: 1, 127: 2, 128: 0, 129: 0, 130: 5, 131: 3, 132: 5, 133: 7, 134: 0, 135: 5, 136: 6, 137: 4, 138: 

In [17]:
import numpy as np

time: 394 µs (started: 2022-05-25 12:20:58 +02:00)


In [18]:
nodes = graph.vertices.rdd.map(lambda x: x.id).collect()
edges = graph.edges.select("id").rdd.map(lambda x: x.id).collect()

time: 201 ms (started: 2022-05-25 12:21:01 +02:00)


In [19]:
memberships = [v for k, v in membership.items()]
nodes = [node for node in nodes]
edges = [edge for edge in edges]
print(memberships)

[0, 2, 1, 2, 1, 4, 5, 4, 2, 3, 5, 4, 5, 3, 1, 6, 6, 7, 0, 2, 7, 7, 4, 3, 0, 3, 1, 4, 4, 3, 5, 7, 2, 6, 4, 2, 7, 5, 3, 6, 6, 4, 6, 2, 1, 5, 7, 2, 2, 2, 7, 0, 6, 0, 7, 6, 1, 4, 0, 5, 0, 3, 6, 7, 6, 5, 3, 3, 2, 2, 0, 6, 5, 0, 1, 3, 1, 3, 4, 3, 1, 0, 6, 7, 4, 5, 3, 7, 7, 0, 0, 2, 4, 7, 2, 1, 1, 7, 7, 2, 7, 4, 0, 4, 6, 5, 0, 1, 2, 5, 4, 7, 5, 7, 0, 2, 5, 7, 5, 6, 7, 1, 4, 0, 6, 7, 1, 2, 0, 0, 5, 3, 5, 7, 0, 5, 6, 4, 2, 5, 3, 1, 0, 4, 4, 2, 6, 7, 7, 1, 6, 4, 4, 7, 3, 7, 7, 2, 6, 7, 0, 7, 7, 2, 4, 0, 4, 6, 4, 5, 0, 0, 2, 6, 1, 0, 5, 2, 1, 6, 0, 1, 3, 0, 7, 0, 1, 6, 1, 1, 4, 3, 5, 1, 3, 4, 5, 7, 0, 0, 0, 0, 3, 3, 7, 1, 7, 0, 0, 6, 3, 3, 5, 6, 1, 4, 1, 4, 0, 3, 4, 3, 6, 2, 5, 1, 1, 1, 6, 6, 0, 4, 4, 0, 2, 5, 4, 3, 5, 3, 3, 5, 5, 6, 1, 0, 1, 1, 4, 5, 1, 1, 6, 2, 5, 0, 1, 7, 3, 2, 2, 0, 5, 1, 6, 5, 7, 4, 4, 4, 7, 1, 4, 0, 0, 1, 1, 0, 4, 6, 1, 3, 0, 1, 3, 7, 4, 6, 7, 0, 2, 6, 6, 1, 5, 3, 1, 3, 3, 6, 4, 1, 3, 0, 3, 1, 0, 1, 0, 5, 6, 4, 2, 6, 1, 3, 7, 5, 7, 1, 3, 6, 1, 3, 6, 4, 4, 6, 7, 6, 6, 4, 4, 

In [20]:
print(edges)

[1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,

In [21]:
dictA = dict(zip(nodes,memberships))

time: 746 µs (started: 2022-05-25 12:21:09 +02:00)


In [22]:
print(dictA)

{1: 0, 2: 2, 3: 1, 4: 2, 5: 1, 6: 4, 7: 5, 8: 4, 9: 2, 12: 3, 13: 5, 14: 4, 15: 5, 16: 3, 17: 1, 18: 6, 19: 6, 20: 7, 21: 0, 22: 2, 23: 7, 24: 7, 27: 4, 28: 3, 29: 0, 30: 3, 31: 1, 34: 4, 35: 4, 36: 3, 37: 5, 38: 7, 41: 2, 42: 6, 43: 4, 44: 2, 45: 7, 48: 5, 49: 3, 50: 6, 51: 6, 52: 4, 53: 6, 54: 2, 55: 1, 56: 5, 58: 7, 59: 2, 60: 2, 61: 2, 64: 7, 65: 0, 66: 6, 67: 0, 68: 7, 74: 6, 75: 1, 78: 4, 79: 0, 80: 5, 81: 0, 82: 3, 85: 6, 86: 7, 87: 6, 94: 5, 95: 3, 96: 3, 97: 2, 98: 2, 100: 0, 101: 6, 102: 5, 103: 0, 104: 1, 107: 3, 108: 1, 109: 3, 110: 4, 111: 3, 112: 1, 113: 0, 118: 6, 119: 7, 120: 4, 121: 5, 122: 3, 123: 7, 124: 7, 127: 0, 128: 0, 129: 2, 130: 4, 131: 7, 134: 2, 135: 1, 138: 1, 139: 7, 140: 7, 141: 2, 142: 7, 146: 4, 147: 0, 148: 4, 149: 6, 150: 5, 151: 0, 153: 1, 154: 2, 155: 5, 158: 4, 159: 7, 161: 5, 164: 7, 165: 0, 166: 2, 167: 5, 168: 7, 169: 5, 170: 6, 171: 7, 172: 1, 175: 4, 176: 0, 177: 6, 178: 7, 181: 1, 182: 2, 183: 0, 186: 0, 187: 5, 188: 3, 189: 5, 190: 7, 191: 0

In [23]:
import csv

with open('/output/result-wossidia-minned-nodes.csv', 'w', newline='') as csvfile:
    header_key = ['id', 'memberships']
    new_val = csv.DictWriter(csvfile, fieldnames=header_key)

    new_val.writeheader()
    for new_k in dictA:
        new_val.writerow({'id': new_k, 'memberships': dictA[new_k]})

time: 5.44 ms (started: 2022-05-25 12:21:16 +02:00)


In [3]:
sc.stop()