In [1]:
!pip3 install findspark

Collecting findspark
  Downloading https://files.pythonhosted.org/packages/73/8d/d2e876e358be7ee77f6f8c3dbb5b1728a268939ea6438ca0b774a6b5080a/findspark-1.4.1-py2.py3-none-any.whl
Installing collected packages: findspark
Successfully installed findspark-1.4.1


In [2]:
!pip3 install networkx



In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.3.4/spark-2.3.4-bin-hadoop2.7.tgz
!tar xf spark-2.3.4-bin-hadoop2.7.tgz

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.4-bin-hadoop2.7"

In [0]:
import findspark
findspark.init('')

import pyspark
import random

In [0]:
import networkx as nx
import pandas as pd
import sys
import time
from networkx.algorithms.clique import find_cliques

In [0]:
df = pd.read_csv('/content/finaledges.csv', skiprows = 0, header=0, names=["from","to"], engine='python')
G = nx.from_pandas_edgelist(df, source='from', target='to')
count = 1
start = time.time()
with open('/content/clique_out.csv', 'w') as outfile: 
    for i in find_cliques(G):
        outfile.write(str(i)+'\n')
        if count % 100 == 0:
            print("Written out ", count, "cliques")
            print("Time elapsed ", time.time() - start)

In [0]:
from collections import defaultdict
import sys
import pyspark
from pyspark import SparkContext, SparkConf
import json

In [0]:
conf = SparkConf().setAppName("clique_percolation")
conf.set("spark.driver.memory", "8g")
conf.set("spark.executor.memory", "8g")
conf.set("spark.network.timeout","4000s")
conf.set("spark.executor.heartbeatInterval","3600s")
conf.set("spark.storage.blockManagerSlaveTimeoutMs","3600s")
conf.set("spark.cores.max", "4")
sc = SparkContext('local[4]', '',conf=conf)

In [0]:
data = sc.textFile('/content/clique_out.csv').filter(lambda x: x!= '').map(lambda x: eval(x))
data = data.map(lambda x: sorted(x)) # sort the list so that we can use it to compare
data = data.zipWithIndex()

In [0]:
# Get all combination of cliques
carte_rdd = data.cartesian(data)

# count the length of both cliques and their intersection
lengths_rdd = carte_rdd.map(lambda x: (x[0][0], x[0][1], x[1][0], x[1][1],
                        len(x[0][0]), len(x[1][0]), len(set(x[0][0]).intersection(set(x[1][0])))))
# in each row, we have clique1, clique1_id, clique2, clique2_id, size(clique1), size(clique2), overlap of clique 1 and 2

lengths_rdd.persist()
# see the powerpoint by Eugene
diag_rdd = lengths_rdd.filter(lambda x: x[1] == x[3])
triangle_rdd = lengths_rdd.filter(lambda x: x[1] != x[3])

k = 6

In [0]:
triangle_rdd_filtered = triangle_rdd.filter(lambda x: x[6] >= k-1)

if diag_rdd.isEmpty():
    print("THIS IS AN EMPTY LIST")
        
clique_list = diag_rdd.filter(lambda x: x[6] >= k).map(lambda x: x[1]).collect()

In [0]:
community_dict = defaultdict(int)
community_counter = 0

In [14]:
for i in clique_list:
    print('Processing', i)
    # get the cliques who are in a community with clique i
    clique_community = triangle_rdd_filtered.filter(lambda x: (x[1] == i ) | (x[3]  == i))
    clique_community.collect()
    if clique_community.collect() == []: # if the clique i is a community on its own, it will return an empty list
        print(i, "clique is on its own")
        community_dict[i] = community_counter
        community_counter += 1
        # do soemtjhign here to store the clqieu
    else:
        # get the indexes of the cliques that is a community with clique i
        print("Getting cliques into community")
        clique_community_index = clique_community.map(lambda x: [x[1], x[3]]).collect()
        intersection_of_clique = set(sum(clique_community_index, [])) & set(community_dict.keys())
        if len(intersection_of_clique) == 0: # new community starts
            for j in set(sum(clique_community_index, [])):
                community_dict[j] = community_counter
            community_counter += 1
        else: # community exists
            # pick corresponding key and use the community value
            community_id = community_dict[list(intersection_of_clique)[0]]
            for j in set(sum(clique_community_index, [])):# all the cliques get the same community id
                community_dict[j] = community_id

        # write out results
        json.dump(community_dict,open('/content/sample_out.csv', 'w'))

Processing 4
Getting cliques into community
Processing 5
Getting cliques into community
Processing 6
Getting cliques into community
Processing 7
Getting cliques into community
Processing 15
15 clique is on its own
Processing 21
Getting cliques into community
Processing 22
Getting cliques into community
Processing 23
Getting cliques into community
Processing 25
Getting cliques into community
Processing 31
Getting cliques into community
Processing 32
32 clique is on its own
Processing 70
Getting cliques into community
Processing 71
Getting cliques into community
Processing 72
Getting cliques into community
Processing 73
Getting cliques into community
Processing 74
Getting cliques into community
Processing 75
Getting cliques into community
Processing 76
Getting cliques into community
Processing 77
Getting cliques into community
Processing 78
Getting cliques into community
Processing 79
Getting cliques into community
Processing 80
Getting cliques into community
Processing 81
Getting clique

In [15]:
sample_out = pd.read_csv('/content/sample_out.csv')
sample_out

Unnamed: 0,"{""4"": 0","""5"": 0","""6"": 0","""7"": 0","""21"": 0","""22"": 0","""23"": 0","""15"": 1","""25"": 2","""31"": 2","""32"": 3","""102"": 5","""70"": 5","""71"": 5","""139"": 5","""80"": 5","""158"": 5","""98"": 5","""79"": 5","""159"": 5","""72"": 5","""73"": 5","""82"": 5","""77"": 5","""160"": 5","""97"": 5","""132"": 5","""101"": 5","""74"": 5","""138"": 5","""168"": 5","""78"": 5","""81"": 5","""183"": 5","""161"": 5","""95"": 5","""100"": 5","""162"": 5","""75"": 5","""76"": 5",...,"""182"": 5","""83"": 5","""84"": 5","""85"": 5","""86"": 5","""87"": 5","""88"": 5","""89"": 5","""90"": 5","""91"": 5","""92"": 5","""165"": 5","""170"": 5","""94"": 5","""103"": 5","""180"": 5","""93"": 5","""104"": 5","""134"": 5","""172"": 5","""181"": 5","""96"": 5","""105"": 5","""171"": 5","""121"": 5","""173"": 5","""164"": 5","""175"": 5","""131"": 5","""112"": 5","""127"": 5","""184"": 5","""185"": 5","""186"": 5","""114"": 5","""113"": 5","""115"": 5","""155"": 7","""125"": 7","""193"": 5}"


In [0]:
data_file= '/content/clique_out.csv'

In [0]:
largest_column_count = 0
with open(data_file, 'r') as temp_f:
    # Read the lines
    lines = temp_f.readlines()

    for l in lines:
        # Count the column count for the current line
        column_count = len(l.split(',')) + 1

        # Set the new most column count
        largest_column_count = column_count if largest_column_count < column_count else largest_column_count
temp_f.close()
column_names = [str(i) for i in range(0, largest_column_count)]

In [0]:
clique_out = pd.read_csv("/content/clique_out.csv",
                        delimiter=',',names=column_names)

In [0]:
def is_nan(x):
    return (x != x)

In [0]:
import re
finallist=[]
for i in range(0,clique_out.shape[0]):
    temp=[]
    for j in clique_out.loc[i]:
        if is_nan(j)==False:
            j= re.sub(r'[^\w]', ' ', j)
            temp.append(int(j))
        else:
            temp.append(float(j))
    finallist.append(temp)

In [0]:
clique_out = pd.DataFrame(finallist)

In [23]:
clique_out

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,114,51.0,116.0,,,,,,
1,0,114,51.0,53.0,,,,,,
2,0,114,60.0,,,,,,,
3,1,2,62.0,,,,,,,
4,3,100,45.0,81.0,41.0,78.0,,,,
...,...,...,...,...,...,...,...,...,...,...
196,105,4,,,,,,,,
197,108,16,121.0,,,,,,,
198,115,114,38.0,,,,,,,
199,121,110,16.0,26.0,92.0,,,,,


In [0]:
output = {"4": 0, "5": 0,"6": 0,"7": 0,"21": 0,"22": 0,"23": 0,"15": 1,"25": 2,"31": 2,"32": 3,"102": 5,"70": 5,"71": 5,"139": 5,"80": 5,"158": 5,"98": 5,"79": 5,"159": 5,"72": 5,"73": 5,"82": 5,"77": 5,"160": 5,"97": 5,"132": 5,"101": 5,"74": 5,"138": 5,"168": 5,"78": 5,"81": 5,"183": 5,"161": 5,"95": 5,"100": 5,"162": 5,"75": 5,"76": 5,"99": 5,"163": 5,"166": 5,"167": 5,"169": 5,"174": 5,"176": 5,"177": 5,"178": 5,"179": 5,"182": 5,"83": 5,"84": 5,"85": 5,"86": 5,"87": 5,"88": 5,"89": 5,"90": 5,"91": 5,"92": 5,"165": 5,"170": 5,"94": 5,"103": 5,"180": 5,"93": 5,"104": 5,"134": 5,"172": 5,"181": 5,"96": 5,"105": 5,"171": 5,"121": 5,"173": 5,"164": 5,"175": 5,"131": 5,"112": 5,"127": 5,"184": 5,"185": 5,"186": 5,"114": 5,"113": 5,"115": 5,"155": 7,"125": 7,"193": 5}

In [0]:
community_ids = [i for i in set(output.values())]

In [26]:
community_ids

[0, 1, 2, 3, 5, 7]

In [0]:
community_dict={}
names = {}
names_0 = []
names_1 = []
names_2 = []
names_3 = []
names_5 = []
names_7 = []

In [32]:
nodes = pd.read_csv('/content/nodes.csv')
nodes

Unnamed: 0.1,Unnamed: 0,Id,Label,unique
0,0,ADDAM_MARBRAND,Addam,0
1,1,AEGON,Aegon,1
2,2,AERYS,Aerys,2
3,3,ALLISER_THORNE,Allister,3
4,4,ARYA,Arya,4
...,...,...,...,...
121,121,WALDER,Walder,121
122,122,WAYMAR_ROYCE,Waymar,122
123,123,WILL,Will,123
124,124,WINE_MERCHANT,Wine Merchant,124


In [0]:
for i,k in output.items():
    #temp=[]
    temp = [nodes.loc[int(j)].Id for j in clique_out.loc[int(i)] if is_nan(j)==False]
    if int(k)==0:
        names_0.append(temp)
        #print(names_0)
    elif int(k)==1:
        names_1.append(temp)
    elif int(k)==2:
        names_2.append(temp)
    elif int(k)==3:
        names_3.append(temp)
    elif int(k)==5:
        names_5.append(temp)
    elif int(k)==7:
        names_7.append(temp)

In [0]:
from pandas.core.common import flatten

names_0 = list(set((flatten(names_0))))
names_1 = list(set((flatten(names_1))))
names_2 = list(set((flatten(names_2))))
names_3 = list(set((flatten(names_3))))
names_5 = list(set((flatten(names_5))))
names_7 = list(set((flatten(names_7))))


for k in community_ids:
    if int(k)==0:
        community_dict[k] = names_0 
    elif int(k)==1:
        community_dict[k] = names_1
    elif int(k)==2:
        community_dict[k] = names_2
    elif int(k)==3:
        community_dict[k] = names_3
    elif int(k)==5:
        community_dict[k] = names_5
    elif int(k)==7:
        community_dict[k] = names_7

In [35]:
community_dict

{0: ['ALLISER_THORNE',
  'GRENN',
  'DAREON',
  'PYP',
  'SAM',
  'OTHELL_YARWYCK',
  'JEOR',
  'JON',
  'MAESTER_AEMON',
  'RAST'],
 1: ['ROBIN', 'TYRION', 'MORD', 'BRONN', 'CATELYN', 'LYSA'],
 2: ['RAKHARO', 'DROGO', 'IRRI', 'JORAH', 'DOREAH', 'DAENERYS', 'VISERYS'],
 3: ['MIRRI_MAZ_DUUR', 'RAKHARO', 'DROGO', 'JORAH', 'QOTHO', 'DAENERYS'],
 5: ['LITTLEFINGER',
  'ROBERT',
  'PYCELLE',
  'JON',
  'JEOR',
  'BARRISTAN',
  'JON_ARRYN',
  'ARYA',
  'BENJEN',
  'BRAN',
  'JOFFREY',
  'THEON',
  'YOREN',
  'MAESTER_LUWIN',
  'ILYN_PAYNE',
  'CERSEI',
  'ROS',
  'CATELYN',
  'RENLY',
  'TYRION',
  'JAIME',
  'VARYS',
  'MERYN_TRANT',
  'STANNIS',
  'HOUND',
  'GREATJON_UMBER',
  'TYWIN',
  'RODRIK',
  'SANSA',
  'NED',
  'SEPTA_MORDANE',
  'JANOS',
  'BAELOR',
  'ROBB'],
 7: ['LITTLEFINGER', 'HOUND', 'ROBERT', 'SANSA', 'NED', 'MOUNTAIN', 'LORAS']}