In [1]:
from AnomalousCommunityDetection.GraphSampler import GraphSampler
from AnomalousCommunityDetection.BiPartiteConverter import BiPartiteConverter
from AnomalousCommunityDetection.FeatureExtractor import FeatureExtractor
from AnomalousCommunityDetection.LinkPredictor import LinkPredictor
from AnomalousCommunityDetection.MetaFeatureExtractor import MetaFeatureExtractor
from AnomalousCommunityDetection.MetaFeatureRanker import MetaFeatureRanker

from AnomalousCommunityDetection.Utils import BiPartiteUtils
from AnomalyInfusedCommunityStructuredRandomNetworkGenerator.AnomalyInfusedCommunityStructuredRandomNetworkGenerator import AnomalyInfusedCommunityStructuredRandomNetworkGenerator

In [2]:
import matplotlib.pyplot as plt
import networkx as nx
from networkx.generators.random_graphs import erdos_renyi_graph, barabasi_albert_graph
import numpy as np
import random
import json
import pandas as pd
%matplotlib inline  

# Check against a generated network

#### Generate a random community-structured network

In [3]:
random_seed = 1

In [4]:
# networkx generator function handed to the simulator as its first constructor argument;
# presumably used to build each normal community -- confirm against the generator class.
norm_comm_alg = barabasi_albert_graph
# 200 community sizes: the pattern [30, 35, 35, 40] repeated 50 times.
normal_community_sizes = [30, 35, 35, 40]*50
# Commented-out alternative: 9 communities of size 10.
#normal_community_sizes = [10]*9

# Passed as the second argument to generate_partition_graph;
# presumably the Barabasi-Albert `m` (edges attached per new node) -- TODO confirm.
normal_m = 1

In [5]:
# networkx generator function handed to the simulator as its second constructor argument;
# presumably used to build each anomalous community -- confirm against the generator class.
anom_comm_alg = erdos_renyi_graph
# 20 community sizes: the pattern [30, 35, 35, 40] repeated 5 times.
anom_community_sizes = [30, 35, 35, 40]*5
# Commented-out alternative: a single community of size 10.
#anom_community_sizes = [10]*1
# Passed as the second argument to add_anomalous_comms;
# presumably the Erdos-Renyi edge probability `p` -- TODO confirm.
anom_m = 0.8

In [6]:
# These three values are forwarded positionally to the network generator's constructor.
# NOTE(review): presumably p_inter is the probability of an inter-community connection and
# k_min / k_max bound the number of such connections -- verify against the generator class.
p_inter = 0.3
k_min = 1
k_max = 1

In [7]:
simulator = AnomalyInfusedCommunityStructuredRandomNetworkGenerator(norm_comm_alg, anom_comm_alg, p_inter, k_min, k_max, random_seed)

In [8]:
G = simulator.generate_partition_graph(normal_community_sizes, normal_m)

In [9]:
anom_infused_G = simulator.add_anomalous_comms(anom_community_sizes, anom_m)

In [10]:
#nx.draw(G)

In [11]:
#nx.draw(anom_infused_G)

In [12]:
list(simulator.get_partitions().keys())

['comm0001',
 'comm0002',
 'comm0003',
 'comm0004',
 'comm0005',
 'comm0006',
 'comm0007',
 'comm0008',
 'comm0009',
 'comm0010',
 'comm0011',
 'comm0012',
 'comm0013',
 'comm0014',
 'comm0015',
 'comm0016',
 'comm0017',
 'comm0018',
 'comm0019',
 'comm0020',
 'comm0021',
 'comm0022',
 'comm0023',
 'comm0024',
 'comm0025',
 'comm0026',
 'comm0027',
 'comm0028',
 'comm0029',
 'comm0030',
 'comm0031',
 'comm0032',
 'comm0033',
 'comm0034',
 'comm0035',
 'comm0036',
 'comm0037',
 'comm0038',
 'comm0039',
 'comm0040',
 'comm0041',
 'comm0042',
 'comm0043',
 'comm0044',
 'comm0045',
 'comm0046',
 'comm0047',
 'comm0048',
 'comm0049',
 'comm0050',
 'comm0051',
 'comm0052',
 'comm0053',
 'comm0054',
 'comm0055',
 'comm0056',
 'comm0057',
 'comm0058',
 'comm0059',
 'comm0060',
 'comm0061',
 'comm0062',
 'comm0063',
 'comm0064',
 'comm0065',
 'comm0066',
 'comm0067',
 'comm0068',
 'comm0069',
 'comm0070',
 'comm0071',
 'comm0072',
 'comm0073',
 'comm0074',
 'comm0075',
 'comm0076',
 'comm0077',

In [13]:
file_path = 'GeneratedPartitions.json'

In [14]:
simulator.save_partitions(file_path)

#### Convert to BiPartite

In [15]:
converter_train = BiPartiteConverter(file_path)
converter_test = BiPartiteConverter(file_path)

In [16]:
# Materialise the partition names once; the first 50 go to the training graph
# and every remaining one goes to the test graph.
partition_names = list(simulator.get_partitions().keys())
train_partition_names = partition_names[:50]
test_partition_names = partition_names[50:]

BPG_train = converter_train.generate_bipartite_graph(train_partition_names)
BPG_test = converter_test.generate_bipartite_graph(test_partition_names)

In [17]:
converter_train.print_properties()

BiPartite graph properties:
	Number of 'Community'-partite nodes: 50
	Number of 'Vertex'-partite nodes: 2179
	Total number of nodes: 2229
	Total number of edges: 2288


In [18]:
converter_test.print_properties()

BiPartite graph properties:
	Number of 'Community'-partite nodes: 170
	Number of 'Vertex'-partite nodes: 6357
	Total number of nodes: 6527
	Total number of edges: 7667


In [None]:
BPG_train.nodes

In [None]:
BPG_test.nodes

In [None]:
#nx.draw_shell(BPG_train, with_labels=True)

In [None]:
#nx.draw_shell(BPG_test, with_labels=True)

In [None]:
BiPartiteUtils.print_bipartite_properties(BPG_train)

In [None]:
BiPartiteUtils.print_bipartite_properties(BPG_test)

#### Sample Graph

In [None]:
graph_sampler = GraphSampler('Community', 'Vertex')

In [None]:
train_set_positive_edges, train_set_negative_edge = graph_sampler.sample_single_graph_edges(BPG_train, None, True)

In [None]:
test_set_positive_edges, _ = graph_sampler.sample_single_graph_edges(BPG_test, None, False)

In [None]:
def verify(pos_edges, neg_edges, bpg):
    """Print sanity checks for sampled edges against the graph they were sampled from.

    Parameters
    ----------
    pos_edges : sized iterable of edges
        Edges expected to be present in ``bpg``. Each edge is unpacked as the
        arguments of ``bpg.has_edge``.
    neg_edges : sized iterable of edges
        Edges expected to be absent from ``bpg``. May be empty.
    bpg : object exposing ``has_edge``
        Graph the edges are checked against.

    Returns
    -------
    None
        The counts are only printed.
    """
    print(f'num pos edges: {len(pos_edges)}')
    print(f'num neg edges: {len(neg_edges)}')

    # check that the graph contains all positive edges. sum should be equal to number of positive edges.
    # Built-in sum over booleans always yields an int, so an empty edge list
    # is reported as "0" instead of numpy's float "0.0".
    num_pos_found = sum(bool(bpg.has_edge(*edge)) for edge in pos_edges)
    print(f'graph contains positive edges: {num_pos_found} == {len(pos_edges)}')

    # check that the graph contains none of the negative edges. sum should be 0.
    num_neg_found = sum(bool(bpg.has_edge(*edge)) for edge in neg_edges)
    print(f'graph contains negative edges: {num_neg_found} == 0')


In [None]:
verify(train_set_positive_edges, train_set_negative_edge, BPG_train)

In [None]:
verify(test_set_positive_edges, [], BPG_test)

#### Extract topological features

In [None]:
train_feature_extractor = FeatureExtractor(BPG_train)
test_feature_extractor = FeatureExtractor(BPG_test)

In [None]:
topo_feat_df_train = train_feature_extractor.create_topological_features_df(train_set_positive_edges, train_set_negative_edge)

In [None]:
topo_feat_df_test = test_feature_extractor.create_topological_features_df(test_set_positive_edges, None, save_path='try.csv', save_csv=True)

In [None]:
topo_feat_df_train

In [None]:
topo_feat_df_test.head()

In [None]:
topo_feat_df_test_after_read = pd.read_csv('try.csv', index_col=0)
topo_feat_df_test_after_read.head()

In [None]:
topo_feat_df_test.index

In [None]:
topo_feat_df_test_after_read.index[0]

#### Classifier

In [None]:
from xgboost import XGBClassifier

In [None]:
# instantiate topological feature classifier, with a classifier object and train set
topo_feat_clf = LinkPredictor(XGBClassifier())

In [None]:
topo_feat_clf.fit(topo_feat_df_train, 'link_exist', verbose=True)

In [None]:
edges_exist_prob_dict = topo_feat_clf.get_edges_existence_prob(topo_feat_df_test, verbose=True)

In [None]:
topo_feat_df_test

#### Meta-Feature Extraction

In [None]:
meta_feat_extractor = MetaFeatureExtractor(edges_exist_prob_dict)
meta_feats_dict = meta_feat_extractor.get_comm_repr_vertices_meta_features(thresh=0.5)

In [None]:
pd.DataFrame.from_dict(meta_feats_dict, orient='index').head()

#### MetaFeatureRanker

In [None]:
meta_feat_ranker = MetaFeatureRanker(meta_feats_dict)

In [None]:
meta_feat_ranker.rank_columns()

In [None]:
meta_feat_ranker.sort_by_meta_features(2, 'bottom')[:5]

# Check against Wikipedia

In [None]:
WIKIPEDIA_PAGE_USER_DICT_PATH = 'E:\\Datasets\\WikipediaHE\\ProcessedData\\PageUserDict_2018-01-01_2019-01-01.json'
TRAIN_SET_COMMS_PATH = 'HeWikiTrainSetCats.json'

In [None]:
# Parse the JSON file at TRAIN_SET_COMMS_PATH and keep only its top-level keys
# as the list of training categories; the values are discarded.
with open(TRAIN_SET_COMMS_PATH, 'r') as file:
    train_cats = list(json.load(file).keys())

In [None]:
len(train_cats)

In [None]:
convertor = BiPartiteConverter(WIKIPEDIA_PAGE_USER_DICT_PATH)

In [None]:
community_partite_label = 'Page'
vertex_partite_label = 'Wikipedian'

In [None]:
WikiBPG = convertor.generate_bipartite_graph(train_cats, community_partite_label, vertex_partite_label)

In [None]:
BiPartiteUtils.print_bipartite_properties(WikiBPG)

In [None]:
sampler = GraphSampler(WikiBPG, community_partite_label, vertex_partite_label)

In [None]:
positive_edges, negative_edges = sampler.sample_single_graph_edges(5000, True)

In [None]:
len(positive_edges)

In [None]:
len(negative_edges)

In [None]:
# check that the graph contains all positive edges. sum should be equal to number of positive edges.
np.array([WikiBPG.has_edge(*edge) for edge in positive_edges]).sum()

In [None]:
# check that the graph does not contain negative edges. sum should be 0.
np.array([WikiBPG.has_edge(*edge) for edge in negative_edges]).sum()