# Classification of a node feature classical ML

In [2]:
import numpy as np
import pandas as pd
import networkx as nx

# import Karate club graph data
kg = nx.karate_club_graph()

Each node of the graph has the attribute "club", which is divided into two classes:
1. Mr. Hi
2. officer

The goal is to train a classical ML to classify each node and assign the label "Mr. Hi" or "Officer" 

In [8]:
print(np.size(kg.nodes))
club_labels =nx.get_node_attributes(kg, "club")
club_labels

34


{0: 'Mr. Hi',
 1: 'Mr. Hi',
 2: 'Mr. Hi',
 3: 'Mr. Hi',
 4: 'Mr. Hi',
 5: 'Mr. Hi',
 6: 'Mr. Hi',
 7: 'Mr. Hi',
 8: 'Mr. Hi',
 9: 'Officer',
 10: 'Mr. Hi',
 11: 'Mr. Hi',
 12: 'Mr. Hi',
 13: 'Mr. Hi',
 14: 'Officer',
 15: 'Officer',
 16: 'Mr. Hi',
 17: 'Mr. Hi',
 18: 'Officer',
 19: 'Mr. Hi',
 20: 'Officer',
 21: 'Mr. Hi',
 22: 'Officer',
 23: 'Officer',
 24: 'Officer',
 25: 'Officer',
 26: 'Officer',
 27: 'Officer',
 28: 'Officer',
 29: 'Officer',
 30: 'Officer',
 31: 'Officer',
 32: 'Officer',
 33: 'Officer'}

In [13]:
degree = nx.degree(kg)
degree

DegreeView({0: 16, 1: 9, 2: 10, 3: 6, 4: 3, 5: 4, 6: 4, 7: 4, 8: 5, 9: 2, 10: 3, 11: 1, 12: 2, 13: 5, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 2, 21: 2, 22: 2, 23: 5, 24: 3, 25: 3, 26: 2, 27: 4, 28: 3, 29: 4, 30: 4, 31: 6, 32: 12, 33: 17})

## Data preparation

Features considered for the classification task:
0. degree of vertexes: number of edges that are incident to the vertex
1. clustering coefficient: measure of the degree to which nodes in a graph tend to cluster together.
2. degree centrality: simply its degree—the number of edges it has
3. closeness centrality: measure of centrality in a network, calculated as the reciprocal of count of all the other nodes over the sum of the length of the shortest paths between the node and all other nodes in the graph
4. betweenness centrality: measure of centrality in a graph based on shortest paths
5. eigenvector centrality: measure of the influence of a node in a network. A high eigenvector score means that a node is connected to many nodes who themselves have high scores
6. pagerank: measures the importance of each node within the graph, based on the number incoming relationships and the importance of the corresponding source nodes
7. hubs: sum of all the authority scores of pages it points t
8. authorities

To perform the graph classification, the graph must be converted into a table of features --> pandas dataframe whose rows are the nodes and columns all the features (including the extracted) of each node.

The goal is to train a classical ML to classify each node and assign the label "Mr. Hi" or "Officer" 

In [18]:
# calculate the graph features
cl_coeff = nx.clustering(kg)
dc = nx.degree_centrality(kg)
cc = nx.closeness_centrality(kg)
bc = nx.betweenness_centrality(kg)
eigc = nx.eigenvector_centrality(kg)
pagerank = nx.pagerank(kg)
hub, auth = nx.hits(kg)

# put all the features in one single list called data
degrees = list(dict(degree).values())
data = [list(x.values()) for x in (cl_coeff,dc,cc,bc,eigc,pagerank,hub,auth)]
data.append(degrees)
data.append(list(club_labels.values()))

In [19]:
data

[[0.15,
  0.3333333333333333,
  0.24444444444444444,
  0.6666666666666666,
  0.6666666666666666,
  0.5,
  0.5,
  1.0,
  0.5,
  0,
  0.6666666666666666,
  0,
  1.0,
  0.6,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.3333333333333333,
  1.0,
  1.0,
  1.0,
  0.4,
  0.3333333333333333,
  0.3333333333333333,
  1.0,
  0.16666666666666666,
  0.3333333333333333,
  0.6666666666666666,
  0.5,
  0.2,
  0.19696969696969696,
  0.11029411764705882],
 [0.48484848484848486,
  0.2727272727272727,
  0.30303030303030304,
  0.18181818181818182,
  0.09090909090909091,
  0.12121212121212122,
  0.12121212121212122,
  0.12121212121212122,
  0.15151515151515152,
  0.06060606060606061,
  0.09090909090909091,
  0.030303030303030304,
  0.06060606060606061,
  0.15151515151515152,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.09090909090909091,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.15151515151515152,
  0.090909090

In [32]:
# DATA PREPARATION:

# 1. convert the data list to a numpy array
data_arr = np.array(data)

# 2. conver t the data array into a pandas dataframe
data_df = pd.DataFrame(data_arr.T, columns=["clustering_coef", 
                                            "degree_centrality", 
                                            "closeness_centrality", 
                                            "betweenness_centrality",
                                           "eigenvalue_centrality",
                                           "pagerank",
                                            "hub",
                                            "authority", 
                                            "degrees_of_nodes",
                                            "club_of_nodes"
                                           ])
data_df

Unnamed: 0,clustering_coef,degree_centrality,closeness_centrality,betweenness_centrality,eigenvalue_centrality,pagerank,hub,authority,degrees_of_nodes,club_of_nodes
0,0.15,0.4848484848484848,0.5689655172413793,0.4376352813852814,0.3554834941851942,0.0885080739628001,0.0668777878017572,0.0668777878017572,16,Mr. Hi
1,0.3333333333333333,0.2727272727272727,0.4852941176470588,0.0539366883116883,0.2659538704545024,0.05741484049711,0.0646082013987078,0.0646082013987078,9,Mr. Hi
2,0.2444444444444444,0.303030303030303,0.559322033898305,0.1436568061568061,0.3171893899684447,0.0627668645460301,0.0772059370280727,0.0772059370280728,10,Mr. Hi
3,0.6666666666666666,0.1818181818181818,0.4647887323943662,0.0119092712842712,0.2111740783205705,0.0372120815363137,0.0425153895658715,0.0425153895658715,6,Mr. Hi
4,0.6666666666666666,0.0909090909090909,0.3793103448275862,0.0006313131313131,0.0759664588165738,0.0205039773475016,0.0119205679300852,0.0119205679300853,3,Mr. Hi
5,0.5,0.1212121212121212,0.3837209302325581,0.0299873737373737,0.0794805778859424,0.0338104425535772,0.0144370845482914,0.0144370845482914,4,Mr. Hi
6,0.5,0.1212121212121212,0.3837209302325581,0.0299873737373737,0.0794805778859424,0.031529011343455,0.0142272852406394,0.0142272852406394,4,Mr. Hi
7,1.0,0.1212121212121212,0.44,0.0,0.1709551149803543,0.0264646186788061,0.0382043011040342,0.0382043011040342,4,Mr. Hi
8,0.5,0.1515151515151515,0.515625,0.0559268278018278,0.2274050914716604,0.0333815556684644,0.0528748000842634,0.0528748000842634,5,Mr. Hi
9,0.0,0.0606060606060606,0.4342105263157895,0.0008477633477633,0.1026751903063775,0.0094632195657999,0.0107490220889662,0.0107490220889662,2,Officer


In [33]:
# convert string data into numeric
data_df.club_of_nodes = data_df.club_of_nodes.apply(lambda x : 0 if x=="Mr. Hi" else 1)
data_df = data_df.astype(float)
data_df # dataframe of float type

Unnamed: 0,clustering_coef,degree_centrality,closeness_centrality,betweenness_centrality,eigenvalue_centrality,pagerank,hub,authority,degrees_of_nodes,club_of_nodes
0,0.15,0.484848,0.568966,0.437635,0.355483,0.088508,0.066878,0.066878,16.0,0.0
1,0.333333,0.272727,0.485294,0.053937,0.265954,0.057415,0.064608,0.064608,9.0,0.0
2,0.244444,0.30303,0.559322,0.143657,0.317189,0.062767,0.077206,0.077206,10.0,0.0
3,0.666667,0.181818,0.464789,0.011909,0.211174,0.037212,0.042515,0.042515,6.0,0.0
4,0.666667,0.090909,0.37931,0.000631,0.075966,0.020504,0.011921,0.011921,3.0,0.0
5,0.5,0.121212,0.383721,0.029987,0.079481,0.03381,0.014437,0.014437,4.0,0.0
6,0.5,0.121212,0.383721,0.029987,0.079481,0.031529,0.014227,0.014227,4.0,0.0
7,1.0,0.121212,0.44,0.0,0.170955,0.026465,0.038204,0.038204,4.0,0.0
8,0.5,0.151515,0.515625,0.055927,0.227405,0.033382,0.052875,0.052875,5.0,0.0
9,0.0,0.060606,0.434211,0.000848,0.102675,0.009463,0.010749,0.010749,2.0,1.0


## Perform classification with classical scikitlearn ML

In [34]:
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [35]:
# split data into training and test
x = data_df.drop(columns=["club_of_nodes"])
y = data_df.club_of_nodes

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# define the ML model
logisticRegr = LogisticRegression()

# fit the model
logisticRegr.fit(x_train, y_train)

In [36]:
# make predictions
y_pred = logisticRegr.predict(x_test)

# scores
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.50      0.67      0.57         3
         1.0       0.67      0.50      0.57         4

    accuracy                           0.57         7
   macro avg       0.58      0.58      0.57         7
weighted avg       0.60      0.57      0.57         7



In [37]:
print(accuracy_score(y_test, y_pred))

0.5714285714285714
