## Assignment 2
Analyzing Google+ data

In [1]:
import json
import csv
import random

import numpy as np
import pandas as pd
import networkx as nx

In [2]:
%matplotlib inline

In [3]:
g = nx.Graph() # Build the graph

In [4]:
with open("g_plusAnonymized.csv", "r") as in_file:
    reader = csv.reader(in_file)
    
    # iterate over each line 
    for line in reader:
 
     # convert each element in ine to integer, assigns first and second values in row
     # to person1 and person 2 
        person1, person2 = map(int, line)
        

        # add nodes to graph (g) with person1 as node identifier and 'connection' 
        # set to value of person2
        g.add_edge(person1, person2)
        
for node, degree in g.degree():
    print(f"Node {node} has degree {degree}")



Node 1 has degree 24
Node 113566 has degree 3
Node 65428 has degree 35
Node 148308 has degree 78
Node 201116 has degree 26
Node 55897 has degree 125
Node 53714 has degree 77
Node 33616 has degree 5
Node 126346 has degree 21
Node 59677 has degree 335
Node 8112 has degree 12
Node 116144 has degree 19
Node 2 has degree 3
Node 29600 has degree 3
Node 4 has degree 3
Node 22012 has degree 3
Node 41196 has degree 2
Node 137578 has degree 4
Node 5 has degree 2
Node 180423 has degree 4
Node 80530 has degree 3
Node 6 has degree 1
Node 112043 has degree 6
Node 7 has degree 1
Node 208820 has degree 2
Node 9 has degree 2
Node 2861 has degree 2
Node 168205 has degree 2
Node 10 has degree 2
Node 140343 has degree 3
Node 11 has degree 2
Node 30292 has degree 2
Node 140024 has degree 2
Node 12 has degree 1
Node 48722 has degree 3
Node 13 has degree 2
Node 7291 has degree 2
Node 14 has degree 2
Node 58434 has degree 3
Node 198882 has degree 1
Node 15 has degree 4
Node 39430 has degree 4
Node 23827 has d

In [5]:
print("Nodes:", len(g.nodes))

Nodes: 211187


## Create Gephi Graph

In [6]:
import os
# get path to desktop
desktop_path = os.path.join(os.path.expanduser("~"), "Desktop")

# extract every 100th node from the graph (for efficiency in loading)
selected_nodes = list(g.nodes())[::100]

# create subgraph with only selected nodes and edges
subgraph = g.subgraph(selected_nodes)

# have Gephi read
nx.write_graphml(subgraph, os.path.join(desktop_path, "googleplus100.graphml"))

## Compute Eigenvector Centrality

In [7]:
centrality = nx.eigenvector_centrality(g)
centrality

{1: 1.8700839801699563e-05,
 113566: 7.507965658728728e-08,
 65428: 1.9330926468542078e-05,
 148308: 1.9991225815995306e-05,
 201116: 1.6016821327750884e-05,
 55897: 2.8119229970412084e-05,
 53714: 2.4631160254895405e-05,
 33616: 1.4312036802872217e-05,
 126346: 1.6089320823867857e-05,
 59677: 3.9870853703988343e-05,
 8112: 3.883556374021769e-06,
 116144: 1.9983230383187393e-06,
 2: 6.362523140421802e-16,
 29600: 1.6336613855046314e-13,
 4: 5.712242364638012e-10,
 22012: 5.712242364668481e-10,
 41196: 2.193704870501006e-12,
 137578: 1.4828325191186674e-07,
 5: 4.3519809571191634e-09,
 180423: 1.143336858050398e-06,
 80530: 1.929227387851559e-09,
 6: 2.0344367011169302e-11,
 112043: 5.353624186473522e-09,
 7: 1.1130186734358725e-30,
 208820: 2.9298388074551543e-28,
 9: 3.508606995898422e-17,
 2861: 8.775051211476903e-15,
 168205: 1.9845786492350358e-17,
 10: 4.866054433177863e-09,
 140343: 2.891163110357041e-09,
 11: 4.599811399715816e-44,
 30292: 1.9334436363847633e-46,
 140024: 1.0954

In [8]:
df = pd.DataFrame(centrality.items(), columns = ["node_id", "centrality"])
df

Unnamed: 0,node_id,centrality
0,1,1.870084e-05
1,113566,7.507966e-08
2,65428,1.933093e-05
3,148308,1.999123e-05
4,201116,1.601682e-05
...,...,...
211182,211166,4.811355e-64
211183,211174,2.121941e-11
211184,211176,6.992100e-15
211185,211182,1.089605e-11


df.sort_values(by="centrality", ascending=False).head(20)

In [9]:
df.sort_values(by="centrality", ascending=False).head(20)

Unnamed: 0,node_id,centrality
1400,116002,0.101318
1163,136198,0.095797
1398,159894,0.093012
4898,130830,0.083432
4991,35778,0.082365
4852,209926,0.082104
263,146725,0.080401
4836,209716,0.079871
4920,57314,0.079187
13382,149179,0.077013


I would like to take the average eigenvector centrality to see if the top values are very representative.

In [10]:
# Calculate the average eigenvector centrality
average_eigenvector_centrality = sum(centrality.values()) / len(centrality)

print("Average Eigenvector Centrality:", average_eigenvector_centrality)

Average Eigenvector Centrality: 0.00018348986023527142


The average eigenvector centrality is not very high at all, especially wehn comoared to a top value of 0.1. We can see even in the top 20 data that there is a wide range between first and twentieth in terms of this centrality, with first having an eigenvector centrality of 0.101318 and twentieth having an eigenvector centrality of 0.069070. This indicates that centrality was held only by a few top users.

## Compute Degree Centrality

Degree centrality represents the number of edges connected to a node. 



In [11]:
# Compute degree centrality
degree_centrality = nx.degree_centrality(g)

# Create a DataFrame with node_id and degree centrality columns
df_degree = pd.DataFrame(degree_centrality.items(), columns=["node_id", "degree_centrality"])

# Print the top 20 nodes based on degree centrality
print(df_degree.sort_values(by="degree_centrality", ascending=False).head(20))


        node_id  degree_centrality
1163     136198           0.008476
5408       5381           0.007193
1400     116002           0.007098
1024     145647           0.006634
137248    66836           0.006454
5644      89827           0.006288
8652       5111           0.006274
1398     159894           0.006241
4898     130830           0.005659
6270     156733           0.005573
531      174980           0.005223
13382    149179           0.004835
4121      28860           0.004787
1289      72126           0.004693
360       12696           0.004352
2397      79717           0.004352
1369      47667           0.004276
263      146725           0.004276
4920      57314           0.004181
11459     61516           0.004172


Again, we see a quick drop-off in values from highest degree centrality to twentieth highest degree centrality. Let's look at the average degree centrality.

In [12]:
# Calculate the average degree centrality
average_degree_centrality = sum(degree_centrality.values()) / len(degree_centrality)

print("Average Degree Centrality:", average_degree_centrality)

Average Degree Centrality: 5.1274337333260864e-05


The average degree centrality is much smaller than the top values.

## Compute Closeness Centrality

In [13]:
# Compute closeness centrality with the subgroup for efficiency
closeness_centrality = nx.closeness_centrality(subgraph)


# Create a DataFrame with node_id and closeness centrality columns
df_closeness = pd.DataFrame(closeness_centrality.items(), columns=["node_id", "closeness_centrality"])

# Print the top 20 nodes based on closeness centrality
print(df_closeness.sort_values(by="closeness_centrality", ascending=False).head(20))

      node_id  closeness_centrality
373    116002              0.009252
955     44577              0.006579
515    190273              0.006168
37      57470              0.005921
1842   146309              0.005921
545    173991              0.005805
524     42838              0.005805
1685    71983              0.005694
338    181427              0.005694
2094   114623              0.005586
1615    30727              0.005483
1921   195804              0.005383
547     75690              0.005287
1640   153684              0.005287
1378   169094              0.005287
827      3088              0.005287
1511    30330              0.005287
419    140755              0.005287
1848   113578              0.005287
97      65855              0.004354


In [15]:
# Calculate the average closeness centrality
average_closeness_centrality = sum(closeness_centrality.values()) / len(closeness_centrality)

print("Average Closeness Centrality:", average_closeness_centrality)

Average Closeness Centrality: 0.00010653331131160463


## Search for isolated nodes

In [16]:
# Search for isolated nodes by creating variable with searched for degree

# For nodes with a degree of 24:
isolated_nodes_24 = [node for node, degree in g.degree() if degree == 24]
print("Isolated nodes:", isolated_nodes_24)

Isolated nodes: [1, 162522, 171671, 124862, 54707, 42466, 188224, 70854, 104, 69212, 10663, 179812, 143151, 47381, 67724, 22175, 48745, 137060, 112141, 39762, 131920, 1479, 5251, 169912, 100703, 86028, 160484, 80619, 166896, 59635, 162464, 83226, 104957, 28212, 198154, 23207, 116694, 124625, 86962, 17370, 163412, 167515, 60747, 29343, 4713, 44286, 200168, 117329, 46501, 208683, 18986, 203014, 112360, 168068, 35777, 206237, 125670, 30021, 140992, 3842, 45578, 133507, 64926, 99292, 198077, 33771, 62467, 152420, 104172, 30502, 107980, 140556, 59569, 101956, 134030, 68790, 109501, 103886, 195589, 1677, 155353, 97645, 66858, 145673, 138405, 171408, 147885, 26938, 166162, 23543, 93397, 19913, 170825, 84916, 134433, 165606, 9334, 51308, 60668, 162567, 42114, 19413, 88500, 2059, 15332, 42153, 172098, 147391, 42193, 22587, 38189, 78919, 173054, 63309, 193234, 150515, 53608, 170487, 33430, 36491, 62673, 33901, 70134, 186081, 89769, 31486, 11823, 143873, 119250, 64086, 137737, 9998, 22037, 26215,

In [17]:
# For nodes with a degree of 3:

isolated_nodes_3 = [node for node, degree in g.degree() if degree == 3]
print("Isolated nodes:", isolated_nodes_3)

Isolated nodes: [113566, 2, 29600, 4, 22012, 80530, 140343, 48722, 58434, 41622, 181787, 156094, 164322, 158109, 155309, 30, 32, 34, 36, 76179, 44, 86698, 143509, 47, 128609, 54901, 169039, 51, 113953, 112492, 40644, 146346, 95211, 170735, 147317, 139657, 195779, 171967, 78725, 76354, 19978, 92283, 142318, 88519, 193060, 84987, 47746, 49309, 151696, 72787, 152010, 81, 47526, 86, 118972, 34450, 149069, 99, 101, 74983, 109, 40884, 63186, 9164, 148971, 5560, 130, 94035, 82336, 132, 133, 164208, 166807, 63652, 124974, 127051, 140, 180026, 129055, 19226, 158498, 24390, 48575, 152132, 195151, 152, 205092, 157085, 173259, 150107, 43625, 85442, 193098, 168, 174, 93735, 176, 85411, 156976, 177, 31804, 181, 195266, 3756, 183, 189, 208302, 202391, 201, 8934, 116837, 207, 186603, 138139, 212, 213, 214, 189152, 1729, 219, 64344, 65208, 169616, 151794, 70952, 11417, 123987, 12463, 70275, 91208, 242, 144379, 243, 60349, 104938, 140039, 49350, 250, 255, 128874, 258, 42470, 197325, 24379, 11017, 20974,