In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("datasets\dataset.csv")

In [3]:
#sort by user and date because we want to visualize in which order (date) the users (user_id) read the chapters
df = df.sort_values(by=["user_id", "date_created"])

In [4]:
# a chapter belongs to a specific book and chapter alone wouldn't be unique.
# the network is build from nodes (source, target) so we have to combine chapter and book in a single column
df["source"] = df["chapter"] + " " + df["book"]

In [5]:
#This is a very elegant way to do this. My first idea was about using a row based algorithm. So I learned a useful pattern
df["target"] = df["source"].shift(-1)

In [6]:
# And again: creation of additional columns using shifting and filter afterwards is a usefull pattern to deal with hierarchical
# structures in a flat table
df["user_check"] = df["user_id"].shift(-1)

In [7]:
# I am wondering whether pandas offers a similar way beside apply and a userdefined function to achieve this
df["keep_drop"] = np.where((df["user_check"] == df["user_id"]), df["user_id"], np.nan)

In [8]:
df = df.dropna(subset=["keep_drop"])

In [9]:
# create the networkx network
G = nx.from_pandas_edgelist(df, "source", "target", create_using=nx.DiGraph)

### 3.1 Analyzing the network

In [10]:
#calculate centrality measures and add them to the graph
out = nx.out_degree_centrality(G)
nx.set_node_attributes(G,out,"out-degree")

bb = nx.betweenness_centrality(G)
nx.set_node_attributes(G,bb,"betweenness")

eigen = nx.eigenvector_centrality(G)
nx.set_node_attributes(G,eigen,"eigen")


In [11]:
# create a list from the node data containing also the attributes
data_list = list(G.nodes(data=True))
data_list

[('chapter-8 fain3',
  {'out-degree': 0.0007132667617689016,
   'betweenness': 0.0,
   'eigen': 5.159156970479536e-46}),
 ('chapter-1 ramamurthy',
  {'out-degree': 0.0007132667617689016,
   'betweenness': 5.091126065445408e-07,
   'eigen': 3.4050436005164946e-44}),
 ('chapter-1 thomas',
  {'out-degree': 0.0, 'betweenness': 0.0, 'eigen': 1.1071550858649084e-42}),
 ('chapter-4 mcnamara',
  {'out-degree': 0.0021398002853067048,
   'betweenness': 0.0010951012166773072,
   'eigen': 7.421038983061692e-11}),
 ('chapter-1 mcnamara',
  {'out-degree': 0.0014265335235378032,
   'betweenness': 0.0015405747474037806,
   'eigen': 5.1056640257968935e-11}),
 ('chapter-2 mcnamara',
  {'out-degree': 0.0028530670470756064,
   'betweenness': 0.002719170431554393,
   'eigen': 8.256859417671816e-11}),
 ('chapter-9 hillard',
  {'out-degree': 0.0007132667617689016,
   'betweenness': 0.0,
   'eigen': 3.365361593998789e-11}),
 ('chapter-2 vos',
  {'out-degree': 0.0021398002853067048,
   'betweenness': 0.0119768

In [12]:
# create a dictionary from the list
data = {}

data["chapter"] = [x[0] for x in data_list]

data["out-degree"] = [x[1]["out-degree"] for x in data_list]
data["eigen"] = [x[1]["eigen"] for x in data_list]
data["betweenness"] = [x[1]["betweenness"] for x in data_list]


In [13]:
# create a dataframe from dictionary with columns/series chapter, out-degree, eigen, betweenness
df2 = pd.DataFrame(data)

In [14]:
# create a list with nodes that are connected to the defined node
row = [n for n in G[' chapter-1 munro']]
row

[' about-this-book lauret',
 ' chapter-3 fain4',
 ' chapter-11 lauret',
 ' chapter-1 ghosh',
 ' about-this-book hocking2',
 ' chapter-1 hudgeon',
 ' introduction fain4',
 ' chapter-1 munro',
 ' welcome lemaire',
 ' chapter-1 seemann2']

In [15]:
# create a dataframe with a single colum from list so it can be used for an inner join
df3 = pd.DataFrame(row)

In [16]:
# make the list of connected nodes the index of the dataframe
df3 = df3.set_index(0)
df3

about-this-book lauret
chapter-3 fain4
chapter-11 lauret
chapter-1 ghosh
about-this-book hocking2
chapter-1 hudgeon
introduction fain4
chapter-1 munro
welcome lemaire
chapter-1 seemann2


In [17]:
# make the chapter also the index of the df2 and inner join by chapter
# only nodes that are connected to the defined node will be part of the joined dataframe df4
df4 = df3.join(df2.set_index("chapter"))

In [18]:
# sort by a selected (here: betweenness) centrality in order to be able to make a recommendation
df4 = df4.sort_values(by=["betweenness"], ascending=False)

In [20]:
# When you read the chapter-1 of the book munro the recommended next chapter is "about-this-book" from the book "lauret"
df4

Unnamed: 0_level_0,out-degree,eigen,betweenness
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
chapter-1 munro,0.007133,0.205791,0.037793
about-this-book lauret,0.005706,0.114395,0.029768
chapter-1 seemann2,0.002853,0.125462,0.016467
chapter-11 lauret,0.003566,0.203417,0.010509
welcome lemaire,0.001427,0.054961,0.004219
about-this-book hocking2,0.000713,0.061837,0.002087
chapter-1 hudgeon,0.00214,0.109563,0.001448
chapter-1 ghosh,0.000713,0.054961,0.001247
chapter-3 fain4,0.000713,0.054961,0.000914
introduction fain4,0.000713,0.054961,0.0
