In [1]:
# Basic tools
import os
import sys
from pprint import pprint

# Data tools
import numpy as np
import pandas as pd

# Viz tools
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

# Local
## Resolve the project directories relative to the parent of the working
## directory and make the `include` folder importable.
module_path = os.path.abspath(os.pardir)
data_path = os.path.join(module_path, 'data')
include_path = os.path.join(module_path, 'include')
if sys.path.count(include_path) == 0:
    # Only register the folder once, so re-running the cell is harmless.
    sys.path.append(include_path)

In [2]:
# From https://dumps.wikimedia.org/other/clickstream
# Removed spiders and bots by filtering requests for articles by a small number of clients hundreds of times per minute within some time window
# NOTE(review): pandas' default NA parsing turns titles such as "NaN" or "NA"
# into missing values — consider keep_default_na = False if those articles matter.
clickstream_df = pd.read_csv(
    os.path.join(data_path, 'clickstream-enwiki-2018-08.tsv'),
    sep = '\t',
    names = ['origin_title', 'target_title', 'type', 'n_clicks'],
)

# Keep only article-to-article link clicks, most-clicked first.
# (The filter must be built from clickstream_df itself and the sort key is the
# 'n_clicks' column named above.)
clickstream_df = (
    clickstream_df.loc[clickstream_df['type'] == 'link']
    .drop(columns = ['type'])
    .sort_values(by = ['n_clicks'], ascending = False)
    .reset_index(drop = True)
)

# Pair every edge A -> B with its reverse B -> A: the inner merge keeps only
# pairs of articles that were clicked in both directions.
forward_df = clickstream_df.rename(columns = {'n_clicks': 'n_clicks_forward'})
backward_df = clickstream_df.rename(columns = {'target_title': 'origin_title', 'origin_title': 'target_title', 'n_clicks': 'n_clicks_backward'})
symm_df = pd.merge(forward_df, backward_df, on = ['origin_title', 'target_title'])

# Each bidirectional pair shows up twice, as (A, B) and (B, A). Sorting the two
# titles within each row gives both rows the same key, so drop_duplicates()
# keeps one row per unordered pair; its positional index lines up with the
# RangeIndex produced by the merge.
sort_df = np.sort(symm_df[['origin_title', 'target_title']].values.astype(str), axis = 1)
drop_df = pd.DataFrame(sort_df).drop_duplicates()
bilinks_df = symm_df.loc[drop_df.index].sort_values(by = ['n_clicks_forward'], ascending = False)

bilinks_df.to_csv(os.path.join(data_path, 'clickstream-enwiki-2018-08-bilinks.tsv'), sep = '\t', index = False)

# Simple Network Analysis

We can think of Wikipedia as a network with articles as nodes and links between articles as edges. With the clickstream data we can assign weights to the edges, which correspond to how often the edges were traversed. Remember that if an edge was traversed fewer than ten times in the month, it is not included in the data. The data set is on the large side, but with some patience we can load it into networkx and look at some basic properties of the network.

In [41]:
# Build the directed click graph: one edge per row of df_post, with the click
# count stored in the 'traffic' edge attribute.
# Zipping the three columns avoids DataFrame.iterrows(), which constructs a
# Series for every row and is extremely slow on a frame of this size.
# NOTE(review): the df_post preview further down shows columns named
# 'prev'/'curr' — confirm df_post really has 'prev_title'/'curr_title' here.
clickstream = nx.DiGraph()
clickstream.add_weighted_edges_from(
    zip(df_post['prev_title'], df_post['curr_title'], df_post['n']),
    weight = 'traffic',
)

First, let's look at the number and size distribution of [strongly connected components](https://en.wikipedia.org/wiki/Strongly_connected_component).

In [42]:
# Lazily yield one subgraph per strongly connected component.
# nx.strongly_connected_component_subgraphs() was removed in networkx 2.4; the
# documented replacement is to take G.subgraph(c) for each node set returned by
# nx.strongly_connected_components(G).
# This is a single-use generator: it is exhausted after one full iteration.
strongly_connected_components = (
    clickstream.subgraph(component_nodes)
    for component_nodes in nx.strongly_connected_components(clickstream)
)

In [43]:
# Node count of every strongly connected component, in iteration order.
component_sizes = [
    component.number_of_nodes()
    for component in strongly_connected_components
]

KeyboardInterrupt: 

In [90]:
# Show the sizes collected from the strongly connected components.
# NOTE(review): the recorded output below looks like a DataFrame shape rather
# than a list of sizes — it is probably stale; re-run to confirm.
component_sizes

(23141217, 4)

In [30]:
# Number of distinct nodes in the click graph.
clickstream.number_of_nodes()

4200855

In [None]:
# Count the strongly connected components directly from the graph.
# `strongly_connected_components` is a generator, so len() cannot be applied
# to it (and it may already have been consumed by an earlier cell).
nx.number_strongly_connected_components(clickstream)

In [76]:
# PageRank over the click graph, using the 'traffic' edge attribute as the
# edge weight and a damping factor of 0.9.
pr = nx.pagerank(
    clickstream,
    alpha = 0.9,
    weight = 'traffic',
)

In [13]:
# Per-article click totals. The 'n' column is selected explicitly so the
# aggregation does not depend on how pandas treats the non-numeric columns
# (current versions no longer drop them silently, which would break a blanket
# rename of the result's columns).
df_in = df_post.groupby('curr')['n'].sum().to_frame('in_count')    # pageviews per article
df_out = df_post.groupby('prev')['n'].sum().to_frame('out_count')  # link clicks per article

# Left join on the article title: articles that never appear as a source keep
# a missing out_count (and therefore a missing ratio).
df_in_out = df_in.join(df_out)
df_in_out['ratio'] = df_in_out['out_count'] / df_in_out['in_count']


In [14]:
# Five articles with the highest out/in click ratio.
# DataFrame.sort() was removed from pandas; sort_values() is the replacement
# (missing ratios are placed last by default).
df_in_out.sort_values('ratio', ascending = False).head(5)

Unnamed: 0_level_0,in_count,out_count,ratio
curr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Inch,141,2144,15.205674
List_of_Polish_gminas_(B),41,447,10.902439
List_of_Polish_gminas_(A),38,342,9.0
International_League_Most_Valuable_Player_Award,33,275,8.333333
List_of_largest_known_nebulae,31,254,8.193548


In [15]:
# Preview the first rows of df_post.
df_post.head()

Unnamed: 0,prev,curr,n,type
0,other-empty,!!,66,other
1,other-google,!!,110,other
2,other-wikipedia,!!,31,other
3,!_(disambiguation),!!,10,other
4,other-google,!!!_(album),17,other


# Simple Network Analysis

In [71]:
# Rebuild the directed click graph with the click count stored in the default
# 'weight' edge attribute.
# Zipping the columns replaces the row-by-row DataFrame.iterrows() loop, which
# is far too slow for a frame of this size.
# NOTE(review): the PageRank cell reads the 'traffic' attribute, not 'weight' —
# confirm which attribute name the downstream cells should use.
clickstream = nx.DiGraph()
clickstream.add_weighted_edges_from(
    zip(df_post['prev'], df_post['curr'], df_post['n'])
)

KeyboardInterrupt: 

In [61]:
# NOTE(review): adds a hand-made edge 'a' -> 'v' with weight 8 to the click
# graph (creating those nodes if they are not already present). This looks like
# a scratch check — confirm it should remain in the notebook.
clickstream.add_edge('a', 'v', weight = 8)

In [36]:
# Attach each source article's total outgoing clicks (df_out, indexed by
# article title) to every edge row, matching on the 'prev' column.
df_norm = df_post.merge(
    df_out,
    how = 'left',
    left_on = 'prev',
    right_index = True,
)

In [37]:
# Order edges by source article, then by target article.
# DataFrame.sort() was removed from pandas; sort_values() is the replacement.
df_norm = df_norm.sort_values(by = ['prev', 'curr'])

In [38]:
# Share of the source article's outgoing clicks that followed this edge.
df_norm['w'] = df_norm['n'].div(df_norm['out_count'])

In [40]:
# Preview the first 40 rows of the normalised edge table.
df_norm.head(40)

Unnamed: 0,prev,curr,n,type,out_count,w
11512688,!!,!!!,31,link,129,0.24031
2058414,!!,Chess_annotation_symbols,31,link,129,0.24031
14500433,!!,Double_factorial,26,link,129,0.20155
14928832,!!,Exclamation_mark,18,link,129,0.139535
20209148,!!,Retroflex_clicks,12,link,129,0.093023
22442410,!!,Universal_Character_Set_characters,11,other,129,0.085271
5,!!!,!!!_(album),311,link,1847,0.168381
11512757,!!!,!_(disambiguation),14,link,1847,0.00758
13312978,!!!,Cake_(band),10,link,1847,0.005414
2596570,!!!,Dance-punk,40,link,1847,0.021657


In [10]:
# Article to inspect, and how many of its top neighbours to list.
n = 10
node = "Alive"

In [11]:
# Top-n sources of traffic into `node`, by click count (descending).
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) is
# the replacement.
# NOTE(review): `df` is not defined in the visible cells — presumably the raw
# clickstream frame with prev/curr/n/type columns; confirm.
prev = df[df['curr'] == node].sort_values(by = 'n', ascending = False)[:n]
tuples = prev.values.tolist()
pprint(tuples)

[['other-wikipedia', 'Alive', 2221485, 'other'],
 ['other-empty', 'Alive', 22109, 'other'],
 ['Main_Page', 'Alive', 540, 'other'],
 ['Alive', 'Alive', 215, 'other'],
 ['other-google', 'Alive', 140, 'other'],
 ['LAN_Chile_Flight_210', 'Alive', 98, 'other'],
 ['other-other', 'Alive', 42, 'other'],
 ['other-yahoo', 'Alive', 40, 'other'],
 ['other-bing', 'Alive', 26, 'other'],
 ['Live', 'Alive', 23, 'other']]


In [12]:
# Top-n destinations reached from `node`, by click count (descending).
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) is
# the replacement.
# The result is still stored in `prev` to keep the notebook's variable names,
# even though these rows are the outgoing (next) links.
# NOTE(review): `df` is not defined in the visible cells — presumably the raw
# clickstream frame with prev/curr/n/type columns; confirm.
prev = df[df['prev'] == node].sort_values(by = 'n', ascending = False)[:n]
tuples = prev.values.tolist()
pprint(tuples)

[['Alive', 'Alive_(1993_film)', 1948, 'link'],
 ['Alive', 'Alive:_The_Story_of_the_Andes_Survivors', 482, 'link'],
 ['Alive', 'Alive', 215, 'other'],
 ['Alive', 'Bat_Out_of_Hell_III:_The_Monster_Is_Loose', 202, 'link'],
 ['Alive', 'Alive:_20_Years_Later', 106, 'link'],
 ['Alive', 'Alive_(Pearl_Jam_song)', 97, 'link'],
 ['Alive', 'Alive!_(Kiss_album)', 69, 'link'],
 ['Alive', 'Life', 68, 'link'],
 ['Alive', 'Alive_(2002_film)', 51, 'link'],
 ['Alive', 'Alive_(Natalie_Bassingthwaighte_song)', 48, 'link']]
