In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as plt
from helpers import *


# Importing Data

### Metadata about the articles

In [2]:
#Articles is the name of all the articles in our dataset, the index serves also as an id to retrieve articles in other files
# the names are url encoded
articles_df = pd.read_csv("./data/wikispeedia_paths-and-graph/articles.tsv",names = ["name"] ,skiprows=12,sep="\t")
#The categories of the articles, row index is the same as for the articles df
categories_df = pd.read_csv("./data/wikispeedia_paths-and-graph/categories.tsv",names = ["name","category"],skiprows=13, sep="\t")
categories_df = categories_df.groupby(["name"]).agg(lambda x : list(x))
categories_df = articles_df.join(categories_df,on="name",how="left")
categories_df.category = categories_df.category.apply(lambda d: d if isinstance(d, list) else [])
#each row of of the links df contains the list of article names that an article links to
links_df = pd.read_csv("./data/wikispeedia_paths-and-graph/links.tsv",skiprows=12, names = ["from","to"],sep="\t")
out_links_df = links_df.groupby(["from"]).agg(lambda x : list(x))
out_links_df = pd.merge(articles_df,out_links_df,left_on="name",right_on="from",how="left")
out_links_df.to = out_links_df.to.apply(lambda d: d if isinstance(d, list) else [])

in_links_df = links_df.groupby(["to"]).agg(lambda x : list(x))
in_links_df = pd.merge(articles_df,in_links_df,left_on="name",right_on="to",how="left")
in_links_df["from"] = in_links_df["from"].apply(lambda d: d if isinstance(d, list) else [])
#
shortest_path_distance = pd.read_csv("./data/wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt",names=["distances"], skiprows=17)



print("Articles")
display(articles_df)
print("Categories")
display(categories_df)
print("Links")
display(links_df)
print("shortest_path_distance")
display(shortest_path_distance)


Articles


Unnamed: 0,name
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
1,%C3%85land
2,%C3%89douard_Manet
3,%C3%89ire
4,%C3%93engus_I_of_the_Picts
...,...
4599,Zionism
4600,Zirconium
4601,Zoroaster
4602,Zuid-Gelders


Categories


Unnamed: 0,name,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,[subject.History.British_History.British_Histo...
1,%C3%85land,"[subject.Countries, subject.Geography.European..."
2,%C3%89douard_Manet,[subject.People.Artists]
3,%C3%89ire,"[subject.Countries, subject.Geography.European..."
4,%C3%93engus_I_of_the_Picts,[subject.History.British_History.British_Histo...
...,...,...
4599,Zionism,"[subject.People.Political_People, subject.Reli..."
4600,Zirconium,[subject.Science.Chemistry.Chemical_elements]
4601,Zoroaster,[subject.People.Religious_figures_and_leaders]
4602,Zuid-Gelders,"[subject.Geography.European_Geography, subject..."


Links


Unnamed: 0,from,to
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland
...,...,...
119877,Zulu,South_Africa
119878,Zulu,Swaziland
119879,Zulu,United_Kingdom
119880,Zulu,Zambia


shortest_path_distance


Unnamed: 0,distances
0,0_____33333325634333435_2433544334_3_422343544...
1,_0____22222325623232424_2422544324_3_312242544...
2,__0___33222425623232324_2333444433_3_422343434...
3,___0__33333325634233334_2433434333_2_423343433...
4,____0_22323335633332435_2433545434_3_423343544...
...,...
4599,______22222325622231424_1322544334_3_422232544...
4600,______33333434523232434_2332544324_3_323333544...
4601,______22222424522231434_2322545434_3_422232544...
4602,______33333436733342435_2433545444_3_523353544...


In [3]:
G=nx.from_pandas_edgelist(links_df,source="from",target="to",create_using=nx.DiGraph())
print((nx.shortest_path_length(G,source="Asteroid",target="Viking")))
#print(nx.shortest_path_length(G,source="%C3%81ed%C3%A1n_mac_Gabr%C3%A1in",target="%C3%89douard_Manet"))

3


### Played games dataset

In [4]:
paths_finished_df = pd.read_csv("./data/wikispeedia_paths-and-graph/paths_finished.tsv",names = ["hashedIpAddress","timestamp","durationInSec","path","rating"],skiprows=16, sep="\t")
paths_finished_df["path"] = paths_finished_df["path"].apply(lambda x: x.split(";"))
paths_finished_df["start"] = paths_finished_df["path"].apply(lambda x: x[0])
paths_finished_df["target"] = paths_finished_df["path"].apply(lambda x: x[-1])
paths_finished_df["path"] = paths_finished_df["path"].apply(lambda x: x[1:-1])
paths_finished_df["final_path"] = paths_finished_df["path"].apply(remove_backs)
paths_unfinished_df = pd.read_csv("./data/wikispeedia_paths-and-graph/paths_unfinished.tsv",names=["hashedIpAddress",   "timestamp ",  "durationInSec",  "path",   "target",   "type"],skiprows=17, sep="\t")
paths_unfinished_df["path"] = paths_unfinished_df["path"].apply(lambda x: x.split(";"))
paths_unfinished_df["start"] = paths_unfinished_df["path"].apply(lambda x: x[0])
paths_unfinished_df["path"] = paths_unfinished_df["path"].apply(lambda x: x[1:])
paths_unfinished_df["final_path"] = paths_unfinished_df["path"].apply(remove_backs)
#deleting rows where the user would probably just started a game and forgot about it (timeout and no visited pages)
paths_unfinished_df = paths_unfinished_df.drop(paths_unfinished_df[(paths_unfinished_df["durationInSec"]>1800) & (paths_unfinished_df["path"].apply(lambda x: len(x)==0))].index)

print("finished paths")
display(paths_finished_df)
print("unfinished paths")
display(paths_unfinished_df)

finished paths


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,start,target,final_path
0,6a3701d319fc3754,1297740409,166,"[Atlantic_slave_trade, Africa, Accra, Atlantic...",,14th_century,African_slave_trade,"[15th_century, 16th_century, Pacific_Ocean, At..."
1,3824310e536af032,1344753412,88,"[Atlantic_slave_trade, Africa, Europe]",3.0,14th_century,African_slave_trade,"[Europe, Africa, Atlantic_slave_trade]"
2,415612e93584d30e,1349298640,138,"[Atlantic_slave_trade, Africa, Slavery, Britis...",,14th_century,African_slave_trade,"[Niger, Nigeria, British_Empire, Slavery, Afri..."
3,64dd5cd342e3780c,1265613925,37,"[Ancient_Greece, Renaissance]",,14th_century,Greece,"[Renaissance, Ancient_Greece]"
4,015245d773376aab,1366730828,175,"[President_of_the_United_States, Ronald_Reagan...",3.0,14th_century,John_F._Kennedy,"[Italy, Roman_Catholic_Church, HIV, Ronald_Rea..."
...,...,...,...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,[Ancient_Egypt],,Yagan,Civilization,[Ancient_Egypt]
51314,2ef7ac844cefda58,1300254138,165,"[Novel, A_Christmas_Carol, Charles_Dickens, 19...",3.0,Yagan,Fiction,"[Folklore, 19th_century, Charles_Dickens, A_Ch..."
51315,12863abb7887f890,1385095372,228,"[Tennis, United_States, France, England, Austr...",,Yagan,U.S._Open_%28tennis%29,"[Australia, England, France, United_States, Te..."
51316,19f8284371753362,1298792567,56,"[United_States, Australia]",1.0,Yarralumla%2C_Australian_Capital_Territory,Abraham_Lincoln,"[Australia, United_States]"


unfinished paths


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type,start,final_path
2,2b015fb8181c48f2,1297090819,1818,"[Alexander_the_Great, Democracy]",First_Crusade,timeout,Malawi,"[Democracy, Alexander_the_Great]"
3,53a53bc244e08a6a,1297094761,49,[],Mount_St._Helens,restart,Paraguay,[]
4,53a53bc244e08a6a,1297099105,1808,[Bolivia],Mount_St._Helens,timeout,Paraguay,[Bolivia]
5,131600803df4895e,1297100557,2009,"[Yangtze_River, China, History_of_the_world]",Grand_Canal_of_China,timeout,Agriculture,"[History_of_the_world, China, Yangtze_River]"
6,486bb79910fe9dd2,1297101660,1932,"[Asia, Afghanistan, Ancient_Greece, Renaissanc...",Scouting,timeout,Mind,"[Christianity, Europe, Renaissance, Ancient_Gr..."
...,...,...,...,...,...,...,...,...
24870,109ed71f571d86e9,1389787605,180,"[Ghana, Sub-Saharan_Africa, <, Philippines, Ma...",Cholera,restart,Franz_Kafka,"[Tuberculosis, World_Health_Organization, Mala..."
24871,232f992e57d43e8d,1389787697,6,[],Hollandic,restart,Modern_history,[]
24872,2e09a7224600a7cd,1389798400,1900,"[<, Popular_culture, Culture, Linguistics]",The_Beatles,timeout,Computer_programming,"[Linguistics, Culture]"
24873,60af9e2138051b96,1389799481,1903,"[Battle_of_Midway, World_War_II, United_Kingdom]",Alan_Turing,timeout,Jamaica,"[United_Kingdom, World_War_II, Battle_of_Midway]"


In [5]:
out_num_links = out_links_df["to"].apply(lambda x: len(x))
display(out_num_links[out_num_links==(out_num_links.max())])

in_num_links = in_links_df["from"].apply(lambda x: len(x))
display(in_num_links[in_num_links==(in_num_links.max())])

4297    294
Name: to, dtype: int64

4297    1551
Name: from, dtype: int64

In [6]:
#list of articles that were visited without the start and finish
visited_nodes_without_sf=paths_finished_df["final_path"].explode().value_counts()
visited_nodes_without_sf

United_States             8567
Europe                    4185
United_Kingdom            3665
Earth                     3076
England                   2937
                          ... 
Battle_of_Jutland            1
Guangzhou                    1
Anglo-Saxon_literature       1
Chennai                      1
Arcadia_%28play%29           1
Name: final_path, Length: 3280, dtype: int64

In [7]:
visited_nodes_without_sf=paths_unfinished_df["final_path"].explode().value_counts()
visited_nodes_without_sf

United_States              3348
United_Kingdom             1307
Europe                     1154
England                    1068
Earth                       940
                           ... 
Gaur                          1
Indian_Railways               1
Bantu                         1
Shackleton_%28crater%29       1
Anschluss                     1
Name: final_path, Length: 3073, dtype: int64

In [8]:
#Top frequent games played, allowing us to make comparisons
games_df=paths_finished_df[["start","target","path"]].groupby(["start","target"]).count().sort_values(by="path",axis=0,ascending=False)[:10]
display(paths_finished_df[["start","target","path"]].groupby(["start","target"]).agg(list))
display(games_df)
games_df=paths_unfinished_df[["start","target","path"]].groupby(["start","target"]).count().sort_values(by="path",axis=0,ascending=False)[:10]
display(games_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,path
start,target,Unnamed: 2_level_1
%E2%82%AC2_commemorative_coins,Irish_Sea,[[Ireland]]
10th_century,11th_century,"[[], [], []]"
10th_century,Banknote,"[[Coin, Silver, Maya_civilization]]"
10th_century,Country,[[Germany]]
10th_century,Harlem_Globetrotters,"[[Basketball, United_States, France], [Basketb..."
...,...,...
Zulu,Arctic_Circle,"[[Arctic, Canada, English_language], [Arctic, ..."
Zulu,Doom,"[[Computer_and_video_games, Computer, 20th_cen..."
Zulu,Jesus,[[Christianity]]
Zulu,Language,[[English_language]]


Unnamed: 0_level_0,Unnamed: 1_level_0,path
start,target,Unnamed: 2_level_1
Asteroid,Viking,1043
Brain,Telephone,1040
Theatre,Zebra,905
Pyramid,Bean,642
Batman,Wood,148
Bird,Great_white_shark,138
Batman,The_Holocaust,119
Bird,Adolf_Hitler,107
Beer,Sun,99
Batman,Banana,69


Unnamed: 0_level_0,Unnamed: 1_level_0,path
start,target,Unnamed: 2_level_1
Brain,Telephone,906
Pyramid,Bean,902
Theatre,Zebra,806
Asteroid,Viking,679
Batman,Wood,69
Batman,Banana,53
Beer,Sun,41
Cat,Microsoft,40
Aircraft,Google,40
Dog,Telephone,36


##### We can see that there are some instances of the game that were played multiple times, and these games are both finished and unfinished,this can allow us to explore the different paths taken by different players (or same player), and analyize the pattern between them, especially comparing it to the shortest path and also comapring the different strategies