In [1]:
import pandas as pd
from urllib.parse import unquote
import networkx as nx
import matplotlib.pyplot as plt
from random import seed
from random import randint
import numpy as np

## Read in finished paths data

In [2]:
colnames =['ipAddress', 'timestamp', 'duration_Secs', 'path', 'rating']
finished_df = pd.read_csv('wikispeedia_paths-and-graph/paths_finished.tsv', sep='\t', skiprows=15, names=colnames, na_values='NULL')
finished_df['path'] = finished_df.apply(lambda row: unquote(row.path), axis=1)
finished_df.head()

Unnamed: 0,ipAddress,timestamp,duration_Secs,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0


## Drop unecessary columns & create new column with number of clicks

In [3]:
finished_df['click_Path'] = finished_df['path'].str.split(';')
finished_df['click_Count'] = finished_df.apply(lambda row: len(row.click_Path), axis=1)
finished_df = finished_df.drop(['ipAddress', 'timestamp', 'rating'], axis=1)
finished_df.head()

Unnamed: 0,duration_Secs,path,click_Path,click_Count
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...",9
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe, Africa, Atlantic_slave_...",5
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...",8
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece, Gr...",4
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...",7


## Add in binary feature for backclick and number of backclicks

In [4]:
finished_df['back_Click'] = finished_df.apply(lambda row: '<' in row.click_Path, axis=1)
finished_df.loc[finished_df['back_Click'] == True, 'back_Click'] = 1
finished_df.loc[finished_df['back_Click'] == False, 'back_Click'] = 0
finished_df['back_click_Count'] = finished_df.apply(lambda row: row.click_Path.count('<'), axis=1)
finished_df

Unnamed: 0,duration_Secs,path,click_Path,click_Count,back_Click,back_click_Count
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...",9,0,0
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...",8,0,0
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0
...,...,...,...,...,...,...
51313,66,Yagan;Ancient_Egypt;Civilization,"[Yagan, Ancient_Egypt, Civilization]",3,0,0
51314,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,"[Yagan, Folklore, Brothers_Grimm, <, 19th_cent...",9,1,1
51315,228,Yagan;Australia;England;France;United_States;T...,"[Yagan, Australia, England, France, United_Sta...",7,0,0
51316,56,"Yarralumla,_Australian_Capital_Territory;Austr...","[Yarralumla,_Australian_Capital_Territory, Aus...",4,0,0


## Add in a binay feature to indicate that paths were completed

In [5]:
finished_df['completed_Path'] = 1
finished_df

Unnamed: 0,duration_Secs,path,click_Path,click_Count,back_Click,back_click_Count,completed_Path
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1
...,...,...,...,...,...,...,...
51313,66,Yagan;Ancient_Egypt;Civilization,"[Yagan, Ancient_Egypt, Civilization]",3,0,0,1
51314,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,"[Yagan, Folklore, Brothers_Grimm, <, 19th_cent...",9,1,1,1
51315,228,Yagan;Australia;England;France;United_States;T...,"[Yagan, Australia, England, France, United_Sta...",7,0,0,1
51316,56,"Yarralumla,_Australian_Capital_Territory;Austr...","[Yarralumla,_Australian_Capital_Territory, Aus...",4,0,0,1


## Read in the unfinished paths data and drop unwanted columns

In [6]:
colnames_unfinished =['ipAddress', 'timestamp', 'duration_Secs', 'path', 'target', 'type']
unfinished_df = pd.read_csv('wikispeedia_paths-and-graph/paths_unfinished.tsv', sep='\t', skiprows=16, names=colnames_unfinished, na_values='NULL')
unfinished_df = unfinished_df.drop(['ipAddress', 'timestamp', 'type'], axis=1)
unfinished_df.head()

Unnamed: 0,duration_Secs,path,target
0,1804,Obi-Wan_Kenobi,Microsoft
1,1805,Julius_Caesar,Caracas
2,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade
3,49,Paraguay,Mount_St._Helens
4,1808,Paraguay;Bolivia,Mount_St._Helens


## Prepare data to match finished dataframe

In [7]:
unfinished_df['click_Path'] = unfinished_df['path'].str.split(';')
unfinished_df['click_Count'] = unfinished_df.apply(lambda row: len(row.click_Path), axis=1)
unfinished_df['back_Click'] = unfinished_df.apply(lambda row: '<' in row.click_Path, axis=1)
unfinished_df.loc[unfinished_df['back_Click'] == True, 'back_Click'] = 1
unfinished_df.loc[unfinished_df['back_Click'] == False, 'back_Click'] = 0
unfinished_df['back_click_Count'] = unfinished_df.apply(lambda row: row.click_Path.count('<'), axis=1)
unfinished_df['completed_Path'] = 0
unfinished_df

Unnamed: 0,duration_Secs,path,target,click_Path,click_Count,back_Click,back_click_Count,completed_Path
0,1804,Obi-Wan_Kenobi,Microsoft,[Obi-Wan_Kenobi],1,0,0,0
1,1805,Julius_Caesar,Caracas,[Julius_Caesar],1,0,0,0
2,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,"[Malawi, Democracy, Alexander_the_Great]",3,0,0,0
3,49,Paraguay,Mount_St._Helens,[Paraguay],1,0,0,0
4,1808,Paraguay;Bolivia,Mount_St._Helens,"[Paraguay, Bolivia]",2,0,0,0
...,...,...,...,...,...,...,...,...
24870,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,Cholera,"[Franz_Kafka, Tuberculosis, World_Health_Organ...",8,1,1,0
24871,6,Modern_history,Hollandic,[Modern_history],1,0,0,0
24872,1900,Computer_programming;Linguistics;Culture;Popul...,The_Beatles,"[Computer_programming, Linguistics, Culture, P...",5,1,1,0
24873,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,Alan_Turing,"[Jamaica, United_Kingdom, World_War_II, Battle...",4,0,0,0


## Balancing class distribution in unfinished

In [8]:
unfinished_df2 = unfinished_df.sample(n=26443, replace=True, axis=0)
frames = [unfinished_df,unfinished_df2]
unfin_df = pd.concat(frames)
unfin_df

Unnamed: 0,duration_Secs,path,target,click_Path,click_Count,back_Click,back_click_Count,completed_Path
0,1804,Obi-Wan_Kenobi,Microsoft,[Obi-Wan_Kenobi],1,0,0,0
1,1805,Julius_Caesar,Caracas,[Julius_Caesar],1,0,0,0
2,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,"[Malawi, Democracy, Alexander_the_Great]",3,0,0,0
3,49,Paraguay,Mount_St._Helens,[Paraguay],1,0,0,0
4,1808,Paraguay;Bolivia,Mount_St._Helens,"[Paraguay, Bolivia]",2,0,0,0
...,...,...,...,...,...,...,...,...
13247,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,A_cappella,"[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0
9940,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,1980_eruption_of_Mount_St._Helens,"[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0
4559,1838,Asteroid;Trojan_War,Viking,"[Asteroid, Trojan_War]",2,0,0,0
2582,1823,Marcel_Lefebvre;Switzerland,Chemical_synapse,"[Marcel_Lefebvre, Switzerland]",2,0,0,0


## Add in target variable to finished dataframe and merge

In [9]:
finished_df['target'] = finished_df.apply(lambda row: row.click_Path[-1], axis=1)
frames = [finished_df,unfin_df]
final_df = pd.concat(frames)
final_df['source'] = final_df.apply(lambda row: row.click_Path[0], axis=1)
final_df = final_df[final_df['click_Count'] != 1]
final_df

Unnamed: 0,duration_Secs,path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century
...,...,...,...,...,...,...,...,...,...
20132,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach
13247,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup
9940,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar
4559,1838,Asteroid;Trojan_War,"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid


## Add in source variable (potentially can be used for category tagging)

In [10]:
index = []
for x in range(76193):
    if 'GNU_Free_Documentation_License' in final_df.iloc[x,1]:
        index.append(x)
final_df.drop(final_df.index[index], axis=0, inplace=True)
final_df

Unnamed: 0,duration_Secs,path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century
...,...,...,...,...,...,...,...,...,...
20132,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach
13247,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup
9940,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar
4559,1838,Asteroid;Trojan_War,"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid


# Read in the links data & add it to a new dictionary

In [11]:
colnames =['Start_page', 'Linked_page']
links_df = pd.read_csv('wikispeedia_paths-and-graph/links.tsv', sep='\t', skiprows=11, names=colnames)
links_df.head()

Unnamed: 0,Start_page,Linked_page
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland


In [12]:
links_dict ={}
for x in links_df.index:
    start_Page = unquote(links_df["Start_page"][x])
    if start_Page in links_dict.keys():
        links_dict[start_Page].append(unquote(links_df["Linked_page"][x]))
    else:
        links_dict[start_Page] = [unquote(links_df["Linked_page"][x])]

# Read in the categories data & and add it to a dictionary

In [13]:
colnames=['Page', 'Categories']
categories_df = pd.read_csv('wikispeedia_paths-and-graph/categories.tsv', sep='\t', skiprows=12, names=colnames)
categories_df.head()

Unnamed: 0,Page,Categories
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists


In [14]:
categories_dict = {}
for x in categories_df.index:
    page = unquote(categories_df["Page"][x])
    if page in categories_dict.keys():
        categories_dict[page].append(unquote(categories_df["Categories"][x])[8:].split('.'))
    else:
        categories_dict[page] = [unquote(categories_df["Categories"][x])[8:].split('.')]

# Create category mapping for the first click

In [16]:
final_df['first_Click'] = final_df.apply(lambda row: len(row.click_Path) > 1, axis=1)
cats = []

for x in range(91547):
    if (final_df.iloc[x,3] >= 2):
        final_df.iloc[x,9] = final_df.iloc[x,2][1]
        if (final_df.iloc[x,9] in categories_dict.keys()):
            cats.append(categories_dict.get(final_df.iloc[x,9]))
        else:
            cats.append("none")
    else:
        cats.append("none")
final_df['category'] = cats

## Save first data frame 

In [17]:
cat = []
for x in range(91547):
    if (final_df.iloc[x,10] != 'none'):
        cat.append(final_df.iloc[x,10][0][0])
    else:
        cat.append('none')
final_df['first_Category'] = cat

final_df = final_df.drop(['first_Click', 'category'], axis=1)

## Create dummy variables

In [18]:
dummies = pd.get_dummies(final_df['first_Category'])
final_df = pd.concat([final_df, dummies], axis=1)
final_df.index = list(range(len(final_df.index)))
    
final_df.to_csv("df_main.csv", index=False)
print("Done!")
final_df

Done!


Unnamed: 0,duration_Secs,path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,first_Category,...,Geography,History,IT,Language_and_literature,Mathematics,Music,People,Religion,Science,none
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,History,...,0,1,0,0,0,0,0,0,0,0
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,Geography,...,1,0,0,0,0,0,0,0,0,0
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,Geography,...,1,0,0,0,0,0,0,0,0,0
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,History,...,0,1,0,0,0,0,0,0,0,0
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,Countries,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,Geography,...,1,0,0,0,0,0,0,0,0,0
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,Countries,...,0,0,0,0,0,0,0,0,0,0
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,Religion,...,0,0,0,0,0,0,0,1,0,0
91545,1838,Asteroid;Trojan_War,"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,History,...,0,1,0,0,0,0,0,0,0,0


## Save fragmented paths to a list

In [19]:
# final_df = pd.read_csv('df_main.csv')
df_fragPath = []
for x in range(91547):
    seed(101)
    path_len = len(list(final_df.iat[x, 2]))
    if path_len != 1:
        random_stop = randint(1,path_len-1)
        df_fragPath.append(list(final_df.iat[x, 2])[0:random_stop])
    else:
        df_fragPath.append(list(final_df.iat[x, 2]))

***

# Create a graph object with the category data as metadata

In [20]:
node_list = []
for key in categories_dict:
    node_list.append((key, {"subtree":categories_dict[key]}))

G = nx.DiGraph()
G.add_nodes_from(node_list)
list(G.nodes(data=True))

edge_list = []
for key in links_dict:
    for link in links_dict[key]:
        edge_list.append((key,link))
edge_list

G.add_edges_from(edge_list)
print(len(list(G.nodes())))
print(len(list(G.edges())))
G.add_edges_from([('Finland', 'Åland'),
 ('Finland', 'Åland'),
 ('Republic_of_Ireland', 'Éire'),
 ('Claude_Monet', 'Édouard_Manet'),
 ('Republic_of_Ireland', 'Éire'),
 ('Claude_Monet', 'Édouard_Manet'),
 ('Ireland', 'Éire'),
 ('Impressionism', 'Édouard_Manet'),
 ('Republic_of_Ireland', 'Éire'),
 ('Republic_of_Ireland', 'Éire'),
 ('Claude_Monet', 'Édouard_Manet')])
print(len(list(G.edges())))

4602
119882
119887


### Create Test DF

In [21]:
test_df = final_df.copy()
test_df.insert(2,'fragmented_Path', df_fragPath)

In [22]:
for n in range(91547):
    if test_df.iat[n,2] == test_df.iat[n,3] and len(test_df.iat[n,3]) !=1:
        print(n)

### Dict. of assigned start/target combos with indices -   (root,target):[indices]

In [23]:
unique_assignments = {}
for n in range(91547):
    if (unquote(final_df.iat[n,8]), unquote(final_df.iat[n,7])) not in unique_assignments.keys():
        unique_assignments[(unquote(final_df.iat[n,8]), unquote(final_df.iat[n,7]))] = [n]
    else:
        unique_assignments[(unquote(final_df.iat[n,8]), unquote(final_df.iat[n,7]))].append(n)

### Find impossible assignments

In [24]:
possible=[]
impossible=[]
for (root,target) in unique_assignments.keys():
    try:
        nx.dijkstra_path(G, root, target)
        possible.append(unique_assignments[(root,target)])
    except:
        impossible.append(unique_assignments[(root,target)])

### Make list of rows to drop and drop them

In [25]:
impossible_concat = []
for n in impossible:
    for m in n:
        impossible_concat.append(m)
print(len(test_df.index))
print(len(impossible_concat))
test_df.drop(test_df.index[impossible_concat], axis = 0, inplace = True)
print(len(test_df.index))

91547
39
91508


# Average out degree of the nodes in the path & add to final dataframe

In [27]:
avg_deg_out = []
for n in range(91508):
    degs = []
    path = list(test_df.iat[n,2])
    path = [value for value in path if value != '<']
    for m in range(len(path)):
        path[m] = unquote(path[m])
        degs.append(G.out_degree(path[m]))
    
    try:
        avg_deg_out.append(sum(degs) / len(degs))
    except:
        avg_deg_out.append('err')
#         print(n, fragmented_Path)
avg_deg_out
test_df['Avg_outDegree'] = avg_deg_out
assert 'err' not in avg_deg_out
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,History,IT,Language_and_literature,Mathematics,Music,People,Religion,Science,none,Avg_outDegree
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,1,0,0,0,0,0,0,0,0,66.500000
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,0,0,95.000000
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,0,0,74.600000
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,1,0,0,0,0,0,0,0,0,53.000000
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,0,0,0,0,0,0,55.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,0,0,0,0,0,0,0,31.666667
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,0,0,0,0,0,0,0,64.333333
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,0,0,0,0,0,1,0,0,64.800000
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,1,0,0,0,0,0,0,0,0,19.000000


# Function to clean up backspaces

In [28]:
def backspace_cleaner(path):
    totalcount = len([index for index, page in enumerate(path) if page=='<'])
    while totalcount > 0:
        count = 1
        backpress = [index for index, page in enumerate(path) if page=='<']
        to_fix = backpress[-1]
        current = backpress[-1]
        while count > 0:
            check = []
            for n in range(2*count):
                check.append(path[current-(n+1)])
            current -= len(check)
            count = len([index for index, page in enumerate(check) if page=='<'])
        path[to_fix] = path[current]
        totalcount = len([index for index, page in enumerate(path) if page=='<'])
    return(path)

# Average number of times each link in path is traversed

In [29]:
avg_link_traversals = []
for n in range(91508):
    path = backspace_cleaner(list(test_df.iat[n,2]))
    
    links = []
    for m in range(len(path)):
        path[m] = unquote(path[m])
    for m in range(len(path)-1):
        links.append([path[m],path[m+1]])
   
    unique_links = []
    for p in links:
        if p not in unique_links:
                unique_links.append(p)
    try:
        avg_link_traversals.append(len(links) / len(unique_links))
    except:
        avg_link_traversals.append(0) #chose 0 for single node paths. Maybe should be changed, or just omit paths
test_df['Avg_linkTraversals'] = avg_link_traversals
test_df.head()

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,IT,Language_and_literature,Mathematics,Music,People,Religion,Science,none,Avg_outDegree,Avg_linkTraversals
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,0,66.5,1.0
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,0,95.0,1.0
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,0,74.6,1.0
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,0,0,0,0,0,0,53.0,1.0
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,0,0,0,0,0,55.2,1.0


# Detourness: (number of links in shortest path from start to last node) / (links in path)

In [30]:
detourness = []
for n in range(91508):
    path = backspace_cleaner(list(test_df.iat[n,2]))
    for m in range(len(path)):
        path[m] = unquote(path[m])
    link_count = len(path) - 1
    try:
        shortest_path = nx.dijkstra_path(G, path[0], path[-1])
        detourness.append((len(shortest_path)-1)/link_count)
    except:
        detourness.append('err')
test_df['detourness'] = detourness
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,Language_and_literature,Mathematics,Music,People,Religion,Science,none,Avg_outDegree,Avg_linkTraversals,detourness
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,66.500000,1.0,0.666667
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,95.000000,1.0,1
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,0,74.600000,1.0,0.5
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,0,0,0,0,0,53.000000,1.0,1
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,0,0,0,0,55.200000,1.0,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,0,0,0,0,0,31.666667,1.0,1
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,0,0,0,0,0,64.333333,1.0,1
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,0,0,0,1,0,0,64.800000,1.0,0.5
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,0,0,0,0,0,0,19.000000,0.0,err


# The Relative Mean Distance from Root (RMDFR)
Mean distance from the root node to every other node in the clickstream graph normalized over the longest distance from starting node.

In [31]:
RMDFR = []
errors = []
for n in range(91508):
    path = backspace_cleaner(list(test_df.iat[n,2]))
    for m in range(len(path)):
        path[m] = unquote(path[m])
    if len(path)>1:
        DFR = []
        for m in range(1,len(path)):
            try:
                DFR.append(len(nx.dijkstra_path(G, path[0], path[m]))-1) #distance from root
            except:
                DFR.append(0)
                errors.append(n)
        #print(DFR)
    
        MDFR = sum(DFR)/len(DFR) # mean distance from root
        RMDFR.append(MDFR/max(DFR)) #relative mean distance from root
    else:
        RMDFR.append(0)
    
test_df['RMDFR'] = RMDFR
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,Mathematics,Music,People,Religion,Science,none,Avg_outDegree,Avg_linkTraversals,detourness,RMDFR
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,66.500000,1.0,0.666667,0.833333
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,95.000000,1.0,1,1.000000
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,0,74.600000,1.0,0.5,0.875000
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,0,0,0,0,53.000000,1.0,1,0.750000
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,0,0,0,55.200000,1.0,0.75,0.750000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,0,0,0,0,31.666667,1.0,1,0.750000
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,0,0,0,0,64.333333,1.0,1,0.750000
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,0,0,1,0,0,64.800000,1.0,0.5,0.875000
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,0,0,0,0,0,19.000000,0.0,err,0.000000


In [32]:
error_paths = []
for n in errors:
    path = backspace_cleaner(list(test_df.iat[n,2]))
    for m in range(len(path)):
        path[m] = unquote(path[m])
    error_paths.append(path)
edges_to_add = []
for broken_path in error_paths:
    counter = 0
    for m in range(len(broken_path)):
        if counter != 0:
            try:
                nx.dijkstra_path(G, broken_path[m-1], broken_path[m])
            except:
                edges_to_add.append((broken_path[m-1], broken_path[m]))
        counter += 1
        
edges_to_add

[]

# Connection ratio:(links used in path/all links connecting nodes in path)  

In [33]:
con_rat = []
for n in range(91508):
    path = backspace_cleaner(list(test_df.iat[n,2]))
    for m in range(len(path)):
        path[m] = unquote(path[m])
    sub = G.subgraph(path)
    edges = len(sub.edges())
    try:
        con_rat.append((len(path)-1)/edges)
    except:
        con_rat.append(0)
        #print(n,path) #errors for unfixed names and single-node paths?
test_df['connection_ratio'] = con_rat
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,Music,People,Religion,Science,none,Avg_outDegree,Avg_linkTraversals,detourness,RMDFR,connection_ratio
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,66.500000,1.0,0.666667,0.833333,0.428571
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,95.000000,1.0,1,1.000000,1.000000
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,0,74.600000,1.0,0.5,0.875000,0.571429
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,0,0,0,53.000000,1.0,1,0.750000,0.500000
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,0,0,55.200000,1.0,0.75,0.750000,0.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,0,0,0,31.666667,1.0,1,0.750000,1.000000
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,0,0,0,64.333333,1.0,1,0.750000,0.500000
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,0,1,0,0,64.800000,1.0,0.5,0.875000,0.444444
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,0,0,0,0,19.000000,0.0,err,0.000000,0.000000


# Length of Shortest Path from Root to Target
i threw unquote in here, havent tried running it to see if it worked

In [34]:
shortest_path = []
for n in range(91508):
    start = unquote(str(final_df.iat[n,2][0]))
    target = unquote(str(final_df.iat[n,7]))
    try:
        shortest_path.append(len(nx.dijkstra_path(G,start,target)))
    except:
        shortest_path.append('err') #naming errors
test_df['shortest_path'] = shortest_path
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,People,Religion,Science,none,Avg_outDegree,Avg_linkTraversals,detourness,RMDFR,connection_ratio,shortest_path
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,66.500000,1.0,0.666667,0.833333,0.428571,4
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,95.000000,1.0,1,1.000000,1.000000,4
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,0,74.600000,1.0,0.5,0.875000,0.571429,4
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,0,0,53.000000,1.0,1,0.750000,0.500000,3
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,0,55.200000,1.0,0.75,0.750000,0.800000,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,0,0,31.666667,1.0,1,0.750000,1.000000,4
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,0,0,64.333333,1.0,1,0.750000,0.500000,4
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,1,0,0,64.800000,1.0,0.5,0.875000,0.444444,4
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,0,0,0,19.000000,0.0,err,0.000000,0.000000,4


### Backtracking (% of traversals to previously visited nodes)

In [35]:
relative_backtracking = []
for n in range(91508):
    path = backspace_cleaner(list(test_df.iat[n,2]))
    uniques=[]
    for node in path:
        if node not in uniques:
            uniques.append(node)
    relative_backtracking.append( (len(path)-len(uniques))/len(path) )

test_df['relative_backtracking'] = relative_backtracking
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,Religion,Science,none,Avg_outDegree,Avg_linkTraversals,detourness,RMDFR,connection_ratio,shortest_path,relative_backtracking
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,0,66.500000,1.0,0.666667,0.833333,0.428571,4,0.0
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,0,95.000000,1.0,1,1.000000,1.000000,4,0.0
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,0,74.600000,1.0,0.5,0.875000,0.571429,4,0.0
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,0,53.000000,1.0,1,0.750000,0.500000,3,0.0
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,0,55.200000,1.0,0.75,0.750000,0.800000,4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,0,31.666667,1.0,1,0.750000,1.000000,4,0.0
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,0,64.333333,1.0,1,0.750000,0.500000,4,0.0
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,1,0,0,64.800000,1.0,0.5,0.875000,0.444444,4,0.0
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,0,0,19.000000,0.0,err,0.000000,0.000000,4,0.0


### Clickstream Compactness

In [36]:
click_comp = []
errors = []
for n in range(91508):    
    path = backspace_cleaner(list(test_df.iat[n,2]))
    for m in range(len(path)):
        path[m] = unquote(path[m])
    N=[]
    for node in path:
        if node not in N:
            N.append(node)
    n = len(N)
#     M = pd.DataFrame(np.zeros((len(path),len(path))), columns = path, index = path)
#     edges = [(path[n],path[n+1]) for n in range(len(path)-1)]
#     for (a,b) in edges:
#         M.at[a,b] = 1
    if n>1:
        C = pd.DataFrame(columns = path, index = path)
        for a in list(C.index):
            for b in list(C.index):
                try:
                    distance = len(nx.dijkstra_path(G,a,b))-1
                    C.at[a,b] = int(distance)
                except:
                    errors.append(n)
                    C.at[a,b] = n  #paper says to set inf distance to n, but how would there be nodes with inf distance in same clickstream?
        Csum = C.values.sum()
        click_comp.append( ((n**2)*(n-1)-Csum) / (n*((n-1)**2)) )
    else:
        click_comp.append(0)
test_df['clickstream_compactness'] = click_comp
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,Science,none,Avg_outDegree,Avg_linkTraversals,detourness,RMDFR,connection_ratio,shortest_path,relative_backtracking,clickstream_compactness
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,0,66.500000,1.0,0.666667,0.833333,0.428571,4,0.0,0.861111
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,0,95.000000,1.0,1,1.000000,1.000000,4,0.0,0.500000
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,0,74.600000,1.0,0.5,0.875000,0.571429,4,0.0,0.825000
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,0,53.000000,1.0,1,0.750000,0.500000,3,0.0,0.833333
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,0,55.200000,1.0,0.75,0.750000,0.800000,4,0.0,0.787500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,0,31.666667,1.0,1,0.750000,1.000000,4,0.0,0.500000
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,0,64.333333,1.0,1,0.750000,0.500000,4,0.0,0.833333
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,0,64.800000,1.0,0.5,0.875000,0.444444,4,0.0,0.837500
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,0,19.000000,0.0,err,0.000000,0.000000,4,0.0,0.000000


## Distance to target

In [37]:
current_distance_to_target = []
for n in range(91508):
    current = unquote(str(test_df.iat[n,2][-1]))
    target = unquote(str(test_df.iat[n,8]))
    try:
        current_distance_to_target.append(len(nx.dijkstra_path(G,current,target)))
    except:
        current_distance_to_target.append('err') #naming errors
test_df['current_distance_to_target'] = current_distance_to_target
test_df

Unnamed: 0,duration_Secs,path,fragmented_Path,click_Path,click_Count,back_Click,back_click_Count,completed_Path,target,source,...,none,Avg_outDegree,Avg_linkTraversals,detourness,RMDFR,connection_ratio,shortest_path,relative_backtracking,clickstream_compactness,current_distance_to_target
0,166,14th_century;15th_century;16th_century;Pacific...,"[14th_century, 15th_century, 16th_century, Pac...","[14th_century, 15th_century, 16th_century, Pac...",9,0,0,1,African_slave_trade,14th_century,...,0,66.500000,1.0,0.666667,0.833333,0.428571,4,0.0,0.861111,4
1,88,14th_century;Europe;Africa;Atlantic_slave_trad...,"[14th_century, Europe]","[14th_century, Europe, Africa, Atlantic_slave_...",5,0,0,1,African_slave_trade,14th_century,...,0,95.000000,1.0,1,1.000000,1.000000,4,0.0,0.500000,4
2,138,14th_century;Niger;Nigeria;British_Empire;Slav...,"[14th_century, Niger, Nigeria, British_Empire,...","[14th_century, Niger, Nigeria, British_Empire,...",8,0,0,1,African_slave_trade,14th_century,...,0,74.600000,1.0,0.5,0.875000,0.571429,4,0.0,0.825000,3
3,37,14th_century;Renaissance;Ancient_Greece;Greece,"[14th_century, Renaissance, Ancient_Greece]","[14th_century, Renaissance, Ancient_Greece, Gr...",4,0,0,1,Greece,14th_century,...,0,53.000000,1.0,1,0.750000,0.500000,3,0.0,0.833333,2
4,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,"[14th_century, Italy, Roman_Catholic_Church, H...","[14th_century, Italy, Roman_Catholic_Church, H...",7,0,0,1,John_F._Kennedy,14th_century,...,0,55.200000,1.0,0.75,0.750000,0.800000,4,0.0,0.787500,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91542,30,Beach;Geology;Science;Mathematics,"[Beach, Geology, Science]","[Beach, Geology, Science, Mathematics]",4,0,0,0,African_Union,Beach,...,0,31.666667,1.0,1,0.750000,1.000000,4,0.0,0.500000,4
91543,1949,FIFA_World_Cup;Spain;Tourism;The_Ashes,"[FIFA_World_Cup, Spain, Tourism]","[FIFA_World_Cup, Spain, Tourism, The_Ashes]",4,0,0,0,A_cappella,FIFA_World_Cup,...,0,64.333333,1.0,1,0.750000,0.500000,4,0.0,0.833333,4
91544,112,Julius_Caesar;Roman_mythology;1st_century_BC;1...,"[Julius_Caesar, Roman_mythology, 1st_century_B...","[Julius_Caesar, Roman_mythology, 1st_century_B...",7,1,1,0,1980_eruption_of_Mount_St._Helens,Julius_Caesar,...,0,64.800000,1.0,0.5,0.875000,0.444444,4,0.0,0.837500,4
91545,1838,Asteroid;Trojan_War,[Asteroid],"[Asteroid, Trojan_War]",2,0,0,0,Viking,Asteroid,...,0,19.000000,0.0,err,0.000000,0.000000,4,0.0,0.000000,4


In [38]:
test_df.to_csv("model_Data.csv", index=False)
print("Done!")

Done!
