In [1]:
import json
import os

import pandas as pd

In [None]:
# Load the page-level link table; the previews below show the columns
# from_url, to_url and anchor_text.
page_links_csv = '../Data/Page_rank_files/page_rank_links.csv'
df = pd.read_csv(page_links_csv)

In [None]:
# Count the distinct URLs that occur on either end of a link
# (link sources first, then link targets).
urls_both_ends = pd.concat([df['from_url'], df['to_url']])
df_unique_urls = urls_both_ends.unique()
print(f"Number of unique URLs: {len(df_unique_urls)}")

Number of unique URLs: 3582477


In [4]:
# Peek at the last five rows of the link table.
df.tail(n=5)

Unnamed: 0,from_url,to_url,anchor_text
14393079,https://sports.yahoo.com/video/nfl-adjusts-bet...,https://www.instagram.com/yahoosports,
14393080,https://sports.yahoo.com/video/nfl-adjusts-bet...,https://www.youtube.com/@YahooSports,
14393081,https://sports.yahoo.com/video/nfl-adjusts-bet...,https://www.snapchat.com/add/yahoosports,
14393082,https://sports.yahoo.com/video/nfl-adjusts-bet...,https://apps.apple.com/us/app/yahoo-sports-liv...,
14393083,https://sports.yahoo.com/video/nfl-adjusts-bet...,https://play.google.com/store/apps/details?id=...,


In [5]:
# Most cited pages: for every link target, count the non-null values of the
# remaining columns, then list the targets with the most incoming links first.
(
    df.groupby('to_url')
    .count()
    .sort_values(by='from_url', ascending=False)
)

Unnamed: 0_level_0,from_url,anchor_text
to_url,Unnamed: 1_level_1,Unnamed: 2_level_1
https://en.wikipedia.org/wiki/ISBN_(identifier),53392,53392
https://en.wikipedia.org/wiki/Doi_(identifier),29329,29329
https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy,21867,21867
https://en.wikipedia.org/wiki/Main_Page,20461,10303
https://en.wikipedia.org/wiki/Wikipedia:About,20368,20368
...,...,...
https://en.wikipedia.org/wiki/Johann_Strauss_Sr.,1,1
https://en.wikipedia.org/wiki/Johann_Trollmann,1,1
https://en.wikipedia.org/wiki/Johann_Valentin_Andreae,1,1
https://en.wikipedia.org/wiki/Johann_Viktor_Bredt,1,1


In [6]:
# Delete garbage links from the data, i.e. targets that are not pages:
# inline "data:" payloads (e.g. Base64 images), "javascript:" pseudo-links,
# and "mailto:" / "xmpp:" addresses.
# NOTE: the prefix match is case-sensitive, so a target such as "Mailto:..."
# is not removed by this filter.
# .copy() turns the filtered result into an independent frame; without it the
# column assignments made on `df` further down operate on a slice of the
# original frame and pandas emits SettingWithCopyWarning.
df = df[
    ~df["to_url"].str.startswith(("data:", "javascript:", "mailto:", "xmpp:"))
].copy()

In [7]:
# Most cited pages, recomputed on the current contents of `df`:
# per-target non-null counts, largest number of incoming links first.
(
    df.groupby('to_url')
    .count()
    .sort_values(by='from_url', ascending=False)
)

Unnamed: 0_level_0,from_url,anchor_text
to_url,Unnamed: 1_level_1,Unnamed: 2_level_1
https://en.wikipedia.org/wiki/ISBN_(identifier),53392,53392
https://en.wikipedia.org/wiki/Doi_(identifier),29329,29329
https://foundation.wikimedia.org/wiki/Special:MyLanguage/Policy:Privacy_policy,21867,21867
https://en.wikipedia.org/wiki/Main_Page,20461,10303
https://en.wikipedia.org/wiki/Wikipedia:About,20368,20368
...,...,...
https://en.wikipedia.org/wiki/James_Phillips_(kickboxer),1,1
https://en.wikipedia.org/wiki/James_Piereson,1,1
https://en.wikipedia.org/wiki/James_Pilkington_(bishop),1,1
https://en.wikipedia.org/wiki/James_Planch%C3%A9,1,1


In [8]:
# Number of link rows currently held in `df`.
len(df.index)

14316218

In [9]:
# Write the link table back to the same file the notebook loads `df` from.
# The path uses the same '../Data/...' form as the load cell so that both
# resolve to one location regardless of which cell is run.
# NOTE: this overwrites the input file in place.
df.to_csv('../Data/Page_rank_files/page_rank_links.csv', index=False)

In [12]:
# Load the domain-level link table (columns from_domain / to_domain in the
# preview below). The path follows the '../Data/...' form used by the other
# load/save cells so every cell points at the same data folder.
domain_ds = pd.read_csv('../Data/Page_rank_files/domain_rank_links.csv')

In [13]:
# Peek at the last five rows of the domain-level link table.
domain_ds.tail(n=5)

Unnamed: 0,from_domain,to_domain
2140708,sports.yahoo.com,www.instagram.com
2140709,sports.yahoo.com,www.youtube.com
2140710,sports.yahoo.com,www.snapchat.com
2140711,sports.yahoo.com,apps.apple.com
2140712,sports.yahoo.com,play.google.com


In [14]:
# Incoming links per target domain (non-null from_domain values),
# most linked-to domains first.
(
    domain_ds.groupby('to_domain')
    .count()
    .sort_values(by='from_domain', ascending=False)
)

Unnamed: 0_level_0,from_domain
to_domain,Unnamed: 1_level_1
web.archive.org,103680
foundation.wikimedia.org,55355
www.facebook.com,40222
doi.org,37222
twitter.com,32816
...,...
specialforcestraining.info,1
specialcollections.williams.edu,1
specialcollections.le.ac.uk,1
special.lib.umn.edu,1


In [18]:
# Outgoing links per source domain, most prolific sources first.
# count() only counts non-null to_domain values, which is why some source
# domains show up with 0 at the bottom of the table.
(
    domain_ds.groupby('from_domain')
    .count()
    .sort_values(by='to_domain', ascending=False)
)

Unnamed: 0_level_0,to_domain
from_domain,Unnamed: 1_level_1
en.wikipedia.org,1167003
www.metmuseum.org,45984
football.fandom.com,45848
www.bbc.co.uk,41618
www.bbc.com,39284
...,...
www.citypopulation.de,0
quotefancy.com,0
tristandc.com,0
www.police.gov.hk,0


### URL TO ID mapping

The next section of the notebook assigns an integer ID to every URL in our dataset. Working with IDs instead of URL strings makes the PageRank calculations easier.

In [15]:
# Number every distinct URL in order of first appearance (link sources first,
# then link targets). .unique() already removes duplicates, so enumerating its
# result hands out consecutive IDs starting at 0.
unique_urls = pd.concat([df['from_url'], df['to_url']]).unique()

id_to_url = dict(enumerate(unique_urls))
url_to_id = {url: url_id for url_id, url in id_to_url.items()}

# Next unused ID, which equals the number of URLs that received one.
current_id = len(url_to_id)

In [17]:
# Display the ID counter left over from the URL numbering cell above;
# it holds the next unused ID, i.e. how many URLs were assigned an ID.
current_id

3582477

In [None]:
# Persist both URL lookup tables as JSON (json is imported with the other
# modules at the top of the notebook).
# JSON object keys are always strings, so the integer keys of `id_to_url`
# are written as "0", "1", ... and come back as strings when reloaded.
with open('../Data/Page_rank_files/url_to_id.json', 'w') as f:
    json.dump(url_to_id, f, indent=4)

with open('../Data/Page_rank_files/id_to_url.json', 'w') as f:
    json.dump(id_to_url, f, indent=4)

In [19]:
# Attach the numeric ID of each endpoint to every link row.
for id_col, url_col in (("from_id", "from_url"), ("to_id", "to_url")):
    df[id_col] = df[url_col].map(url_to_id)

df.head()

Unnamed: 0,from_url,to_url,anchor_text,from_id,to_id
0,https://www.edx.org/learn/computer-science/har...,https://www.edx.org/,,0,17
1,https://www.edx.org/learn/computer-science/har...,https://www.edx.org/executive-education/massac...,Artificial Intelligence: Implications for Busi...,0,48080
2,https://www.edx.org/learn/computer-science/har...,https://www.edx.org/learn/supply-chain-managem...,Supply Chain DynamicsMITx|Course,0,48081
3,https://www.edx.org/learn/computer-science/har...,https://www.edx.org/learn/python/the-georgia-i...,Computing in Python III: Data StructuresGTx|Co...,0,48082
4,https://www.edx.org/learn/computer-science/har...,https://www.edx.org/learn/leadership/harvard-u...,Exercising Leadership: Foundational Principles...,0,48083


In [20]:
# Save the link table reduced to its two ID columns (one edge per row).
# The path follows the '../Data/...' form used by the load cell so that the
# file lands next to the data it was derived from.
df[["from_id", "to_id"]].to_csv('../Data/Page_rank_files/page_rank_links_ids.csv', index=False)

In [27]:
# Load the domain-level link table for cleaning and show its row count.
# The path follows the '../Data/...' form that the save cell for this table
# uses, so load and save refer to the same file.
df_dom = pd.read_csv('../Data/Page_rank_files/domain_rank_links.csv')
len(df_dom)

2057939

In [28]:
# Discard every row that has a missing value in any column.
df_dom = df_dom.dropna(axis=0, how='any')
# Characters that disqualify a value from being treated as a bare domain:
# the space plus a fixed list of URL punctuation characters.
_DISALLOWED_DOMAIN_CHARS = frozenset(" /\\:?#[]@!$&'()*+,;=")


def is_valid_domain(domain):
    """Return True if `domain` looks like a bare domain name.

    A value is accepted only when it is a string that is not empty or
    whitespace-only, contains none of the disallowed characters (space,
    slash, backslash, colon and the other punctuation listed above), and
    contains at least one dot.

    Args:
        domain: The value to check; may be of any type.

    Returns:
        bool: True when every check passes, False otherwise.
    """
    if not isinstance(domain, str):
        return False
    if not domain.strip():
        return False
    if not _DISALLOWED_DOMAIN_CHARS.isdisjoint(domain):
        return False
    return '.' in domain

# Keep only the rows whose source and target both pass is_valid_domain,
# then renumber the index from 0 and preview the result.
from_ok = df_dom['from_domain'].apply(is_valid_domain)
to_ok = df_dom['to_domain'].apply(is_valid_domain)
df_dom = df_dom[from_ok & to_ok].reset_index(drop=True)

print(df_dom.head())

   from_domain         to_domain
0  www.edx.org  business.edx.org
1  www.edx.org  www.facebook.com
2  www.edx.org             x.com
3  www.edx.org  www.linkedin.com
4  www.edx.org    www.reddit.com


In [29]:
# Number of domain link rows left after cleaning.
len(df_dom.index)

2055033

In [None]:
# Write the cleaned domain link table to the domain links CSV, without the index.
domain_links_csv = '../Data/Page_rank_files/domain_rank_links.csv'
df_dom.to_csv(domain_links_csv, index=False)

In [31]:
# Number every distinct domain in order of first appearance (source domains
# first, then target domains). .unique() already removes duplicates, so
# enumerating its result hands out consecutive IDs starting at 0.
unique_domains = pd.concat([df_dom['from_domain'], df_dom['to_domain']]).unique()

id_to_dom = dict(enumerate(unique_domains))
dom_to_id = {dom: dom_id for dom_id, dom in id_to_dom.items()}

# Next unused ID, which equals the number of domains that received one.
current_id = len(dom_to_id)

In [32]:
# Persist both domain lookup tables as JSON (json is imported with the other
# modules at the top of the notebook). The paths follow the '../Data/...'
# form used by the neighbouring domain CSV cells so all outputs share a folder.
# JSON object keys are always strings, so the integer keys of `id_to_dom`
# are written as "0", "1", ... and come back as strings when reloaded.
with open('../Data/Page_rank_files/domain_to_id.json', 'w') as f:
    json.dump(dom_to_id, f, indent=4)
with open('../Data/Page_rank_files/id_to_domain.json', 'w') as f:
    json.dump(id_to_dom, f, indent=4)

In [33]:
# Attach the numeric ID of each endpoint to every domain link row.
for id_col, dom_col in (("from_id", "from_domain"), ("to_id", "to_domain")):
    df_dom[id_col] = df_dom[dom_col].map(dom_to_id)

In [None]:
# Save the domain link table reduced to its two ID columns (one edge per row).
domain_edge_ids = df_dom[["from_id", "to_id"]]
domain_edge_ids.to_csv('../Data/Page_rank_files/domain_rank_links_ids.csv', index=False)

In [35]:
# Display the ID counter left over from the domain numbering cell above;
# it holds the next unused ID, i.e. how many domains were assigned an ID.
current_id

102354