In [None]:
!pip install graphistry==0.20.2 yarl==1.7.0

## Extract Elements from a URL

In [1]:
from yarl import URL

In [2]:
# Parse the address into a yarl URL object; its components are read as attributes below.
url = URL("https://github.com/search?q=data+science")

In [3]:
# Protocol part of the URL (the text before "://").
url.scheme

'https'

In [4]:
# Host (domain) part of the URL.
url.host

'github.com'

In [5]:
# Path that follows the host, without the query string.
url.path

'/search'

In [6]:
# Query part after "?"; the "+" in the original address is shown decoded as a space.
url.query_string

'q=data science'

In [7]:
# Rebind `url` to an address that carries a "#contents" anchor and read the anchor text.
url = URL("https://github.com/khuyentran1401/Data-science#contents")
url.fragment

'contents'

In [8]:
import pandas as pd

# Load the URL dataset. Column names are supplied explicitly through `names`,
# and the first column of the file is used as the row index.
data = pd.read_csv("URL Classification.csv", names=["url", "Type"], index_col=0)
# Bare expression: shows a preview of the frame.
data

Unnamed: 0,url,Type
1,http://www.liquidgeneration.com/,Adult
2,http://www.onlineanime.org/,Adult
3,http://www.ceres.dti.ne.jp/~nekoi/senno/senfir...,Adult
4,http://www.galeon.com/kmh/,Adult
5,http://www.fanworkrecs.com/,Adult
...,...,...
1562974,http://www.maxpreps.com/,Sports
1562975,http://www.myscore.com/,Sports
1562976,http://sportsillustrated.cnn.com/highschool,Sports
1562977,http://rss.cnn.com/rss/si_highschool?format=xml,Sports


In [9]:
# Work on a random subset of the rows; the pinned random_state keeps the draw repeatable.
sample = data.sample(n=10000, random_state=1)
sample.head(n=10)

Unnamed: 0,url,Type
1308349,http://yorkrite.com/ne/gcram/,Society
276825,http://www.alliedartistsofamerica.org/,Arts
1315267,http://www.msstate.edu/org/farmhouse/index.html,Society
392387,http://www.sunnytec.com.tw/,Business
1267730,http://www.kabissa.org,Society
1392788,http://www.newadvent.org/cathen/12134b.htm,Society
672881,http://www.gamespot.com/ps2/driving/arcticthun...,Games
1526125,http://www.studsquad.net/,Sports
1009901,http://www.spacedog.biz,Reference
330964,http://www.meditrans-japan.com/,Business


In [10]:
# Turn every raw address string into a yarl URL object so its parts can be read as attributes.
sample["url"] = sample["url"].map(URL)

In [11]:
# Derive one column per URL component of interest. Each callable receives the
# frame and reads the attribute from the parsed URL objects in its `url` column.
processed = sample.assign(
    host=lambda frame: frame["url"].apply(lambda parsed: parsed.host),
    path=lambda frame: frame["url"].apply(lambda parsed: parsed.path),
    name=lambda frame: frame["url"].apply(lambda parsed: parsed.name),
)
processed

Unnamed: 0,url,Type,host,path,name
1308349,http://yorkrite.com/ne/gcram/,Society,yorkrite.com,/ne/gcram/,
276825,http://www.alliedartistsofamerica.org/,Arts,www.alliedartistsofamerica.org,/,
1315267,http://www.msstate.edu/org/farmhouse/index.html,Society,www.msstate.edu,/org/farmhouse/index.html,index.html
392387,http://www.sunnytec.com.tw/,Business,www.sunnytec.com.tw,/,
1267730,http://www.kabissa.org,Society,www.kabissa.org,/,
...,...,...,...,...,...
169463,http://www.naxos.com/composerinfo/3303.htm,Arts,www.naxos.com,/composerinfo/3303.htm,3303.htm
1094461,http://members.tripod.com/arroweb1/,Science,members.tripod.com,/arroweb1/,
526843,http://www.petroskills.com/,Business,www.petroskills.com,/,
171473,http://www.gregbartholomew.com,Arts,www.gregbartholomew.com,/,


In [16]:
# Count how many sampled URLs each host contributes within each category.
group = processed.groupby(["Type", "host"]).agg(count=("url", "count"))
# Seeded preview so the displayed rows are the same on every run
# (matches the random_state used when drawing `sample`).
group.sample(10, random_state=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Type,host,Unnamed: 2_level_1
Society,www.haebea.org.uk,1
Computers,www.mccannas.com,1
Sports,kickette.com,1
Society,www.discountnewagebooks.com,1
Business,www.valeant.com,1
Recreation,drinkingbuddies.blogspot.com,1
Recreation,www.tyneteesmx5.web.st,1
Games,bf1942.boomtown.net,1
Recreation,illinois_scouter.tripod.com,1
Computers,stantchev.de,1


In [14]:
# Rank the (Type, host) pairs from most to least frequent, then turn the
# MultiIndex levels back into ordinary columns.
sorted_group = group.sort_values("count", ascending=False).reset_index()
sorted_group.head(n=10)

Unnamed: 0,Type,host,count
0,Arts,www.geocities.com,113
1,Society,www.newadvent.org,91
2,Arts,www.angelfire.com,69
3,Society,www.geocities.com,56
4,Recreation,www.geocities.com,47
5,Arts,us.imdb.com,32
6,Games,www.geocities.com,31
7,Society,www.angelfire.com,25
8,Arts,members.tripod.com,21
9,Computers,tools.ietf.org,21


In [27]:
# `sorted_group` is ordered by descending count, so taking the first five rows
# of every Type keeps (up to) that category's five most frequent hosts.
largest = sorted_group.groupby("Type").head(5).sort_values(by="Type")
# Seeded preview so the displayed rows are the same on every run.
largest.sample(10, random_state=1)

Unnamed: 0,Type,host,count
268,Reference,www.ithaca.edu,2
13,Science,www.geocities.com,15
51,Recreation,groups.yahoo.com,6
8,Arts,members.tripod.com,21
37,Health,www.geocities.com,7
48,Science,members.tripod.com,6
35,Sports,sportsillustrated.cnn.com,7
89,Home,www.geocities.com,4
70,Business,www.geocities.com,4
1339,News,www.scottolsonphotography.com,1


## Visualize

In [19]:
from dotenv import load_dotenv
import os

# Call load_dotenv() first, then read the Graphistry credentials from environment
# variables so they never appear in the notebook. Each value is None when its variable is unset.
load_dotenv()
USERNAME = os.getenv("GRAPHISTRY_USERNAME")
PASSWORD = os.getenv("GRAPHISTRY_PASSWORD")

In [20]:
import graphistry

# Authenticate against Graphistry with the credentials read from the environment.
# NOTE(review): `api=3` presumably selects the v3 (username/password) API — confirm against the graphistry docs.
graphistry.register(api=3, username=USERNAME, password=PASSWORD)

In [26]:
# Each (Type, host) row of `largest` becomes one edge of the graph: category -> host.
edges = largest[["Type", "host"]]
edges.tail(5)

Unnamed: 0,Type,host
35,Sports,sportsillustrated.cnn.com
11,Sports,www.geocities.com
45,Sports,www.clubwebsite.co.uk
50,Sports,www.angelfire.com
47,Sports,www.freewebs.com


In [31]:
def create_node_df(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Build a de-duplicated node table from one column of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains the column ``col_name``.
    col_name : str
        Name of the column whose values become the nodes.

    Returns
    -------
    pd.DataFrame
        A frame with two columns: ``node`` (the values taken from
        ``col_name``) and ``type`` (the literal string ``col_name`` on every
        row), with duplicate rows removed. Rows keep the index labels they
        had in ``df``.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``df``.
    """
    nodes = (
        df[[col_name]]
        # Rename before tagging: assigning ``type`` first would overwrite the
        # data itself whenever the source column is called "type".
        .rename(columns={col_name: "node"})
        .assign(type=col_name)
        .drop_duplicates()
    )
    return nodes

In [32]:
# Build one node table per kind of node (categories and hosts), then stack them
# into the single node list handed to the graph.
type_nodes, url_nodes = (
    create_node_df(largest, column) for column in ("Type", "host")
)
nodes = pd.concat([type_nodes, url_nodes])
nodes

Unnamed: 0,node,type
115,Adult,Type
12,Arts,Type
114,Business,Type
9,Computers,Type
41,Games,Type
127,Health,Type
18,Home,Type
111,Kids,Type
1343,News,Type
4,Recreation,Type


In [33]:
# Bind the edge list (Type -> host) and the node table (identified by its "node" column).
g = graphistry.edges(edges, "Type", "host").nodes(nodes, "node")

In [34]:
# Render the graph built above.
# NOTE(review): presumably this uploads the data to the registered Graphistry server — confirm.
g.plot()

In [28]:
# Distinct categories present in the edge list, in order of first appearance.
types = edges["Type"].unique().tolist()

In [38]:
from faker import Faker

# Seed Faker so the same colours are generated on every run; without a seed
# the palette (and therefore the plot) changes each time the cell executes.
Faker.seed(1)
fake = Faker()
# One generated hex colour per category.
colors = [fake.color() for _ in types]
node_color_mapping = dict(zip(types, colors))
node_color_mapping

{'Adult': '#fcd599',
 'Arts': '#64e06d',
 'Business': '#dde87f',
 'Computers': '#27b567',
 'Games': '#24bf7c',
 'Health': '#f5c4fc',
 'Home': '#efe78f',
 'Kids': '#c1b7ff',
 'News': '#cebc16',
 'Recreation': '#c52cd6',
 'Reference': '#35c467',
 'Science': '#f95271',
 'Shopping': '#dcf9a4',
 'Society': '#ce610e',
 'Sports': '#78e817'}

In [39]:
# Icon per node kind, keyed by the values stored in the node table's `type` column ("host" / "Type").
# NOTE(review): the icon names look like Font Awesome identifiers — confirm against the Graphistry docs.
node_icon_mapping = {"host": "link", "Type": "newspaper-o"}

In [40]:
# Rebuild the graph from the same edges and nodes, this time with visual encodings:
#   - point colour looked up from the `node` value (nodes without an entry fall back to silver)
#   - point icon looked up from the `type` value
g = (
    graphistry
    .edges(edges, "Type", "host")
    .nodes(nodes, "node")
    .encode_point_color(
        "node",
        categorical_mapping=node_color_mapping,
        default_mapping="silver",
    )
    .encode_point_icon("type", categorical_mapping=node_icon_mapping)
)

In [41]:
# Render the graph again, now with the colour and icon encodings applied.
g.plot()