In [1]:
# https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html

import requests
import bs4
from bs4 import BeautifulSoup
import os


# Get archives

In [2]:
page_url = "https://kghosh.substack.com/archive"

page_sourced = requests.get(page_url).content 

html_content = BeautifulSoup(page_sourced, "html.parser")
# #main > div.archive-page.typography.use-theme-bg > div > div > div.portable-archive-list > div:nth-child(19) > div.post-preview-content > a.post-preview-title.newsletter
links = html_content.findAll('a', class_="pencraft")
print(len(links)) 
pages = [i.get('href') for i in links if (not (i.get('href') is None))]

pages = [ i for i in pages  if "https://kghosh.substack.com/p/" in i ]
pages = list(set([ i for i in pages  if not i.endswith("comments") ]))
len(pages), pages

24


(12,
 ['https://kghosh.substack.com/p/20230723',
  'https://kghosh.substack.com/p/20290830',
  'https://kghosh.substack.com/p/20230715',
  'https://kghosh.substack.com/p/20230701',
  'https://kghosh.substack.com/p/20230623',
  'https://kghosh.substack.com/p/20230827',
  'https://kghosh.substack.com/p/20230819',
  'https://kghosh.substack.com/p/20230730',
  'https://kghosh.substack.com/p/20230616',
  'https://kghosh.substack.com/p/20230708',
  'https://kghosh.substack.com/p/20230612',
  'https://kghosh.substack.com/p/20230810'])

In [3]:
for page in pages:
    name = page.split("/")[-1]

    if not os.path.exists(".cache/"+name):
        page_sourced = requests.get(page).content 
        html_content = BeautifulSoup(page_sourced, "html.parser")
        content = html_content.findAll('div', class_="body markup")

        with open(".cache/"+name, 'w') as f:
            f.write(str(content))
        print(name,"saved")
    else:
        print(name,"exists.")

20230723 exists.
20290830 saved
20230715 exists.
20230701 exists.
20230623 exists.
20230827 exists.
20230819 exists.
20230730 exists.
20230616 exists.
20230708 exists.
20230612 exists.
20230810 exists.


In [4]:
import glob
import pandas as pd


cached_pages = glob.glob(".cache/*")


In [5]:
print("There are",len(cached_pages),'pages saved.')
cached_pages.sort()
cached_pages[:3]

There are 59 pages saved.


['.cache/20220118', '.cache/20220128', '.cache/20220212']

In [6]:
URLs = []
for page in cached_pages:
    with open(page) as fp:
        html_content = BeautifulSoup(fp, 'html.parser')  
    content = html_content.findAll('a')
    content = [i.get('href') for i in content if (not (i.get('href') is None))]
    for link in content:
        URLs.append([page,link])

In [7]:
import hashlib
print(len(URLs))
df = pd.DataFrame(URLs)
df.columns = ["page","url"]
df = df.drop_duplicates(subset=["url"])
df["url"] = df.url.apply(lambda x: str(x).encode('utf-8'))

df["hash"] = df.url.apply(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest())
df.to_parquet("urls.parquet.gzip",  engine='pyarrow', compression='gzip')
df

1419


Unnamed: 0,page,url,hash
0,.cache/20220118,b'https://www.gov.wales/sites/default/files/pu...,3b92dc627d73b0d586fcff194b697c89
1,.cache/20220118,b'https://www.theguardian.com/world/2023/feb/1...,b3b0e79abac9ac8ba36e389c1d09621d
2,.cache/20220118,b'https://twitter.com/TrungTPhan/status/162664...,5d265f36c41b6d01ca52ab0f99ee8e95
3,.cache/20220118,b'https://www.futuregenerations.wales/about-us...,3938700a79bbde32972657f9eb7b1b69
4,.cache/20220118,b'https://www.bbc.com/future/article/20230215-...,32228bbf929d81d39a1808dd9b7c4493
...,...,...,...
1414,.cache/fully-homomorphic-encryption,b'https://web.yammer.com/main/org/mottmac.com/...,fc70437f7665556e8d27c5a9653763a6
1415,.cache/fully-homomorphic-encryption,b'https://www.kaggle.com/code/concretemlteam/t...,cde046296faf786f02951f73b9b25417
1416,.cache/fully-homomorphic-encryption,"b'https://substackcdn.com/image/fetch/f_auto,q...",f0a23dcd98c2fdf0a3fd523a04e7c255
1417,.cache/when-the-tide-of-ai-generated-texts,"b'https://substackcdn.com/image/fetch/f_auto,q...",d256bfc733bfe7fe3f55726967800605


In [8]:
pd.read_parquet("urls.parquet.gzip")

Unnamed: 0,page,url,hash
0,.cache/20220118,b'https://www.gov.wales/sites/default/files/pu...,3b92dc627d73b0d586fcff194b697c89
1,.cache/20220118,b'https://www.theguardian.com/world/2023/feb/1...,b3b0e79abac9ac8ba36e389c1d09621d
2,.cache/20220118,b'https://twitter.com/TrungTPhan/status/162664...,5d265f36c41b6d01ca52ab0f99ee8e95
3,.cache/20220118,b'https://www.futuregenerations.wales/about-us...,3938700a79bbde32972657f9eb7b1b69
4,.cache/20220118,b'https://www.bbc.com/future/article/20230215-...,32228bbf929d81d39a1808dd9b7c4493
...,...,...,...
1414,.cache/fully-homomorphic-encryption,b'https://web.yammer.com/main/org/mottmac.com/...,fc70437f7665556e8d27c5a9653763a6
1415,.cache/fully-homomorphic-encryption,b'https://www.kaggle.com/code/concretemlteam/t...,cde046296faf786f02951f73b9b25417
1416,.cache/fully-homomorphic-encryption,"b'https://substackcdn.com/image/fetch/f_auto,q...",f0a23dcd98c2fdf0a3fd523a04e7c255
1417,.cache/when-the-tide-of-ai-generated-texts,"b'https://substackcdn.com/image/fetch/f_auto,q...",d256bfc733bfe7fe3f55726967800605
