In [1]:
# https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html

import requests
import bs4
from bs4 import BeautifulSoup
import os


# Get archives

In [2]:
page_url = "https://substack.kghosh.me/archive"

page_sourced = requests.get(page_url).content 

html_content = BeautifulSoup(page_sourced, "html.parser")
# #main > div.archive-page.typography.use-theme-bg > div > div > div.portable-archive-list > div:nth-child(19) > div.post-preview-content > a.post-preview-title.newsletter
links = html_content.findAll('a', class_="pencraft")
print(len(links)) 
pages = [i.get('href') for i in links if (not (i.get('href') is None))]

pages = [ i for i in pages  if "https://substack.kghosh.me/p/" in i ]
pages = list(set([ i for i in pages  if not i.endswith("comments") ]))
len(pages), pages

24


(12,
 ['https://substack.kghosh.me/p/20240121',
  'https://substack.kghosh.me/p/20230114',
  'https://substack.kghosh.me/p/20240128',
  'https://substack.kghosh.me/p/20231126',
  'https://substack.kghosh.me/p/20221217',
  'https://substack.kghosh.me/p/20231209',
  'https://substack.kghosh.me/p/some-guesses-about-2024',
  'https://substack.kghosh.me/p/20231230',
  'https://substack.kghosh.me/p/20240210',
  'https://substack.kghosh.me/p/20231203',
  'https://substack.kghosh.me/p/20240204',
  'https://substack.kghosh.me/p/20230109'])

In [3]:
for page in pages:
    name = page.split("/")[-1]

    if not os.path.exists(".cache/"+name):
        page_sourced = requests.get(page).content 
        html_content = BeautifulSoup(page_sourced, "html.parser")
        content = html_content.findAll('div', class_="body markup")

        with open(".cache/"+name, 'w') as f:
            f.write(str(content))
        print(name,"saved")
    else:
        print(name,"exists.")

20240121 exists.
20230114 exists.
20240128 exists.
20231126 exists.
20221217 exists.
20231209 exists.
some-guesses-about-2024 exists.
20231230 exists.
20240210 saved
20231203 exists.
20240204 exists.
20230109 exists.


In [4]:
import glob
import pandas as pd


cached_pages = glob.glob(".cache/*")


In [5]:
print("There are",len(cached_pages),'pages saved.')
cached_pages.sort()
cached_pages[:3]

There are 1375 pages saved.


['.cache/001b195bbddec8fb0193b668a70aa883',
 '.cache/002ee663c73c7add6ce2cabe29e4ae02',
 '.cache/00931247998b35b40d513cfa65a11571']

In [6]:
URLs = []
for page in cached_pages:
    with open(page) as fp:
        html_content = BeautifulSoup(fp, 'html.parser')  
    content = html_content.findAll('a')
    content = [i.get('href') for i in content if (not (i.get('href') is None))]
    for link in content:
        URLs.append([page,link])

In [7]:
import hashlib
print(len(URLs))
df = pd.DataFrame(URLs)
df.columns = ["page","url"]
df = df.drop_duplicates(subset=["url"])
df["url"] = df.url.apply(lambda x: str(x).encode('utf-8'))

df["hash"] = df.url.apply(lambda x: hashlib.md5(str(x).encode('utf-8')).hexdigest())
df.to_parquet("urls.parquet.gzip",  engine='pyarrow', compression='gzip')
df

2032


Unnamed: 0,page,url,hash
0,.cache/20220118,b'https://www.gov.wales/sites/default/files/pu...,3b92dc627d73b0d586fcff194b697c89
1,.cache/20220118,b'https://www.theguardian.com/world/2023/feb/1...,b3b0e79abac9ac8ba36e389c1d09621d
2,.cache/20220118,b'https://twitter.com/TrungTPhan/status/162664...,5d265f36c41b6d01ca52ab0f99ee8e95
3,.cache/20220118,b'https://www.futuregenerations.wales/about-us...,3938700a79bbde32972657f9eb7b1b69
4,.cache/20220118,b'https://www.bbc.com/future/article/20230215-...,32228bbf929d81d39a1808dd9b7c4493
...,...,...,...
2027,.cache/fully-homomorphic-encryption,"b'https://substackcdn.com/image/fetch/f_auto,q...",f0a23dcd98c2fdf0a3fd523a04e7c255
2028,.cache/some-guesses-about-2024,"b'https://substackcdn.com/image/fetch/f_auto,q...",b82a7509d34dc6974ca9036c0d80ba7d
2029,.cache/some-guesses-about-2024,b'https://www2.deloitte.com/us/en/insights/ind...,c1f03407e92d82ff6f8e3bc04f8996c2
2030,.cache/when-the-tide-of-ai-generated-texts,"b'https://substackcdn.com/image/fetch/f_auto,q...",d256bfc733bfe7fe3f55726967800605


In [8]:
pd.read_parquet("urls.parquet.gzip")

Unnamed: 0,page,url,hash
0,.cache/20220118,b'https://www.gov.wales/sites/default/files/pu...,3b92dc627d73b0d586fcff194b697c89
1,.cache/20220118,b'https://www.theguardian.com/world/2023/feb/1...,b3b0e79abac9ac8ba36e389c1d09621d
2,.cache/20220118,b'https://twitter.com/TrungTPhan/status/162664...,5d265f36c41b6d01ca52ab0f99ee8e95
3,.cache/20220118,b'https://www.futuregenerations.wales/about-us...,3938700a79bbde32972657f9eb7b1b69
4,.cache/20220118,b'https://www.bbc.com/future/article/20230215-...,32228bbf929d81d39a1808dd9b7c4493
...,...,...,...
2027,.cache/fully-homomorphic-encryption,"b'https://substackcdn.com/image/fetch/f_auto,q...",f0a23dcd98c2fdf0a3fd523a04e7c255
2028,.cache/some-guesses-about-2024,"b'https://substackcdn.com/image/fetch/f_auto,q...",b82a7509d34dc6974ca9036c0d80ba7d
2029,.cache/some-guesses-about-2024,b'https://www2.deloitte.com/us/en/insights/ind...,c1f03407e92d82ff6f8e3bc04f8996c2
2030,.cache/when-the-tide-of-ai-generated-texts,"b'https://substackcdn.com/image/fetch/f_auto,q...",d256bfc733bfe7fe3f55726967800605
