In [1]:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from bs4.element import Comment
from tqdm import tqdm
import urllib.request
import requests
import pandas as pd
import os
import pickle



First we make a list of the names and webpages of all the authors listed on Rekhta. We use selenium for this because Rekhta employs 'lazy loading', i.e. the webpage loads only the first 50 entries at a time and more are loaded once the user scrolls down.

In [None]:
base_url = 'https://www.rekhta.org'
poets = []

# Launch the browser
driver = webdriver.Chrome()

#Removing 'V' from the list of alphabet because all those names are repeated under 'W' anyway.
for letter in tqdm('ABCDEFGHIJKLMNOPQRSTUWXYZ'):
    url = f'{base_url}/poets?startswith={letter}'
    driver.get(url)

    # Scroll down to load more poets
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Adjust sleep time as needed
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the page source after scrolling
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    for poet_div in soup.find_all('div', class_='poetColumn'):
        name_div = poet_div.find('div', class_='poetNameDatePlace')
        name_a = name_div.find('a')
        name = name_a.text.strip()
        href = name_a['href']

        location_div = poet_div.find('div', class_='poetPlaceDate')
        location_a = location_div.find('a')
        # location = location_a.text.strip()

        poet = {
            'name': name,
            'href': href,
            'location': location_a.text.strip() if location_a else None
        }
        poets.append(poet)

# Close the browser
driver.quit()

There was an error managing chromedriver (error sending request for url (https://googlechromelabs.github.io/chrome-for-testing/last-known-good-versions-with-downloads.json): error trying to connect: dns error: No such host is known. (os error 11001)); using driver found in the cache
There was an error managing chrome; using browser found in the cache


Converting the saved data to a Pandas dataframe.

In [100]:
df = pd.DataFrame(poets)
df['auth'] = df['href'].str.split('/').str[-1]
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True)
authors=df['auth']
df.to_csv('poets.csv', index=False)
df

Unnamed: 0,index,name,href,location,auth
0,0,A G Josh,https://www.rekhta.org/poets/a-g-josh,Lahore,a-g-josh
1,1,A. D. Azhar,https://www.rekhta.org/poets/a-d-azhar,,a-d-azhar
2,2,A. Hameed,https://www.rekhta.org/authors/a-hameed,Lahore,a-hameed
3,3,A. Khayyam,https://www.rekhta.org/authors/a-khayyam-1,Karachi,a-khayyam-1
4,4,A. S Bukhari,https://www.rekhta.org/authors/a-s-bukhari,,a-s-bukhari
...,...,...,...,...,...
7863,7929,Zulfiqar Ahsan,https://www.rekhta.org/poets/zulfiqar-ahsan,Sargodha,zulfiqar-ahsan
7864,7930,Zulfiqar Naqvi,https://www.rekhta.org/poets/zulfiqar-naqvi,Jammu,zulfiqar-naqvi
7865,7931,Zulfiqar Rizvi,https://www.rekhta.org/poets/zulfiqar-rizvi,,zulfiqar-rizvi
7866,7932,Zulfiqar Zaki,https://www.rekhta.org/poets/zulfiqar-zaki,Sialkot,zulfiqar-zaki


Just as a sanity check, let's see how many authors' names begin with each alphabet.

In [48]:
for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
    print(letter, len(df[df['name'].str.startswith(letter)].index))

A 1375
B 203
C 39
D 124
E 63
F 229
G 148
H 317
I 264
J 213
K 384
L 46
M 1001
N 464
O 30
P 138
Q 141
R 447
S 1454
T 210
U 64
V 65
W 90
X 0
Y 73
Z 230


In [70]:
df[df['name'].str.startswith('P')]

Unnamed: 0,name,href,location,auth
4936,P P Srivastava Rind,https://www.rekhta.org/poets/p-p-srivastava-rind,Noida,p-p-srivastava-rind
4937,P. C. Kuttikrishnan,https://www.rekhta.org/authors/p-c-kuttikrishnan,,p-c-kuttikrishnan
4938,P. Keshav Dev,https://www.rekhta.org/authors/p-keshav-dev,,p-keshav-dev
4939,Pagal Adilabadi,https://www.rekhta.org/poets/pagal-adilabadi,Hyderabad,pagal-adilabadi
4940,Paigham Aafaqi,https://www.rekhta.org/authors/paigham-aafaqi,,paigham-aafaqi
...,...,...,...,...
5069,Purushottam Bhaskar Bhave,https://www.rekhta.org/poets/purushottam-bhask...,,purushottam-bhaskar-bhave
5070,Purushottam Shivaram Rege,https://www.rekhta.org/poets/purushottam-shiva...,Mumbai,purushottam-shivaram-rege
5071,Pushp Raj yadav,https://www.rekhta.org/poets/pushp-raj-yadav,Badayun,pushp-raj-yadav
5072,Pushpendra Pushp,https://www.rekhta.org/poets/pushpendra-pushp,Jalaun,pushpendra-pushp


Now we visit each author's ghazals page and save all the links. We need to use Selenium again for this part because only 50 ghazals are listed at a time and we need to scroll down to see the rest.

We don't need to scroll down the page when scraping the actual text of the ghazals, so we will use BeautifulSoup for that part, which is much faster.

In [21]:
##Parsing based on home page of authors
driver = webdriver.Chrome()
url_base='https://www.rekhta.org/poets/'
dataset = {}
for author in tqdm(df.auth):
    links = []
    url_home_page= url_base +author+ '/ghazals'
    driver.get(url_home_page)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    titles = parse_webpage_at_given_scroll(driver.page_source)
    for url in titles:
        if url.startswith('https://www.rekhta.org/ghazals/'):
            links.append(url)
    if len(links)>0:
        dataset[author] = links
    if i%100==0: #save the links to the file every 100 iterations (just in case the program crashes)
        with open('ghazal_links.pk', 'wb') as file:
            pickle.dump(dataset, file)
            driver.quit()
            driver = webdriver.Chrome()

with open('ghazal_links.pk', 'wb') as file:
    pickle.dump(dataset, file)
driver.quit()

100%|████████████████████████████████████████████████████████████████████████████| 1467/1467 [2:43:37<00:00,  6.69s/it]


I realized afterwards that some ghazals on Rekhta are in the "Hindi Ghazals" section rather than the Ghazals section where everything else is (this distinciton seems somewhat arbitrary). Luckily, this is a fairly small number of ghazals and I manually checked that there are no more 50 of these on any poet's webpage. So lazy loading isn't a problem, and we can just use BeautifulSoup for saving these links.

In [None]:
response = requests.get('https://www.rekhta.org/hindi-ghazals')
soup = BeautifulSoup(response.text, 'html.parser')

for poet_div in soup.find_all('div', class_='poetColumn'):
    name_div = poet_div.find('div', class_='poetNameDatePlace')
    name_a = name_div.find('a')
    href = name_a['href']
    auth = href.split('/')[-2]
    html = urllib.request.urlopen(href).read()
    titles = parse_webpage_at_given_scroll(html)
    if not auth in dataset.keys():
        dataset[auth] = []
    for url in titles:
        if url.startswith('https://www.rekhta.org/hindi-ghazals/'):
            dataset[auth].append(url)
            
with open('ghazal_links.pk', 'wb') as file:
    pickle.dump(dataset, file)

Some helper functions that we will use to save the text from the webpage containing a ghazal. The parse_ghazal function may need to be updated if the structure of the webpage changes.

In [23]:
#captures the different <p> tags as newlines
def text_with_newlines(elem):
    text = ''
    for e in elem.descendants:
        if isinstance(e, str):
            text += e
        elif e.name == 'br' or e.name == 'p':
            text += '\n'
    return text


def parse_ghazal(url):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    mydivs = soup.find_all("div", {"class": "pMC"})  # Find all instances of the pMC class
    
    # Check if there is at least one instance of pMC class
    if mydivs:        
        # Remove English translations present on the webpage
        for div in mydivs[1].find_all("div", {'class': 't'}):
            div.decompose()
        
        return text_with_newlines(mydivs[1])

In [2]:
#Loading the previously saved file (in case the kernel was restarted)
with open('ghazal_links.pk', 'rb') as file:
    dataset=pickle.load(file)
authors = list(dataset.keys())

In [4]:
total=0
for auth in dataset:
    total+=len(dataset[auth])
print(f'There are {len(dataset)} poets in this dataset and a total of {total} ghazals.')

There are 6224 poets in this dataset and a total of 69868 ghazals.


Now, actually scraping the ghazals' text. We make one file for every author.

In [24]:
lang='hi' #can be 'en', 'ur', or 'hi'
begin = 0 #Change 'begin' and 'end' if you want to only go over a smaller subset of authors at a time
end = 6224
failed = [] #Keep a track of the authors whose ghazals we couldn't scrape,
            #so we can go over them again later
    
if not os.path.exists('ghazals_'+lang):
    os.makedirs('ghazals_'+lang)
    
for auth in authors[begin:end]:
    try:
        ghazals = []
        print(f'scraping ghazals of {auth}')
        for url in tqdm(dataset[auth]):
            if lang!='en':
                url = url+'?lang='+lang
            ghazals.append(parse_ghazal(url).lstrip("\n"))
        with open(f'ghazals_{lang}/{auth}_{lang}.txt', 'w', encoding='utf-8') as file:
            file.write("\n\n".join(ghazals))
    except Exception as e:
        print(e)
        print("Failed:",auth)
        failed.append(auth)
with open(f'failed_{begin}-{end}.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(failed))

scraping ghazals of dard-faiz-khan


100%|██████████████████████████████████████████████████████████████████████████| 11/11 [00:16<00:00,  1.50s/it]


scraping ghazals of dard-sironji


100%|██████████████████████████████████████████████████████████████████████████| 11/11 [00:29<00:00,  2.66s/it]


scraping ghazals of darshan-dayal-parwaz


100%|██████████████████████████████████████████████████████████████████████████| 12/12 [00:27<00:00,  2.32s/it]
