In [2]:
!pip install cloudscraper

Collecting cloudscraper
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Downloading cloudscraper-1.2.71-py2.py3-none-any.whl (99 kB)
Installing collected packages: cloudscraper
Successfully installed cloudscraper-1.2.71


In [4]:
import pandas as pd

In [5]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

# --- Crawl configuration -----------------------------------------------------
# Inclusive range of listing pages to visit and the URL pattern they follow.
FIRST_PAGE = 1
LAST_PAGE = 24
LISTING_URL = "https://www.urdupoint.com/kids/category/moral-stories-page{page}.html"
# Seconds to wait for the story cards to become visible on each listing page.
CARD_WAIT_SECONDS = 10

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
# NOTE(review): presumably set to make the automated browser less detectable — confirm.
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_argument("--start-maximized")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# One dict per story card: {"link", "urdu_title", "eng_title"}.
# Later cells iterate over this list to download each story.
story_links = []

try:
    # The wait object does not depend on the page, so build it once.
    wait = WebDriverWait(driver, CARD_WAIT_SECONDS)

    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        url = LISTING_URL.format(page=page)
        print(f"[DEBUG] Opening page {page}: {url}")

        try:
            driver.get(url)
            links_elements = wait.until(
                EC.visibility_of_all_elements_located((By.CSS_SELECTOR, "a.sharp_box"))
            )
        except Exception as e:
            # Best-effort crawl: a page that fails to load or times out is skipped.
            print(f"[DEBUG] Error on page {page}: {e}")
            continue

        print(f"[DEBUG] Found {len(links_elements)} links on page {page}")

        for elem in links_elements:
            # Handle each card on its own so that one card with a missing title
            # does not discard the remaining cards of the same page.
            try:
                record = {
                    "link": elem.get_attribute("href"),
                    "urdu_title": elem.find_element(By.CSS_SELECTOR, "p.title_ur").text,
                    "eng_title": elem.find_element(By.CSS_SELECTOR, "p.title_en").text,
                }
            except Exception as e:
                print(f"[DEBUG] Skipping a card on page {page}: {e}")
                continue
            story_links.append(record)

finally:
    # Always release the browser, even if the crawl is interrupted.
    driver.quit()

print(f"\nTotal story links collected: {len(story_links)}")
for s in story_links[:5]:
    print(s)


[DEBUG] Opening page 1: https://www.urdupoint.com/kids/category/moral-stories-page1.html
[DEBUG] Found 12 links on page 1
[DEBUG] Opening page 2: https://www.urdupoint.com/kids/category/moral-stories-page2.html
[DEBUG] Found 12 links on page 2
[DEBUG] Opening page 3: https://www.urdupoint.com/kids/category/moral-stories-page3.html
[DEBUG] Found 12 links on page 3
[DEBUG] Opening page 4: https://www.urdupoint.com/kids/category/moral-stories-page4.html
[DEBUG] Found 12 links on page 4
[DEBUG] Opening page 5: https://www.urdupoint.com/kids/category/moral-stories-page5.html
[DEBUG] Found 12 links on page 5
[DEBUG] Opening page 6: https://www.urdupoint.com/kids/category/moral-stories-page6.html
[DEBUG] Found 12 links on page 6
[DEBUG] Opening page 7: https://www.urdupoint.com/kids/category/moral-stories-page7.html
[DEBUG] Found 12 links on page 7
[DEBUG] Opening page 8: https://www.urdupoint.com/kids/category/moral-stories-page8.html
[DEBUG] Found 12 links on page 8
[DEBUG] Opening page 9: 

In [24]:
import cloudscraper
from bs4 import BeautifulSoup
import re
import json
import time
import random

# Shared cloudscraper session used to download the individual story pages.
# The browser profile handed to cloudscraper is desktop Chrome on Windows.
scraper = cloudscraper.create_scraper(
    browser=dict(browser='chrome', platform='windows', desktop=True)
)

def clean_and_tokenize(html_content, min_chars=50):
    """Extract a story's text from a page and annotate it with boundary markers.

    The story is taken from the first ``<div>`` whose class contains
    ``txt_detail``. Paragraph breaks become ``<EOP>``, sentence-ending
    punctuation is followed by ``<EOS>``, and the whole text is terminated
    with ``<EOT>``.

    Args:
        html_content: HTML source of a story page.
        min_chars: The cleaned text must be longer than this many characters
            to be accepted (default 50).

    Returns:
        The annotated story text ending in ``" <EOT>"``, or ``None`` when the
        story container is missing or the cleaned text is too short.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    container = soup.find('div', class_=lambda x: x and 'txt_detail' in x)
    if not container:
        return None

    # Non-text tags are removed regardless of class. Combining them with the
    # class filter in one find_all() call would only match tags that ALSO carry
    # one of the classes, leaving ordinary <script>/<style>/<iframe>/<ins> behind.
    for noise in container.find_all(['script', 'style', 'iframe', 'ins']):
        noise.decompose()
    # Wrapper divs are removed only when they carry one of these classes.
    for noise in container.find_all('div', class_=['txt_banner', 'clear']):
        noise.decompose()

    # A <br> carries no text, so it can be swapped for the paragraph marker.
    for br in container.find_all('br'):
        br.replace_with(" <EOP> ")
    # A <p> holds text: append the marker instead of replacing the tag, which
    # would throw the paragraph's content away.
    for para in container.find_all('p'):
        para.append(" <EOP> ")

    text = container.get_text(separator=" ")

    # Sentence terminators: U+06D4 (Urdu full stop), U+061F (Arabic question
    # mark), ASCII '!' and '?', plus the fullwidth U+FF01 / U+FF1F forms.
    text = re.sub(r'([\u06D4\u061F!?\uFF01\uFF1F])', r'\1 <EOS>', text)

    text = re.sub(r'\s+', ' ', text)
    # Collapse any run of consecutive paragraph markers (not just pairs) to one.
    text = re.sub(r'<EOP>(?: <EOP>)+', '<EOP>', text).strip()

    if text and len(text) > min_chars:
        return text + " <EOT>"
    return None

# Download every collected story page, tokenize it, and save the results as JSON.
final_dataset = []
# Progress is reported against the number of links actually collected rather
# than a hard-coded count.
total_stories = len(story_links)

print(f"Starting Bypass Extraction for {total_stories} stories...")

for position, item in enumerate(story_links, start=1):
    progress = f"[{position}/{total_stories}]"
    try:
        response = scraper.get(item['link'], timeout=20)

        if response.status_code == 200:
            tokenized_content = clean_and_tokenize(response.text)
            if tokenized_content:
                final_dataset.append({
                    "urdu_title": item['urdu_title'],
                    "content": tokenized_content,
                    "eng_id": item['eng_title']
                })
                print(f"{progress} Success: {item['eng_title']}")
            else:
                print(f"{progress} Warning: Failed to parse content for {item['eng_title']}")

        elif response.status_code == 403:
            # Stop at the first 403 instead of continuing to send requests.
            print(f"{progress} Critical: Still Blocked (403). The site might have flagged your IP temporarily.")
            break
        else:
            print(f"{progress} Failed: HTTP {response.status_code}")

        # Randomized pause between requests.
        time.sleep(random.uniform(2, 5))

    except Exception as e:
        # Best-effort: report the failure, pause, and move on to the next story.
        print(f"{progress} Error: {str(e)}")
        time.sleep(5)

# Whatever was collected (possibly a partial set after a 403) is written out.
with open('urdu_stories_final.json', 'w', encoding='utf-8') as f:
    json.dump(final_dataset, f, ensure_ascii=False, indent=4)

print(f"\nTask Complete. Saved {len(final_dataset)} stories.")

Starting Bypass Extraction for 288 stories...
[1/288] Success: Purisrar Boorha
[2/288] Success: Naya Azm
[3/288] Success: Ghar Mein Mujrim
[4/288] Success: Jannat Ka Rasta
[5/288] Success: Bhooton Ka Naam
[6/288] Success: Imandari Ka Inaam
[7/288] Success: Reham Ka Sila
[8/288] Success: Hiran Ki Laparwahi
[9/288] Success: Himmat
[10/288] Success: Karo Meharbani Tum Ahle Zameen Par
[11/288] Success: Panda Aur Smart Phone
[12/288] Success: Qalam Ki Khwahish
[13/288] Success: Anmol Dosti
[14/288] Success: Nani Ka Saman
[15/288] Success: Khali Pinjra
[16/288] Success: Taqatwar Bewaqoof
[17/288] Success: Lakarhare Ka Beta
[18/288] Success: Azhdahe Ka Wada
[19/288] Success: Yeh Bakra Kis Ka Hai?
[20/288] Success: Naiki Ka Sila
[21/288] Success: Lagan
[22/288] Success: Pari Ki Mamta
[23/288] Success: Ilm Ki Inteha Jahalat Hai
[24/288] Success: Super Hero
[25/288] Success: Taimoor Ki Zindagi
[26/288] Success: Kawwa
[27/288] Success: Choti Si Khabar
[28/288] Success: Inaam
[29/288] Success: Nai

In [None]:
def remove_author_name(text):
    """Drop the leading segment of ``text`` up to and including its first ``<EOP>``.

    Everything from the start of the string through the first ``<EOP>`` marker
    (plus any whitespace right after it) is removed, provided the marker occurs
    before the first newline. Text without such a marker is left intact. The
    result is stripped of surrounding whitespace in either case.
    """
    header = re.match(r'^.*?<EOP>\s*', text)
    remainder = text[header.end():] if header else text
    return remainder.strip()
# Build the tokenizer-training corpus: one story per line, with the leading
# segment (up to the first <EOP>) stripped by remove_author_name.
with open('urdu_stories_final.json', 'r', encoding='utf-8') as f:
    stories = json.load(f)

with open('urdu_tokenizer_training.txt', 'w', encoding='utf-8') as f:
    for item in stories:
        content = remove_author_name(item['content'])
        # Pad the markers so they are always whitespace-delimited ...
        content = content.replace("<EOS>", " <EOS> ").replace("<EOP>", " <EOP> ")
        # ... then collapse the doubled spaces the padding introduces around
        # markers that were already space-separated.
        content = " ".join(content.split())
        f.write(content + "\n")