In [7]:
!pip install cloudscraper



In [8]:
import pandas as pd

In [9]:
import cloudscraper
from bs4 import BeautifulSoup
import time
import random

# HTTP session for the listing crawl, configured with a desktop
# Chrome-on-Windows browser profile.
scraper = cloudscraper.create_scraper(
    browser={
        'browser': 'chrome',
        'platform': 'windows',
        'desktop': True
    }
)

# Listing pages are numbered 1..LAST_PAGE. Keeping the bound in one place stops
# the banner message and the loop range from drifting apart.
LAST_PAGE = 149

# One dict per story: 'link', 'urdu_title', 'eng_title'.
story_links = []
# hrefs already collected, so a story that shows up on two listing pages
# is only recorded (and later downloaded) once.
seen_links = set()

print(f"Starting link collection from {LAST_PAGE} pages...")

for page in range(1, LAST_PAGE + 1):
    url = f"https://www.urdupoint.com/kids/category/moral-stories-page{page}.html"
    print(f"[DEBUG] Fetching page {page}: {url}", end=" ")
    
    try:
        response = scraper.get(url, timeout=20)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            links_elements = soup.select('a.sharp_box')
            print(f"Found {len(links_elements)} links")
            
            for elem in links_elements:
                link = elem.get('href')
                # An anchor without an href cannot be downloaded later, and a
                # repeated href would duplicate the story in the dataset.
                if not link or link in seen_links:
                    continue
                seen_links.add(link)

                title_ur = elem.find('p', class_='title_ur')
                title_en = elem.find('p', class_='title_en')
                
                # Titles are optional in the markup; fall back to '' and drop
                # surrounding whitespace so they are clean when logged/saved.
                urdu_title = title_ur.text.strip() if title_ur else ''
                eng_title = title_en.text.strip() if title_en else ''
                
                story_links.append({
                    'link': link,
                    'urdu_title': urdu_title,
                    'eng_title': eng_title
                })
        else:
            print(f"Failed: HTTP {response.status_code}")
        
        # Random pause between listing pages.
        time.sleep(random.uniform(1, 2))
        
    except Exception as e:
        # Best-effort crawl: report the failure, back off, move to the next page.
        print(f"Error: {str(e)}")
        time.sleep(3)
        continue

print(f"Total story links collected: {len(story_links)}")
for s in story_links[:5]:  
    print(s)

Starting link collection from 149 pages...
[DEBUG] Fetching page 1: https://www.urdupoint.com/kids/category/moral-stories-page1.html Found 12 links
[DEBUG] Fetching page 2: https://www.urdupoint.com/kids/category/moral-stories-page2.html Found 12 links
[DEBUG] Fetching page 3: https://www.urdupoint.com/kids/category/moral-stories-page3.html Found 12 links
[DEBUG] Fetching page 4: https://www.urdupoint.com/kids/category/moral-stories-page4.html Found 12 links
[DEBUG] Fetching page 5: https://www.urdupoint.com/kids/category/moral-stories-page5.html Found 12 links
[DEBUG] Fetching page 6: https://www.urdupoint.com/kids/category/moral-stories-page6.html Found 12 links
[DEBUG] Fetching page 7: https://www.urdupoint.com/kids/category/moral-stories-page7.html Found 12 links
[DEBUG] Fetching page 8: https://www.urdupoint.com/kids/category/moral-stories-page8.html Found 12 links
[DEBUG] Fetching page 9: https://www.urdupoint.com/kids/category/moral-stories-page9.html Found 12 links
[DEBUG] Fetc

In [12]:
import cloudscraper
from bs4 import BeautifulSoup
import re
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor

# HTTP session used by fetch_story for every story download. The browser
# profile asks cloudscraper for a desktop Chrome-on-Windows configuration.
browser_profile = {'browser': 'chrome', 'platform': 'windows', 'desktop': True}
scraper = cloudscraper.create_scraper(browser=browser_profile)

def clean_and_tokenize(html_content):
    """Extract the story text from a page and mark it up with boundary tokens.

    The story container is looked up as ``div.txt_detail``, then ``div#ubody``,
    then ``#main_content``. Text whose direct parent is a script, style, ins or
    iframe element is dropped. ``<EOP>`` is inserted after text that sits
    directly inside a ``<p>`` and at every ``<br>``; sentence-ending punctuation
    is replaced by ``<EOS>``; the result is terminated with ``<EOT>``.

    Args:
        html_content: HTML markup of a single story page.

    Returns:
        The cleaned, token-annotated text ending in " <EOT>", or None when no
        container is found or the cleaned text is 100 characters or shorter.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    container = (
        soup.find('div', class_='txt_detail')
        or soup.find('div', {'id': 'ubody'})
        or soup.find(id="main_content")
    )

    if not container:
        return None

    raw_text = []
    for node in container.descendants:
        if not isinstance(node, str):
            # Element node. A <br> is a void element and so is never the parent
            # of a text node; the line break has to be detected on the tag
            # itself. Skip it when nothing has been collected yet or a
            # paragraph marker was just emitted.
            if node.name == 'br' and raw_text and raw_text[-1] != "<EOP>":
                raw_text.append("<EOP>")
            continue

        # Text node (bs4 string nodes subclass str).
        if node.parent.name in ['script', 'style', 'ins', 'iframe']:
            continue

        clean_chunk = node.strip()
        if clean_chunk:
            raw_text.append(clean_chunk)
            if node.parent.name == 'p':
                raw_text.append("<EOP>")

    full_content = " ".join(raw_text)

    # NOTE(review): presumably "2026" is the year in a date header that precedes
    # the story and "Facebook" starts a share footer - confirm against the page
    # markup. Keeps only what follows the last "2026" and precedes the first
    # "Facebook", so a story that itself contains either word is truncated.
    if "2026" in full_content:
        full_content = full_content.split("2026")[-1]
    
    if "Facebook" in full_content:
        full_content = full_content.split("Facebook")[0]

    # Sentence terminators, written as escapes to avoid bidi display issues:
    #   U+06D4 Arabic full stop, U+061F Arabic question mark, ASCII ! and ?,
    #   U+FF01 / U+FF1F fullwidth ! and ? (the only two besides U+06D4 that
    #   were matched before). The punctuation itself is replaced by the token.
    full_content = re.sub(r'[\u06D4\u061F!?\uFF01\uFF1F]', ' <EOS>', full_content)
    full_content = re.sub(r'\s+', ' ', full_content)
    # Collapse any run of adjacent paragraph markers; a single str.replace pass
    # leaves a duplicate behind when three or more markers are adjacent.
    full_content = re.sub(r'<EOP>(?: <EOP>)+', '<EOP>', full_content).strip()

    if len(full_content) > 100:
        return full_content + " <EOT>"
    return None


def fetch_story(args):
    """Download one story page and turn it into a dataset record.

    Sleeps a random 1-2 s before the first request, then makes up to three
    attempts. HTTP 429 waits 5 s times the attempt number and retries; an
    exception is logged, followed by a 3 s wait and a retry. HTTP 403, any
    other non-200 status, or unusable page content gives up on this story
    immediately (a 403 only abandons the current story; other queued stories
    are still attempted).

    Reads the module-level ``scraper`` and ``story_links`` (the latter only for
    the total shown in progress messages).

    Args:
        args: ``(i, item)`` pair, where ``i`` is the zero-based position of the
            story and ``item`` is a dict with 'link', 'urdu_title' and
            'eng_title'.

    Returns:
        A dict with keys 'urdu_title', 'content' and 'eng_id', or None when the
        story could not be fetched or parsed.
    """
    i, item = args
    time.sleep(random.uniform(1, 2))

    for attempt in range(3):
        try:
            response = scraper.get(item['link'], timeout=20)
            status = response.status_code

            if status == 429:
                # Linear back-off: 5 s, 10 s, 15 s.
                wait_time = (attempt + 1) * 5
                print(f"[{i+1}/{len(story_links)}] Rate limited, waiting {wait_time}s...")
                time.sleep(wait_time)
                continue

            if status == 403:
                print(f"[{i+1}/{len(story_links)}] Critical: 403 Forbidden. Stopping loop.")
                return None

            if status != 200:
                print(f"[{i+1}/{len(story_links)}] Failed: HTTP {response.status_code}")
                return None

            story_text = clean_and_tokenize(response.text)
            if not story_text:
                print(f"[{i+1}/{len(story_links)}] Warning: Parsing failed or content too short for {item['eng_title']}")
                return None

            record = {
                "urdu_title": item['urdu_title'],
                "content": story_text,
                "eng_id": item['eng_title']
            }
            print(f"[{i+1}/{len(story_links)}] Success: {item['eng_title']}")
            return record

        except Exception as e:
            # Any failure inside the attempt (request or parsing) is retried.
            print(f"[{i+1}/{len(story_links)}] Error: {str(e)}")
            time.sleep(3)

    print(f"[{i+1}/{len(story_links)}] Failed after 3 attempts: {item['eng_title']}")
    return None


# Download every collected story with three worker threads, keep the ones that
# produced a record, and save them as a single JSON array.
final_dataset = []

print(f"Starting Bypass Extraction for {len(story_links)} stories...")

with ThreadPoolExecutor(max_workers=3) as pool:
    # Consume the whole iterator inside the block so every result is gathered
    # before the pool shuts down.
    results = [outcome for outcome in pool.map(fetch_story, enumerate(story_links))]

# fetch_story returns None for stories it could not fetch or parse.
final_dataset = list(filter(lambda story: story is not None, results))

with open('urdu_stories_final.json', 'w', encoding='utf-8') as out_file:
    # ensure_ascii=False keeps the Urdu text readable in the file.
    out_file.write(json.dumps(final_dataset, ensure_ascii=False, indent=4))

print(f"\nTask Complete. Saved {len(final_dataset)} stories to urdu_stories_final.json.")

Starting Bypass Extraction for 1780 stories...
[3/1780] Success: Ghar Mein Mujrim
[2/1780] Success: Naya Azm
[1/1780] Success: Purisrar Boorha
[6/1780] Success: Imandari Ka Inaam
[4/1780] Success: Jannat Ka Rasta
[5/1780] Success: Bhooton Ka Naam
[9/1780] Success: Himmat
[8/1780] Success: Hiran Ki Laparwahi
[7/1780] Success: Reham Ka Sila
[12/1780] Success: Qalam Ki Khwahish
[10/1780] Success: Karo Meharbani Tum Ahle Zameen Par
[11/1780] Success: Panda Aur Smart Phone
[14/1780] Success: Nani Ka Saman
[15/1780] Success: Khali Pinjra
[13/1780] Success: Anmol Dosti
[16/1780] Success: Taqatwar Bewaqoof
[18/1780] Success: Azhdahe Ka Wada
[17/1780] Success: Lakarhare Ka Beta
[19/1780] Success: Yeh Bakra Kis Ka Hai?
[20/1780] Success: Naiki Ka Sila
[22/1780] Success: Pari Ki Mamta
[21/1780] Success: Lagan
[23/1780] Success: Ilm Ki Inteha Jahalat Hai
[25/1780] Success: Taimoor Ki Zindagi
[24/1780] Success: Super Hero
[26/1780] Success: Kawwa
[28/1780] Success: Inaam
[27/1780] Success: Choti Si

In [13]:
# Turn the saved JSON dataset into a plain-text corpus: one story per line,
# with the <EOS>/<EOP> markers padded by spaces on both sides.
with open('urdu_stories_final.json', 'r', encoding='utf-8') as src:
    stories = json.load(src)

with open('urdu_tokenizer_training.txt', 'w', encoding='utf-8') as corpus:
    for story in stories:
        line = story['content']
        line = line.replace("<EOS>", " <EOS> ")
        line = line.replace("<EOP>", " <EOP> ")
        corpus.write(line + "\n")