In [1]:
import csv
import os
import re
import time
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Reached only if every import above resolved without raising.
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
from webdriver_manager.chrome import ChromeDriverManager

def setup_driver():
    """
    Set up Chrome WebDriver with appropriate options using webdriver-manager.

    Returns:
        The started ``webdriver.Chrome`` instance, or ``None`` when any setup
        step raises. Failures are printed rather than propagated, so callers
        must check the result for ``None``.
    """
    driver = None
    try:
        chrome_options = Options()
        # Add options for better compatibility
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")

        # Create service with automatically managed Chrome driver
        service = Service(ChromeDriverManager().install())

        # Create driver
        driver = webdriver.Chrome(service=service, options=chrome_options)
        # NOTE(review): this script runs in whatever document is loaded right
        # now; presumably it does not carry over to pages opened by a later
        # driver.get() - confirm whether that is acceptable.
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        driver.maximize_window()

        return driver
    except Exception as e:
        print(f"Error setting up driver: {e}")
        # If the failure happened after the browser was launched, shut it
        # down here: the caller only receives None and could never quit it.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Error closing driver after failed setup: {quit_error}")
        return None

print("Driver setup function created with webdriver-manager!")

Driver setup function created with webdriver-manager!


In [10]:
def _collect_links(elements, base_url, section, min_text_len=0):
    """Build link records from anchor elements, skipping ones that cannot be read."""
    records = []
    for element in elements:
        try:
            href = element.get_attribute("href")
            text = element.text.strip()
        except WebDriverException:
            # The element could not be read (for example it was detached
            # from the page while iterating); skip it and keep going.
            continue

        # Require a target and visible text longer than min_text_len characters.
        if href and len(text) > min_text_len:
            records.append({
                'link_text': text,
                # Convert relative URLs to absolute
                'url': urljoin(base_url, href),
                'section': section
            })
    return records


def scrape_navigation_links(driver, base_url, max_content_links=20):
    """
    Scrape navigation links, plus a capped sample of main-content links.

    Args:
        driver: WebDriver with the target page already loaded.
        base_url: Base used to resolve relative ``href`` values.
        max_content_links: How many main-content anchors to inspect
            (``None`` inspects all of them). Navigation anchors are not capped.

    Returns:
        List of dicts with keys ``link_text``, ``url`` and ``section``
        (``'navigation'`` or ``'content'``). On an unexpected error the error
        is printed and whatever was collected so far is returned.
    """
    navigation_links = []

    try:
        # Wait for page to load
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Find all navigation links
        nav_elements = driver.find_elements(By.CSS_SELECTOR, "nav a, .nav a, .navbar a, .menu a, header a")
        navigation_links.extend(_collect_links(nav_elements, base_url, 'navigation'))

        # Also get links from main content area; very short labels
        # (two characters or fewer) are ignored there.
        content_links = driver.find_elements(By.CSS_SELECTOR, "main a, .content a, .main-content a")
        navigation_links.extend(
            _collect_links(content_links[:max_content_links], base_url, 'content', min_text_len=2)
        )

    except Exception as e:
        print(f"Error scraping navigation: {e}")

    return navigation_links

print("Navigation scraping function fixed!")

Navigation scraping function fixed!


In [4]:
# Compiled once here rather than on every element inside the scraping loop.
# The TLD class is [A-Za-z]; the previous [A-Z|a-z] also accepted a literal '|'.
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_PHONE_PATTERN = re.compile(r'[\+]?[1-9]?[\d\s\-\(\)]{10,}')
_ADDRESS_KEYWORDS = ('address', 'street', 'road', 'avenue', 'colombo', 'sri lanka')


def _extract_contact_records(text, source_section):
    """Extract email, phone and address records from one block of visible text."""
    records = []
    lines = text.split('\n')

    # Look for email addresses
    for email in _EMAIL_PATTERN.findall(text):
        records.append({
            'type': 'email',
            'value': email,
            'source_section': source_section
        })

    # Look for phone numbers, one line at a time: the pattern allows
    # whitespace, so matching the whole text would fuse numbers that sit on
    # consecutive lines into a single bogus value.
    for line in lines:
        for phone in _PHONE_PATTERN.findall(line):
            # Clean phone number; candidates with fewer than 10 digits/plus
            # characters are discarded.
            clean_phone = re.sub(r'[^\d\+]', '', phone)
            if len(clean_phone) >= 10:
                records.append({
                    'type': 'phone',
                    'value': phone.strip(),
                    'source_section': source_section
                })

    # Look for addresses (lines with common address keywords)
    for line in lines:
        line = line.strip()
        lowered = line.lower()
        if len(line) > 10 and any(keyword in lowered for keyword in _ADDRESS_KEYWORDS):
            records.append({
                'type': 'address',
                'value': line,
                'source_section': source_section
            })

    return records


def scrape_contact_info(driver):
    """
    Scrape contact information from the website.

    Args:
        driver: WebDriver with the target page already loaded.

    Returns:
        List of dicts with keys ``type`` (``'email'``, ``'phone'`` or
        ``'address'``), ``value`` and ``source_section`` (the CSS selector the
        text was found under). The same value can appear more than once when
        several selectors match overlapping parts of the page.
    """
    contact_info = []

    try:
        # Look for contact information in various selectors. Names without a
        # '.' or '#' prefix are CSS type selectors, i.e. they match tags.
        contact_selectors = [
            "contact", "footer", ".contact", ".contact-info",
            ".contact-details", ".footer", "#contact", "#footer"
        ]

        for selector in contact_selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
                for element in elements:
                    text = element.text.strip()
                    if text:
                        contact_info.extend(_extract_contact_records(text, selector))
            except WebDriverException:
                # Best effort: a selector whose elements cannot be read is
                # skipped; records gathered so far are kept.
                continue

    except Exception as e:
        print(f"Error scraping contact info: {e}")

    return contact_info

print("Contact info scraping function created!")

Contact info scraping function created!


In [5]:
def scrape_departments_and_programs(driver):
    """
    Collect department / programme blocks and academic-looking links.

    For every element matched by a fixed list of CSS selectors this records
    a summary of the element (text truncated to 500 characters plus its link
    count) when the text is longer than 10 characters, and then one
    ``'academic_link'`` record for each contained anchor whose label is
    longer than 3 characters and mentions an academic keyword.

    Returns:
        List of dicts. Summary records carry ``section_type``, ``content``,
        ``has_links`` and ``link_count``; link records additionally carry
        ``url``. Errors are printed or skipped, never raised.
    """
    academic_terms = (
        'department', 'faculty', 'school', 'program', 'course',
        'degree', 'diploma', 'bachelor', 'master', 'phd', 'studies',
    )
    css_queries = (
        "department", "program", "course", "faculty", "school",
        ".department", ".program", ".course", ".faculty", ".school",
        "#departments", "#programs", "#courses", "#faculties",
    )
    found = []

    try:
        for css_query in css_queries:
            try:
                for block in driver.find_elements(By.CSS_SELECTOR, css_query):
                    block_text = block.text.strip()
                    anchors = block.find_elements(By.TAG_NAME, "a")
                    anchor_total = len(anchors)

                    # Summary record for the block itself
                    if len(block_text) > 10:
                        found.append({
                            'section_type': css_query,
                            'content': block_text[:500],
                            'has_links': anchor_total > 0,
                            'link_count': anchor_total
                        })

                    # One record per anchor that looks academic
                    for anchor in anchors:
                        label = anchor.text.strip()
                        target = anchor.get_attribute("href")

                        if not (label and target and len(label) > 3):
                            continue

                        lowered = label.lower()
                        if any(term in lowered for term in academic_terms):
                            found.append({
                                'section_type': 'academic_link',
                                'content': label,
                                'url': target,
                                'has_links': True,
                                'link_count': 1
                            })
            except Exception:
                # A failing selector abandons the rest of that selector only
                continue

    except Exception as e:
        print(f"Error scraping departments: {e}")

    return found

print("Departments scraping function created!")

Departments scraping function created!


In [6]:
def scrape_general_content(driver, url):
    """
    Scrape general content from the website.

    Args:
        driver: WebDriver with the target page already loaded.
        url: URL recorded in the output row (the page is not re-fetched here).

    Returns:
        A list holding one summary dict (``url``, ``title``,
        ``meta_description``, ``headings_count``, ``paragraphs_count``,
        ``images_count``, ``main_headings``, ``sample_content``), or an empty
        list if scraping failed; the error is printed rather than raised.
    """
    content_data = []

    try:
        # Get page title
        title = driver.title

        # Get meta description; a page without one is normal, so any
        # WebDriver failure here just leaves the description empty. Only
        # WebDriver errors are caught so Ctrl-C is no longer swallowed.
        meta_desc = ""
        try:
            meta_element = driver.find_element(By.CSS_SELECTOR, "meta[name='description']")
            # get_attribute gives None when the tag has no content attribute
            meta_desc = meta_element.get_attribute("content") or ""
        except WebDriverException:
            pass

        # Get main headings
        headings = []
        for tag in ['h1', 'h2', 'h3']:
            for element in driver.find_elements(By.TAG_NAME, tag):
                text = element.text.strip()
                if text:
                    headings.append({
                        'tag': tag,
                        'text': text
                    })

        # Get main content paragraphs
        paragraphs = []
        p_elements = driver.find_elements(By.TAG_NAME, "p")
        for p in p_elements[:10]:  # Limit to first 10 paragraphs
            text = p.text.strip()
            if len(text) > 20:
                paragraphs.append(text[:300])  # Limit paragraph length

        # Get images with alt text; only the count ends up in the output row
        images = []
        img_elements = driver.find_elements(By.TAG_NAME, "img")
        for img in img_elements[:10]:  # Limit to first 10 images
            alt_text = img.get_attribute("alt")
            src = img.get_attribute("src")
            if alt_text or src:
                images.append({
                    'alt_text': alt_text or 'No alt text',
                    'src': src
                })

        content_data.append({
            'url': url,
            'title': title,
            'meta_description': meta_desc,
            'headings_count': len(headings),
            'paragraphs_count': len(paragraphs),
            'images_count': len(images),
            'main_headings': [h['text'] for h in headings[:5]],  # Top 5 headings
            'sample_content': paragraphs[:3]  # First 3 paragraphs
        })

    except Exception as e:
        print(f"Error scraping general content: {e}")

    return content_data

print("General content scraping function created!")

General content scraping function created!


In [7]:
def save_to_csv(data, filename, output_dir="../scraped_data"):
    """
    Write ``data`` to ``output_dir/filename`` as a UTF-8 CSV without an index.

    The output directory is created first, even when there turns out to be
    nothing to write. Falsy ``data`` writes no file, and an existing file of
    that name is left untouched.

    Returns:
        True if a file was written; False if ``data`` was empty or an error
        occurred (errors are printed, not raised).
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        target_path = os.path.join(output_dir, filename)

        # Guard clause: nothing to write
        if not data:
            print(f"No data to save for {filename}")
            return False

        pd.DataFrame(data).to_csv(target_path, index=False, encoding='utf-8')
        print(f"Saved {len(data)} records to {target_path}")
        return True

    except Exception as e:
        print(f"Error saving to CSV {filename}: {e}")
        return False

def save_all_data(navigation_data, contact_data, departments_data, content_data):
    """
    Write each of the four scraped datasets to its own CSV file.

    Returns:
        Dict mapping ``'navigation'``, ``'contact'``, ``'departments'`` and
        ``'content'`` to the boolean returned by ``save_to_csv`` for that
        dataset.
    """
    # (result key, records, file name) in the order the files are written
    outputs = (
        ('navigation', navigation_data, "navigation_links.csv"),
        ('contact', contact_data, "contact_information.csv"),
        ('departments', departments_data, "departments_programs.csv"),
        ('content', content_data, "general_content.csv"),
    )

    return {key: save_to_csv(records, csv_name) for key, records, csv_name in outputs}

print("CSV saving functions created!")

CSV saving functions created!


In [8]:
def main_scrape(url="https://www.sab.ac.lk/"):
    """
    Run one end-to-end scrape of ``url`` and save the results as CSV files.

    Starts a browser, loads the page, runs the four scrapers in turn, hands
    their results to ``save_all_data`` and prints progress plus a summary.
    The browser is always closed on the way out.

    Returns:
        True when the whole sequence ran without raising (even if some
        scrapers found nothing); False if the driver could not be created or
        an exception occurred.
    """
    rule = "=" * 50
    print(f"Starting to scrape: {url}")
    print(rule)

    browser = None

    try:
        browser = setup_driver()
        if browser is None:
            print("Failed to set up Chrome driver!")
            return False
        print("✓ Chrome driver set up successfully")

        browser.get(url)
        print(f"✓ Navigated to {url}")

        # Fixed pause after navigation before any scraping starts
        time.sleep(3)

        print("\n📊 Scraping navigation links...")
        nav_items = scrape_navigation_links(browser, url)
        print(f"   Found {len(nav_items)} navigation items")

        print("\n📞 Scraping contact information...")
        contact_items = scrape_contact_info(browser)
        print(f"   Found {len(contact_items)} contact items")

        print("\n🏫 Scraping departments and programs...")
        dept_items = scrape_departments_and_programs(browser)
        print(f"   Found {len(dept_items)} department items")

        print("\n📄 Scraping general content...")
        content_items = scrape_general_content(browser, url)
        print(f"   Found {len(content_items)} content items")

        print("\n💾 Saving data to CSV files...")
        saved_flags = save_all_data(nav_items, contact_items, dept_items, content_items)

        print("\n✅ Scraping completed!")
        print(rule)

        print("\n📋 SUMMARY:")
        summary_rows = (
            ("Navigation links", nav_items),
            ("Contact items", contact_items),
            ("Department items", dept_items),
            ("Content items", content_items),
        )
        for label, items in summary_rows:
            print(f"{label}: {len(items)}")
        # Each flag is a bool, so the sum is the number of files written
        print(f"CSV files saved: {sum(saved_flags.values())}/4")

        return True

    except Exception as e:
        print(f"❌ Error during scraping: {e}")
        return False

    finally:
        if browser:
            browser.quit()
            print("\n🔒 Browser closed")

print("Main scraping function created!")

Main scraping function created!


In [9]:
# Kick off one full scrape of the default target and report the outcome
if __name__ == "__main__":
    print("🚀 Starting SAB Website Scraping...", "📍 Target: https://www.sab.ac.lk/", sep="\n")
    print()

    # main_scrape returns True/False; keep it for the message below
    success = main_scrape()

    if not success:
        print("\n❌ Scraping failed!")
        print("💡 Make sure Chrome browser is installed and try again")
    else:
        print("\n🎉 Scraping completed successfully!")
        print("📁 Check the '../scraped_data/' folder for CSV files")

🚀 Starting SAB Website Scraping...
📍 Target: https://www.sab.ac.lk/

Starting to scrape: https://www.sab.ac.lk/
✓ Chrome driver set up successfully
✓ Navigated to https://www.sab.ac.lk/

📊 Scraping navigation links...
Error scraping navigation: 'WebDriverWait' object has no attribute 'wait'
   Found 0 navigation items

📞 Scraping contact information...
   Found 25 contact items

🏫 Scraping departments and programs...
   Found 0 department items

📄 Scraping general content...
   Found 1 content items

💾 Saving data to CSV files...
No data to save for navigation_links.csv
Saved 25 records to ../scraped_data\contact_information.csv
No data to save for departments_programs.csv
Saved 1 records to ../scraped_data\general_content.csv

✅ Scraping completed!

📋 SUMMARY:
Navigation links: 0
Contact items: 25
Department items: 0
Content items: 1
CSV files saved: 2/4

🔒 Browser closed

🎉 Scraping completed successfully!
📁 Check the '../scraped_data/' folder for CSV files


In [11]:
# Show a short preview of whatever CSV files are currently on disk.
# This reads the files, not the in-memory results of the last run.
print("📊 DISPLAYING SCRAPED DATA")
print("=" * 40)

# (file name, heading) pairs, previewed in this order
csv_files = [
    ("contact_information.csv", "Contact Information"),
    ("general_content.csv", "General Content"),
    ("navigation_links.csv", "Navigation Links"),
    ("departments_programs.csv", "Departments & Programs"),
]

for filename, title in csv_files:
    filepath = os.path.join("../scraped_data", filename)

    # Missing file: report it and move on to the next one
    if not os.path.exists(filepath):
        print(f"\n❌ {title}: No data file found")
        continue

    print(f"\n📄 {title}:")
    print("-" * 30)
    try:
        df = pd.read_csv(filepath)
        print(f"Records found: {len(df)}")
        if len(df):
            print("Sample data:")
            print(df.head(3).to_string(index=False))
    except Exception as e:
        print(f"Error reading {filename}: {e}")

print("\n✅ Data display completed!")

📊 DISPLAYING SCRAPED DATA

📄 Contact Information:
------------------------------
Records found: 25
Sample data:
 type          value source_section
email info@sab.ac.lk         footer
phone +94-45-2280014         footer
phone +94-45-2280087         footer

📄 General Content:
------------------------------
Records found: 1
Sample data:
                   url       title                                                                                     meta_description  headings_count  paragraphs_count  images_count                                                                  main_headings sample_content
https://www.sab.ac.lk/ Home | SUSL The Sabaragamuwa University of Sri Lanka is a public university in Belihuloya, Balangoda, Sri Lanka.              11                 0            10 ['Upcoming Events', 'Notice', 'Life At SUSL', 'Our Faculties', 'CONTACT INFO']             []

📄 Navigation Links:
------------------------------
Records found: 654
Sample data:
            text       

In [12]:
# Run scraping again to get navigation links with the fixed function
# The True/False result of main_scrape is stored in `success` but is not
# inspected in this cell.
print("🔄 Running scraping again with fixed navigation function...")
success = main_scrape()

🔄 Running scraping again with fixed navigation function...
Starting to scrape: https://www.sab.ac.lk/
✓ Chrome driver set up successfully
✓ Navigated to https://www.sab.ac.lk/

📊 Scraping navigation links...
   Found 143 navigation items

📞 Scraping contact information...
   Found 25 contact items

🏫 Scraping departments and programs...
   Found 0 department items

📄 Scraping general content...
   Found 1 content items

💾 Saving data to CSV files...
Saved 143 records to ../scraped_data\navigation_links.csv
Saved 25 records to ../scraped_data\contact_information.csv
No data to save for departments_programs.csv
Saved 1 records to ../scraped_data\general_content.csv

✅ Scraping completed!

📋 SUMMARY:
Navigation links: 143
Contact items: 25
Department items: 0
Content items: 1
CSV files saved: 3/4

🔒 Browser closed


In [13]:
# Final report: record counts and one sample row per CSV file on disk
print("🎉 FINAL SCRAPING SUMMARY")
print("=" * 50)
print("🌐 Website: https://www.sab.ac.lk/")
# The timestamp is taken when this cell runs
print(f"🕒 Scraped on: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()

# (file name, heading) pairs, reported in this order
csv_files = [
    ("navigation_links.csv", "🔗 Navigation Links"),
    ("contact_information.csv", "📞 Contact Information"),
    ("general_content.csv", "📄 General Content"),
    ("departments_programs.csv", "🏫 Departments & Programs"),
]

total_records = 0

for filename, title in csv_files:
    filepath = os.path.join("../scraped_data", filename)

    if not os.path.exists(filepath):
        print(f"{title}: No data")
    else:
        df = pd.read_csv(filepath)
        print(f"{title}: {len(df)} records")
        total_records += len(df)

        if len(df):
            print(f"   Sample: {df.iloc[0].to_dict()}")

    # Blank line after every entry, present or missing
    print()

print(f"📊 Total Records Scraped: {total_records}")
print("📁 Data saved in: ../scraped_data/ folder")
print()
print("✅ Webscraping completed successfully!")
print("🔍 You can now analyze the CSV files for further insights.")

🎉 FINAL SCRAPING SUMMARY
🌐 Website: https://www.sab.ac.lk/
🕒 Scraped on: 2025-08-17 10:50:45

🔗 Navigation Links: 143 records
   Sample: {'link_text': 'Online Teaching', 'url': 'https://www.sab.ac.lk/online-video-tutorials', 'section': 'navigation'}

📞 Contact Information: 25 records
   Sample: {'type': 'email', 'value': 'info@sab.ac.lk', 'source_section': 'footer'}

📄 General Content: 1 records
   Sample: {'url': 'https://www.sab.ac.lk/', 'title': 'Home | SUSL', 'meta_description': 'The Sabaragamuwa University of Sri Lanka is a public university in Belihuloya, Balangoda, Sri Lanka.', 'headings_count': 11, 'paragraphs_count': 0, 'images_count': 10, 'main_headings': "['Upcoming Events', 'Notice', 'Life At SUSL', 'Our Faculties', 'CONTACT INFO']", 'sample_content': '[]'}

🏫 Departments & Programs: No data

📊 Total Records Scraped: 169
📁 Data saved in: ../scraped_data/ folder

✅ Webscraping completed successfully!
🔍 You can now analyze the CSV files for further insights.
