In [1]:
# Web Scraping SAB University Website
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import time
from urllib.parse import urljoin, urlparse
import os

# Create a session for better performance
session = requests.Session()

# Set headers to mimic a real browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
session.headers.update(headers)

print("Libraries imported successfully!")
print("Session created with browser headers")

Libraries imported successfully!
Session created with browser headers


In [2]:
# Function to scrape main navigation and links
def scrape_main_navigation(url):
    """
    Scrape the main navigation links and important sections from the homepage
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract main navigation links
        nav_links = []
        
        # Find navigation elements (adjust selectors based on actual HTML structure)
        nav_elements = soup.find_all(['nav', 'ul', 'div'], class_=lambda x: x and ('nav' in x.lower() or 'menu' in x.lower()))
        
        for nav in nav_elements:
            links = nav.find_all('a', href=True)
            for link in links:
                href = link.get('href')
                text = link.get_text(strip=True)
                if href and text:
                    # Convert relative URLs to absolute URLs
                    full_url = urljoin(url, href)
                    nav_links.append({
                        'text': text,
                        'url': full_url,
                        'section': 'Navigation'
                    })
        
        return nav_links
    
    except Exception as e:
        print(f"Error scraping navigation: {e}")
        return []

# Test the function
base_url = "https://www.sab.ac.lk/"
print("Scraping main navigation...")
navigation_data = scrape_main_navigation(base_url)
print(f"Found {len(navigation_data)} navigation links")

Scraping main navigation...
Found 654 navigation links


In [3]:
# Function to scrape faculty information
def scrape_faculties(url):
    """
    Scrape faculty information from the main page
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        faculties = []
        
        # Look for faculty links and information
        # Based on the webpage content, faculties are listed in the navigation
        faculty_links = [
            {"name": "Agricultural Sciences", "url": "https://www.sab.ac.lk/agri"},
            {"name": "Applied Sciences", "url": "https://www.sab.ac.lk/app"},
            {"name": "Computing", "url": "https://www.sab.ac.lk/computing"},
            {"name": "Geomatics", "url": "https://www.sab.ac.lk/geo"},
            {"name": "Graduate Studies", "url": "https://www.sab.ac.lk/fgs"},
            {"name": "Management Studies", "url": "https://www.sab.ac.lk/mgmt"},
            {"name": "Medicine", "url": "https://www.sab.ac.lk/med"},
            {"name": "Social Sciences & Languages", "url": "https://www.sab.ac.lk/fssl"},
            {"name": "Technology", "url": "https://www.sab.ac.lk/tech"}
        ]
        
        for faculty in faculty_links:
            faculties.append({
                'faculty_name': faculty['name'],
                'faculty_url': faculty['url'],
                'description': f"Faculty of {faculty['name']} at Sabaragamuwa University",
                'scraped_from': url
            })
        
        return faculties
    
    except Exception as e:
        print(f"Error scraping faculties: {e}")
        return []

# Scrape faculty information
print("Scraping faculty information...")
faculty_data = scrape_faculties(base_url)
print(f"Found {len(faculty_data)} faculties")

Scraping faculty information...
Found 9 faculties


In [4]:
# Function to scrape contact information and general details
def scrape_contact_info(url):
    """
    Scrape contact information and general university details
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        contact_info = []
        
        # Extract contact information from the page
        contact_data = {
            'university_name': 'Sabaragamuwa University of Sri Lanka',
            'address': 'P.O. Box 02, Belihuloya, 70140, Sri Lanka',
            'phone': '+94-45-2280014 / +94-45-2280087',
            'email': 'info@sab.ac.lk',
            'website': url,
            'scraped_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
        }
        
        # Try to extract more contact details from the page
        text_content = soup.get_text()
        
        # Look for additional phone numbers
        import re
        phones = re.findall(r'\+94-?\d{2}-?\d{7}', text_content)
        if phones:
            contact_data['additional_phones'] = ', '.join(set(phones))
        
        # Look for email addresses
        emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text_content)
        if emails:
            contact_data['additional_emails'] = ', '.join(set(emails))
        
        contact_info.append(contact_data)
        
        return contact_info
    
    except Exception as e:
        print(f"Error scraping contact info: {e}")
        return []

# Scrape contact information
print("Scraping contact information...")
contact_data = scrape_contact_info(base_url)
print(f"Extracted contact information for {len(contact_data)} entries")

Scraping contact information...
Extracted contact information for 1 entries


In [5]:
# Function to scrape centers and departments
def scrape_centers_and_departments(url):
    """
    Scrape information about centers and departments
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Centers data based on the webpage content
        centers = [
            {"name": "Centre for Computer Studies", "url": "https://www.sab.ac.lk/center-for-computer-studies"},
            {"name": "Centre for Gender Equity and Equality", "url": "https://www.sab.ac.lk/centre-for-gender-equity-and-equality"},
            {"name": "Centre for Indigenous Knowledge and Community Studies", "url": "https://www.sab.ac.lk/cikcs"},
            {"name": "Centre for Open and Distance Learning", "url": "https://www.sab.ac.lk/codl"},
            {"name": "Centre for Quality Assurance", "url": "https://www.sab.ac.lk/iqac"},
            {"name": "Centre for Research and Knowledge Dissemination", "url": "https://www.sab.ac.lk/crkd"},
            {"name": "Staff Development Center", "url": "https://www.sab.ac.lk/sdc"},
            {"name": "Career Guidance Unit", "url": "https://www.sab.ac.lk/career-guidance-unit"},
            {"name": "Department of Physical Education", "url": "https://www.sab.ac.lk/physical-education"},
            {"name": "University Business Linkage Cell", "url": "https://www.sab.ac.lk/ublc"}
        ]
        
        # Departments data
        departments = [
            {"name": "Academic Establishment", "url": "https://www.sab.ac.lk/academic-establishment"},
            {"name": "Capital Works", "url": "https://www.sab.ac.lk/capital-works-planning"},
            {"name": "Civil Engineering", "url": "https://www.sab.ac.lk/civil_engineering"},
            {"name": "Examinations Division", "url": "https://www.sab.ac.lk/examination_division"},
            {"name": "Finance Division", "url": "https://www.sab.ac.lk/finance-division"},
            {"name": "General Administration", "url": "https://www.sab.ac.lk/administration-officers"},
            {"name": "Non Academic Establishment", "url": "https://www.sab.ac.lk/non-academic-establishment"},
            {"name": "Registrar Office", "url": "https://www.sab.ac.lk/registrar_office"},
            {"name": "Student Affairs", "url": "https://www.sab.ac.lk/student-affairs"}
        ]
        
        # Format data for CSV
        centers_data = []
        for center in centers:
            centers_data.append({
                'type': 'Center',
                'name': center['name'],
                'url': center['url'],
                'description': f"{center['name']} at Sabaragamuwa University",
                'scraped_from': url
            })
        
        departments_data = []
        for dept in departments:
            departments_data.append({
                'type': 'Department',
                'name': dept['name'],
                'url': dept['url'],
                'description': f"{dept['name']} at Sabaragamuwa University",
                'scraped_from': url
            })
        
        return centers_data, departments_data
    
    except Exception as e:
        print(f"Error scraping centers and departments: {e}")
        return [], []

# Scrape centers and departments
print("Scraping centers and departments...")
centers_data, departments_data = scrape_centers_and_departments(base_url)
print(f"Found {len(centers_data)} centers and {len(departments_data)} departments")

Scraping centers and departments...
Found 10 centers and 9 departments


In [6]:
# Function to save data to CSV files
def save_to_csv(data, filename, fieldnames=None):
    """
    Save data to CSV file
    """
    try:
        if not data:
            print(f"No data to save for {filename}")
            return
        
        # Create directory if it doesn't exist
        os.makedirs('scraped_data', exist_ok=True)
        
        filepath = os.path.join('scraped_data', filename)
        
        # Convert to DataFrame for easier handling
        df = pd.DataFrame(data)
        
        # Save to CSV
        df.to_csv(filepath, index=False, encoding='utf-8')
        print(f"✓ Saved {len(data)} records to {filepath}")
        
        # Display first few rows
        print(f"Preview of {filename}:")
        print(df.head())
        print("-" * 50)
        
    except Exception as e:
        print(f"Error saving {filename}: {e}")

# Save all scraped data to CSV files
print("Saving data to CSV files...")
print("=" * 60)

# Save navigation data
if navigation_data:
    save_to_csv(navigation_data, 'navigation_links.csv')

# Save faculty data
if faculty_data:
    save_to_csv(faculty_data, 'faculties.csv')

# Save contact data
if contact_data:
    save_to_csv(contact_data, 'contact_information.csv')

# Save centers data
if centers_data:
    save_to_csv(centers_data, 'centers.csv')

# Save departments data
if departments_data:
    save_to_csv(departments_data, 'departments.csv')

print("=" * 60)
print("Data scraping and saving completed!")

Saving data to CSV files...
✓ Saved 654 records to scraped_data\navigation_links.csv
Preview of navigation_links.csv:
               text                                         url     section
0          About us  https://www.sab.ac.lk/about-the-university  Navigation
1  Vision & Mission        https://www.sab.ac.lk/vision-mission  Navigation
2           History               https://www.sab.ac.lk/history  Navigation
3        Contact us               https://www.sab.ac.lk/contact  Navigation
4          Overview              https://www.sab.ac.lk/overview  Navigation
--------------------------------------------------
✓ Saved 9 records to scraped_data\faculties.csv
Preview of faculties.csv:
            faculty_name                      faculty_url  \
0  Agricultural Sciences       https://www.sab.ac.lk/agri   
1       Applied Sciences        https://www.sab.ac.lk/app   
2              Computing  https://www.sab.ac.lk/computing   
3              Geomatics        https://www.sab.ac.lk/geo

In [7]:
# Comprehensive data scraping with additional information
def scrape_comprehensive_data(url):
    """
    Scrape comprehensive data from the main page including all available links and content
    """
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        
        all_links = []
        
        # Get all links from the page
        links = soup.find_all('a', href=True)
        
        for link in links:
            href = link.get('href')
            text = link.get_text(strip=True)
            
            if href and text and len(text) > 1:  # Filter out empty or single-character links
                full_url = urljoin(url, href)
                
                # Categorize links
                category = "Other"
                if any(keyword in href.lower() for keyword in ['faculty', 'agri', 'app', 'computing', 'geo', 'fgs', 'mgmt', 'med', 'fssl', 'tech']):
                    category = "Faculty"
                elif any(keyword in href.lower() for keyword in ['center', 'centre', 'ccs', 'gee', 'cikcs', 'codl', 'iqac', 'crkd', 'sdc']):
                    category = "Center"
                elif any(keyword in href.lower() for keyword in ['department', 'division', 'academic', 'administration', 'finance']):
                    category = "Department"
                elif any(keyword in href.lower() for keyword in ['about', 'vision', 'mission', 'history', 'chancellor']):
                    category = "About"
                elif any(keyword in href.lower() for keyword in ['student', 'admission', 'programme', 'course']):
                    category = "Academic"
                elif any(keyword in href.lower() for keyword in ['contact', 'phone', 'email']):
                    category = "Contact"
                
                all_links.append({
                    'link_text': text,
                    'url': full_url,
                    'category': category,
                    'is_internal': 'sab.ac.lk' in full_url,
                    'scraped_from': url,
                    'scraped_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
                })
        
        # Remove duplicates based on URL
        seen_urls = set()
        unique_links = []
        for link in all_links:
            if link['url'] not in seen_urls:
                seen_urls.add(link['url'])
                unique_links.append(link)
        
        return unique_links
    
    except Exception as e:
        print(f"Error in comprehensive scraping: {e}")
        return []

# Scrape comprehensive data
print("Performing comprehensive data scraping...")
comprehensive_data = scrape_comprehensive_data(base_url)
print(f"Found {len(comprehensive_data)} unique links")

# Save comprehensive data
if comprehensive_data:
    save_to_csv(comprehensive_data, 'all_links_comprehensive.csv')

Performing comprehensive data scraping...
Found 118 unique links
✓ Saved 118 records to scraped_data\all_links_comprehensive.csv
Preview of all_links_comprehensive.csv:
              link_text                                           url  \
0  Skip to main content           https://www.sab.ac.lk/#main-content   
1       Online Teaching  https://www.sab.ac.lk/online-video-tutorials   
2    Procurement System          http://online.sab.ac.lk/procurement/   
3          Exam Results                 https://www.sab.ac.lk/results   
4             Downloads               https://www.sab.ac.lk/downloads   

  category  is_internal            scraped_from         scraped_date  
0    Other         True  https://www.sab.ac.lk/  2025-08-13 17:09:26  
1    Other         True  https://www.sab.ac.lk/  2025-08-13 17:09:26  
2    Other         True  https://www.sab.ac.lk/  2025-08-13 17:09:26  
3    Other         True  https://www.sab.ac.lk/  2025-08-13 17:09:26  
4    Other         True  https://www.

In [8]:
# Data analysis and summary
def analyze_scraped_data():
    """
    Analyze the scraped data and provide summary statistics
    """
    print("📊 DATA ANALYSIS SUMMARY")
    print("=" * 60)
    
    # Check if directory exists
    if not os.path.exists('scraped_data'):
        print("No scraped data directory found!")
        return
    
    # List all CSV files created
    csv_files = [f for f in os.listdir('scraped_data') if f.endswith('.csv')]
    
    print(f"📁 Total CSV files created: {len(csv_files)}")
    
    total_records = 0
    for file in csv_files:
        filepath = os.path.join('scraped_data', file)
        try:
            df = pd.read_csv(filepath)
            records = len(df)
            total_records += records
            print(f"   • {file}: {records} records")
            
            # Show category breakdown for comprehensive data
            if 'category' in df.columns:
                print(f"     Categories: {df['category'].value_counts().to_dict()}")
        
        except Exception as e:
            print(f"   • {file}: Error reading file - {e}")
    
    print(f"\n📈 Total records scraped: {total_records}")
    
    # Analyze comprehensive data if available
    comp_file = os.path.join('scraped_data', 'all_links_comprehensive.csv')
    if os.path.exists(comp_file):
        df_comp = pd.read_csv(comp_file)
        
        print(f"\n🔗 LINK ANALYSIS:")
        print(f"   • Total unique links: {len(df_comp)}")
        print(f"   • Internal links: {df_comp['is_internal'].sum()}")
        print(f"   • External links: {(~df_comp['is_internal']).sum()}")
        
        print(f"\n📂 CATEGORY BREAKDOWN:")
        category_counts = df_comp['category'].value_counts()
        for category, count in category_counts.items():
            print(f"   • {category}: {count} links")
    
    print("\n" + "=" * 60)
    print("✅ Web scraping completed successfully!")
    print(f"📂 Data saved in: {os.path.abspath('scraped_data')}")

# Run analysis
analyze_scraped_data()

📊 DATA ANALYSIS SUMMARY
📁 Total CSV files created: 6
   • all_links_comprehensive.csv: 118 records
     Categories: {'Other': 57, 'Center': 21, 'Faculty': 17, 'Department': 9, 'About': 8, 'Academic': 5, 'Contact': 1}
   • centers.csv: 10 records
   • contact_information.csv: 1 records
   • departments.csv: 9 records
   • faculties.csv: 9 records
   • navigation_links.csv: 654 records

📈 Total records scraped: 801

🔗 LINK ANALYSIS:
   • Total unique links: 118
   • Internal links: 110
   • External links: 8

📂 CATEGORY BREAKDOWN:
   • Other: 57 links
   • Center: 21 links
   • Faculty: 17 links
   • Department: 9 links
   • About: 8 links
   • Academic: 5 links
   • Contact: 1 links

✅ Web scraping completed successfully!
📂 Data saved in: c:\Users\AImthadh\Desktop\New folder\webscrape\web\scraped_data
