<a href="https://colab.research.google.com/github/henilptel/result_scraper/blob/main/results_web_scrap_master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Destination path for the scraped rows
output_file = "scraped_results.csv"

# URL prefix of the result pages and the first/last seat numbers to visit
base_url = "https://ums.cvmu.ac.in/GenerateResultHTML/1378/"
start_id, end_id = 1213001, 1213100

# CSV column names, in output order
columns = [
    "Seat No",
    "Exam Name",
    "Program Name",
    "Student Name",
    "College Name",
    "Enrolment No",
    "Result Declared ON Date",
    "SP ID",
    "Courses",
    "SGPA",
    "CGPA",
    "Result",
    "Current Semester Backlogs",
]

# Function to scrape data from a single URL
def scrape_result(url, timeout=30):
    """Fetch one result page and flatten it into a single CSV-ready record.

    Args:
        url: Address of the result page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with one entry per CSV column (missing fields are empty
        strings), or ``None`` when the page could not be fetched: a network
        error, a 404, or any other non-200 status.
    """
    try:
        # Without a timeout a stalled server would block the whole run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report and skip, so one bad request does not discard every row
        # collected so far.
        print(f"Failed to fetch {url}. Error: {exc}")
        return None

    if response.status_code == 404:
        return None
    if response.status_code != 200:
        print(f"Failed to fetch {url}. Status Code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Rows with two cells are "label: | value" pairs (student/exam details);
    # rows with a single cell carry "label: value" text (SGPA, CGPA, ...).
    details = {}
    additional_info = {}
    for row in soup.find_all('tr', class_='background1'):
        cols = row.find_all('td')
        if len(cols) == 2:
            # Strip after splitting too, so "Seat No :" still maps to "Seat No".
            key = cols[0].text.strip().split(":")[0].strip()
            details[key] = cols[1].text.strip()
        elif len(cols) == 1:
            text = cols[0].text.strip()
            if ":" in text:
                key, value = text.split(":", 1)  # Split on the first colon only
                additional_info[key.strip()] = value.strip()

    # Extract course details
    course_table = soup.find('table', {'id': 'mytbl'})
    if course_table is None:
        print(f"Course table missing in {url}")
        courses_combined = ""
    else:
        course_info = []
        for row in course_table.find_all('tr')[1:]:  # Skip header row
            course = [col.text.strip() for col in row.find_all('td')]
            # The format below reads five cells; shorter rows are skipped
            # instead of raising IndexError.
            if len(course) >= 5:
                course_info.append(
                    f"{course[0]}: {course[1]} "
                    f"(GL: {course[2]}, GP: {course[3]}, Credit: {course[4]})"
                )
        courses_combined = "; ".join(course_info)

    # Combine all details into a single record
    return {
        "Seat No": details.get("Seat No", ""),
        "Exam Name": details.get("Exam Name", ""),
        "Program Name": details.get("Program Name", ""),
        "Student Name": details.get("Student Name", ""),
        "College Name": details.get("College Name", ""),
        "Enrolment No": details.get("Enrolment / PG Registration No", ""),
        "Result Declared ON Date": details.get("Result Declared ON Date", ""),
        "SP ID": details.get("SP ID", ""),
        "Courses": courses_combined,  # Combined course details
        "SGPA": additional_info.get("SGPA", ""),
        "CGPA": additional_info.get("CGPA", ""),
        "Result": additional_info.get("Result", ""),
        "Current Semester Backlogs": additional_info.get("Current Semester Backlogs", ""),
    }


# Main scraping function
def scrape_all_results(max_consecutive_failures=4):
    """Scrape every seat number in the configured range and save the rows as CSV.

    Visits ``start_id`` through ``end_id`` inclusive, building each page URL
    from ``base_url``. Whatever was collected is written to ``output_file``
    with ``columns`` as the header, even if the loop stops early.

    Args:
        max_consecutive_failures: Stop scraping once this many pages in a
            row yield no result.
    """
    all_results = []
    consecutive_failures = 0  # Pages in a row that produced no record

    for seat_no in range(start_id, end_id + 1):
        url = f"{base_url}{seat_no}.html"
        print(f"Scraping {url}...")
        result = scrape_result(url)

        if result:
            all_results.append(result)
            consecutive_failures = 0  # Reset on any successful fetch
            continue

        # scrape_result returns None for 404s and for other fetch failures,
        # so the message does not claim a specific status code.
        consecutive_failures += 1
        print(f"No result for {url}. Consecutive failures: {consecutive_failures}")
        if consecutive_failures >= max_consecutive_failures:
            print(f"Stopping script after {max_consecutive_failures} consecutive failures.")
            break

    # Use the module-level filename and column list so editing them at the
    # top of the file takes effect here.
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(all_results)  # One CSV row per scraped record

    print(f"Results saved to {output_file}.")


# Entry point: start the full scrape only when this module is the one being
# executed (its __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    scrape_all_results()


Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213001.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213002.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213003.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213004.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213005.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213006.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213007.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213008.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213009.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213010.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213011.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213012.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213013.html...
Scraping https://ums.cvmu.ac.in/GenerateResultHTML/1378/1213014.