In [10]:
import pdfplumber
import pandas as pd
import re
import csv

# Seat-category codes, in the order their counts are read from each row.
# They are also used verbatim as the column headers of the output table,
# so the order here must not change.
desired_categories = [
    '1G', '1H',
    '2AG', '2AH', '2BG', '2BH',
    '3AG', '3AH', '3BG', '3BH',
    'GM', 'GMH', 'NKN', 'PH',
    'SCG', 'SCH', 'STG', 'STH',
    'XD',
]

def is_numeric(s):
    """Return True if *s* can be converted by ``float()``, else False.

    "Numeric" here means exactly what ``float()`` accepts, so besides plain
    integers and decimals it also includes forms such as ``"1e3"``,
    ``"nan"`` and ``"inf"``.

    Args:
        s: The value to test; normally a whitespace-delimited token.

    Returns:
        bool: True when ``float(s)`` succeeds, False otherwise. Values of an
        unsupported type (e.g. ``None`` or a list) yield False instead of
        raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number.
        # TypeError: an object float() cannot convert at all (None, list...).
        return False
    return True

# Extract one row per college from the seat-matrix PDF and save it as CSV.
# Each row is: college code, one count per entry of desired_categories,
# and the total-seats figure taken from the last token of the line.
file_path = "MCA_BULL_2023_FREE_C_FINenglish.pdf"
college_data = []

# Compiled once for the whole document rather than once per page.
# Matches a college code: 'C', three digits, then 'MC'.
college_code_pattern = re.compile(r'C\d{3}MC')

with pdfplumber.open(file_path) as pdf:
    for page in pdf.pages:
        # Guard against a page with no extractable text (extract_text() may
        # give back None there), which would otherwise crash on .split().
        text = (page.extract_text() or '').split('\n')

        for line in text:
            match = college_code_pattern.search(line)
            if match is None:
                continue
            college_code = match.group()

            # The seat figures sit on the same line as the college code, so
            # no look-ahead is needed and a match on the final line of a
            # page is processed like any other.
            seat_info = line.split()

            # Skip lines too short to hold every category plus a total.
            if len(seat_info) < len(desired_categories) + 1:
                continue

            total_seats = seat_info[-1]

            # Take the first len(desired_categories) numeric tokens in order
            # and pad any shortfall with '0'.
            # NOTE(review): values are packed left, so a blank/non-numeric
            # category cell (or a stray number elsewhere on the line) shifts
            # the following counts into the wrong columns -- confirm against
            # the PDF layout.
            numeric_tokens = [item for item in seat_info if is_numeric(item)]
            filled = numeric_tokens[:len(desired_categories)]
            seat_availability = filled + ['0'] * (len(desired_categories) - len(filled))

            college_data.append([college_code] + seat_availability + [total_seats])

columns = ['College Code'] + desired_categories + ['Total Seats']
df = pd.DataFrame(college_data, columns=columns)

csv_file_path = "college_seats_cleaned.csv"
# QUOTE_NONE writes fields unquoted; the escapechar is required so that any
# field containing the delimiter can still be written.
df.to_csv(csv_file_path, index=False, quoting=csv.QUOTE_NONE, escapechar='\\')
print(f"Data saved to {csv_file_path}")


Data saved to college_seats_cleaned.csv
