In [15]:
from pypdf import PdfReader

# Open the transcript PDF and keep its first page; later cells reuse `page`.
reader = PdfReader("fall_2025_unofficial_transcript.pdf")
page = reader.pages[0]

# Print the extracted text one line per row to see where pypdf breaks lines.
print(*page.extract_text().split("\n"), sep="\n")

UNOFFICIAL
California State University, Long Beach – Unofficial Transcript
Name:           Krish Snehalkumar Patel
Student ID:   034002668
_________________________________________________________________________________________________________________________________________________
Print Date: 29/12/2025
Beginning of Post-baccalaureate Record
Fall 2025
Program: Masters Degree
Plan: Computer Science Major
Course Description Attempted Earned Grade Points
CECS  524 Adv Topics in 
Programming Lang
3.000 3.000 B 9.000
CECS  528 Adv Analysis of 
Algorithms
3.000 3.000 A 12.000
CECS  543 Advanced Software 
Engineering
3.000 3.000 A 12.000
Attempted Earned GPA Units Points
Term GPA 3.667 Term Totals 9.000 9.000 9.000 33.000
Combined GPA 3.667 Comb Totals 9.000 9.000 9.000 33.000
Attempted Earned GPA Units Points
Cum GPA 3.667 Cum Totals 9.000 9.000 9.000 33.000
Transfer Cum GPA Transfer  Totals 0.000 0.000 0.000 0.000
Combined Cum GPA 3.667 Comb Totals 9.000 9.000 9.000 33.000
Spring 2026
Pr

In [6]:
# Data fields to extract:

# Transcript Type enum[Unofficial, Official]
# University Name
# Student Name
# Student ID
# Print Date
# Program
# Term [fall/spring/summer] Year
# columns: Course Description, Attempted, Earned, Grade, Points
# rows with course code and name [eg: CECS  524 Adv Topics in Programming Lang], Attempted, Earned, Grade, Points [eg: 3.000 3.000 B 9.000]

# Attempted Earned GPA Units Points
# Term GPA 3.667 Term Totals 9.000 9.000 9.000 33.000
# Combined GPA 3.667 Comb Totals 9.000 9.000 9.000 33.000

# Attempted Earned GPA Units Points
# Cum GPA 3.667 Cum Totals 9.000 9.000 9.000 33.000
# Transfer Cum GPA Transfer  Totals 0.000 0.000 0.000 0.000
# Combined Cum GPA 3.667 Comb Totals 9.000 9.000 9.000 33.000


In [16]:
# Whitespace normalisation helper used to inspect the extracted lines.
def normalize(line):
    """Return `line` with outer whitespace removed and inner runs collapsed to one space."""
    tokens = line.split()
    return " ".join(tokens)

# Print every extracted line again, this time after whitespace normalisation.
print(*map(normalize, page.extract_text().split("\n")), sep="\n")


UNOFFICIAL
California State University, Long Beach – Unofficial Transcript
Name: Krish Snehalkumar Patel
Student ID: 034002668
_________________________________________________________________________________________________________________________________________________
Print Date: 29/12/2025
Beginning of Post-baccalaureate Record
Fall 2025
Program: Masters Degree
Plan: Computer Science Major
Course Description Attempted Earned Grade Points
CECS 524 Adv Topics in
Programming Lang
3.000 3.000 B 9.000
CECS 528 Adv Analysis of
Algorithms
3.000 3.000 A 12.000
CECS 543 Advanced Software
Engineering
3.000 3.000 A 12.000
Attempted Earned GPA Units Points
Term GPA 3.667 Term Totals 9.000 9.000 9.000 33.000
Combined GPA 3.667 Comb Totals 9.000 9.000 9.000 33.000
Attempted Earned GPA Units Points
Cum GPA 3.667 Cum Totals 9.000 9.000 9.000 33.000
Transfer Cum GPA Transfer Totals 0.000 0.000 0.000 0.000
Combined Cum GPA 3.667 Comb Totals 9.000 9.000 9.000 33.000
Spring 2026
Program: Masters Degr

###  3-line rolling buffer
Detection rule

A course always starts with:
```code
^[A-Z]{4}\s+\d{3}
```
Parsing logic

When you see a course code line:

Capture course code + first title fragment

Consume subsequent title lines until you hit a numeric line

Numeric line format:
```code
^(\d+\.\d{3})\s+(\d+\.\d{3})(?:\s+([A-F][+-]?))?(?:\s+(\d+\.\d{3}))?$
```


This handles:

Completed courses

In-progress courses (missing grade & points)

Output example:

```json
{
  "code": "CECS 524",
  "title": "Adv Topics in Programming Lang",
  "attempted_units": 3.0,
  "earned_units": 3.0,
  "grade": "B",
  "points": 9.0
}
```

In [None]:
# Regex patterns for the single-line transcript fields.
# Every pattern is anchored at both ends (^...$), so it must match a whole line.
NAME_RE = r"^Name:\s*(.+)$"                              # group 1: student name
STUDENT_ID_RE = r"^Student ID:\s*(\d+)$"                 # group 1: digits only
PRINT_DATE_RE = r"^Print Date:\s*(\d{2}/\d{2}/\d{4})$"   # group 1: NN/NN/NNNN
PROGRAM_RE = r"^Program:\s*(.+)$"                        # group 1: program name
TERM_RE = r"^(Fall|Spring|Summer)\s+(\d{4})$"            # groups: season, year
PLAN_RE = r"^Plan:\s*(.+)$"                              # group 1: plan name

# Backward-compatible alias: this pattern was originally named without the
# `_RE` suffix that every sibling pattern carries.
PLAN = PLAN_RE



In [12]:
# Data Validation Rules:

# Accepted transcript-type labels (compared in upper case).
TRANSCRIPT_TYPE = ["UNOFFICIAL", "OFFICIAL", "NOTFOUND"]

def validate_transcript_type(value: str) -> bool:
    """
    Report whether `value` names a known transcript type.

    The comparison is case-insensitive: `value` is upper-cased and looked up
    in `TRANSCRIPT_TYPE`.

    Args:
        value: Candidate label, e.g. "Unofficial".

    Returns:
        True when the upper-cased value is listed in `TRANSCRIPT_TYPE`;
        False otherwise, including when `value` is None or not a string
        (a validator should reject such input rather than raise).
    """
    if not isinstance(value, str):
        return False
    return value.upper() in TRANSCRIPT_TYPE


In [31]:
def _is_beginning_record_header(line: str) -> bool:
    return (
        line.startswith("Beginning of")
        and line.endswith("Record")
    )

def _is_course_table_header(line: str) -> bool:
    return (
        line.startswith("Course Description")
        and "Attempted" in line
        and "Earned" in line
        and "Grade" in line
    )


def _is_totals_header(line: str) -> bool:
    return (
        line.startswith("Attempted Earned")
        and "GPA" in line
        and "Units" in line
        and "Points" in line
    )


def _is_career_totals_header(line: str) -> bool:
    return line.endswith("Career Totals")

In [32]:
import re
from typing import Any, Dict, List, Optional, Tuple


# ============================
# Public API
# ============================

def parse_transcript_text(raw_text: str) -> Dict[str, Any]:
    """
    Parse extracted transcript text (like pypdf's page.extract_text()) into a structured dict.

    The text is normalised into non-empty lines and then walked once, front to
    back, with a single cursor `i` shared by three phases:

      1. header  - student name / ID / print date, up to the first
                   "Beginning of ... Record" line or term header;
      2. terms   - one entry per "Fall|Spring|Summer YYYY" header, each with
                   program, plan, course rows and totals rows;
      3. career  - totals rows after a line ending in "Career Totals", up to
                   a line starting with "End of".

    Args:
        raw_text: Transcript text, with lines separated by "\\n", "\\r\\n" or "\\r".

    Returns:
        A dict with keys:
          "student":       {"name", "student_id", "print_date"} (None when absent),
          "terms":         list of {"term", "program", "plan", "courses", "totals"},
          "career_totals": mapping of totals key -> totals payload,
          "raw":           {"warnings": [...]} with messages appended by _parse_course.
    """

    lines = _prep_lines(raw_text)

    out: Dict[str, Any] = {
        "student": {"name": None, "student_id": None, "print_date": None},
        "terms": [],
        "career_totals": {},
        "raw": {"warnings": []},
    }

    # `i` is the shared cursor into `lines`; every phase below advances it.
    i = 0
    n = len(lines)

    # ----------------------------
    # 1) HEADER
    # ----------------------------
    while i < n:
        line = lines[i]

        # The header ends at the record banner or at the first term header.
        if _is_beginning_record_header(line) or _is_term_header(line):
            break

        # Each field is matched independently; a later matching line overwrites
        # an earlier value.
        m = re.match(r"^Name:\s*(.+)$", line)
        if m:
            out["student"]["name"] = m.group(1).strip()

        # The ID is kept as a string, so leading zeros are preserved.
        m = re.match(r"^Student ID:\s*(\d+)$", line)
        if m:
            out["student"]["student_id"] = m.group(1)

        # The date is stored as the matched text; it is not parsed into a date object.
        m = re.match(r"^Print Date:\s*(\d{2}/\d{2}/\d{4})$", line)
        if m:
            out["student"]["print_date"] = m.group(1)

        i += 1

    # Advance past "Beginning of ... Record"
    # NOTE: the header loop above already stops under exactly these conditions
    # (or at end of input), so this loop performs no iterations.
    while (
        i < n
        and not _is_beginning_record_header(lines[i])
        and not _is_term_header(lines[i])
    ):
        i += 1

    # Consume the banner itself; a term header is left for the next phase.
    if i < n and _is_beginning_record_header(lines[i]):
        i += 1

    # ----------------------------
    # 2) TERMS
    # ----------------------------
    while i < n:
        line = lines[i]

        if _is_career_totals_header(line):
            break

        # Skip anything between terms that is not itself a term header.
        if not _is_term_header(line):
            i += 1
            continue

        term_obj: Dict[str, Any] = {
            "term": line,
            "program": None,
            "plan": None,
            "courses": [],
            "totals": {},
        }
        i += 1

        # Program / Plan
        # Only lines directly after the term header are considered; the first
        # line that is neither "Program:" nor "Plan:" ends this step.
        while i < n:
            line = lines[i]
            if line.startswith("Program:"):
                term_obj["program"] = line.replace("Program:", "", 1).strip()
                i += 1
                continue
            if line.startswith("Plan:"):
                term_obj["plan"] = line.replace("Plan:", "", 1).strip()
                i += 1
                continue
            break

        # Course table header
        # Skip forward to the column header, but never run past the next term
        # or the career-totals section.
        while i < n and not _is_course_table_header(lines[i]):
            if _is_term_header(lines[i]) or _is_career_totals_header(lines[i]):
                break
            i += 1

        if i < n and _is_course_table_header(lines[i]):
            i += 1

        # Courses
        while i < n:
            line = lines[i]

            if _is_totals_header(line):
                break
            if _is_term_header(line) or _is_career_totals_header(line):
                break

            if _is_course_code_line(line):
                # _parse_course returns the index of the first line it did not
                # consume, so the cursor must not be advanced again here.
                course, i = _parse_course(lines, i, out["raw"]["warnings"])
                term_obj["courses"].append(course)
                continue

            # Lines that are not a course start are skipped.
            i += 1

        # Totals blocks
        # A term may have several consecutive blocks, each introduced by its
        # own totals header; all rows land in the same `totals` mapping.
        while i < n and _is_totals_header(lines[i]):
            i += 1
            while i < n:
                line = lines[i]

                if _is_totals_header(line):
                    break
                if _is_term_header(line) or _is_career_totals_header(line):
                    break

                # Unrecognised rows yield None and are skipped.
                parsed = _parse_totals_line(line)
                if parsed:
                    key, payload = parsed
                    term_obj["totals"][key] = payload

                i += 1

        out["terms"].append(term_obj)

    # ----------------------------
    # 3) CAREER TOTALS
    # ----------------------------
    if i < n and _is_career_totals_header(lines[i]):
        i += 1
        while i < n:
            line = lines[i]

            # An "End of ..." line closes the career-totals section.
            if line.startswith("End of"):
                break

            parsed = _parse_career_totals_line(line)
            if parsed:
                key, payload = parsed
                out["career_totals"][key] = payload

            i += 1

    return out


# ============================
# Helpers
# ============================

def _prep_lines(raw_text: str) -> List[str]:
    lines = raw_text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    return [" ".join(ln.strip().split()) for ln in lines if ln.strip()]


_TERM_RE = re.compile(r"^(Fall|Spring|Summer)\s+\d{4}$")
def _is_term_header(line: str) -> bool:
    return bool(_TERM_RE.match(line))


def _is_beginning_record_header(line: str) -> bool:
    return line.startswith("Beginning of") and line.endswith("Record")


def _is_course_table_header(line: str) -> bool:
    return (
        line.startswith("Course Description")
        and "Attempted" in line
        and "Earned" in line
        and "Grade" in line
    )


def _is_totals_header(line: str) -> bool:
    return (
        line.startswith("Attempted Earned")
        and "GPA" in line
        and "Units" in line
        and "Points" in line
    )


def _is_career_totals_header(line: str) -> bool:
    return line.endswith("Career Totals")


_COURSE_START_RE = re.compile(r"^([A-Z]{4})\s+(\d{3})\s+(.+)$")
def _is_course_code_line(line: str) -> bool:
    return bool(_COURSE_START_RE.match(line))


_NUMERIC_LINE_RE = re.compile(
    r"^(?P<attempted>\d+\.\d{3})\s+"
    r"(?P<earned>\d+\.\d{3})"
    r"(?:\s+(?P<grade>[A-F][+-]?))?"
    r"(?:\s+(?P<points>\d+\.\d{3}))?$"
)


def _parse_course(
    lines: List[str], i: int, warnings: List[str]
) -> Tuple[Dict[str, Any], int]:

    m = _COURSE_START_RE.match(lines[i])
    subject, number, first_title = m.groups()
    title_parts = [first_title.strip()]
    i += 1

    while i < len(lines):
        line = lines[i]

        if _is_totals_header(line) or _is_term_header(line) or _is_career_totals_header(line):
            warnings.append(f"Course {subject} {number}: terminated early.")
            break

        if _is_course_code_line(line):
            warnings.append(f"Course {subject} {number}: next course before numeric line.")
            break

        if _NUMERIC_LINE_RE.match(line):
            break

        title_parts.append(line)
        i += 1

    attempted = earned = grade = points = None

    if i < len(lines) and _NUMERIC_LINE_RE.match(lines[i]):
        nm = _NUMERIC_LINE_RE.match(lines[i])
        attempted = float(nm.group("attempted"))
        earned = float(nm.group("earned"))
        grade = nm.group("grade")
        pts = nm.group("points")
        points = float(pts) if pts else None
        i += 1
    else:
        warnings.append(f"Course {subject} {number}: missing numeric line.")

    return {
        "code": f"{subject} {number}",
        "title": " ".join(title_parts),
        "attempted_units": attempted,
        "earned_units": earned,
        "grade": grade,
        "points": points,
    }, i


def _parse_totals_line(line: str) -> Optional[Tuple[str, Dict[str, Any]]]:
    tail = re.search(r"(\d+\.\d{3})\s+(\d+\.\d{3})\s+(\d+\.\d{3})\s+(\d+\.\d{3})$", line)
    if not tail:
        return None

    attempted, earned, gpa_units, points = map(float, tail.groups())
    gpa_match = re.search(r"\bGPA\b[: ]+(\d+\.\d{3})", line)
    gpa = float(gpa_match.group(1)) if gpa_match else None

    if line.startswith("Term GPA"):
        key = "term"
    elif line.startswith("Combined GPA"):
        key = "combined"
    elif line.startswith("Cum GPA"):
        key = "cumulative"
    elif line.startswith("Transfer Cum GPA"):
        key = "transfer_cumulative"
    elif line.startswith("Combined Cum GPA"):
        key = "combined_cumulative"
    else:
        return None

    return key, {
        "gpa": gpa,
        "attempted": attempted,
        "earned": earned,
        "gpa_units": gpa_units,
        "points": points,
        "raw_line": line,
    }


def _parse_career_totals_line(line: str) -> Optional[Tuple[str, Dict[str, Any]]]:
    """
    Parse one row of the career-totals section.

    Career rows are handed to `_parse_totals_line` unchanged, and its result
    is returned as-is.
    """
    parsed_row = _parse_totals_line(line)
    return parsed_row


In [33]:
import json
# Read the text of the transcript's first page.
reader = PdfReader("fall_2025_unofficial_transcript.pdf")
page = reader.pages[0]
raw_text = page.extract_text()

# Parse it, then show the result's type followed by a pretty-printed JSON dump.
parsed = parse_transcript_text(raw_text)
print(type(parsed), json.dumps(parsed, indent=2), sep="\n")

<class 'dict'>
{
  "student": {
    "name": "Krish Snehalkumar Patel",
    "student_id": "034002668",
    "print_date": "29/12/2025"
  },
  "terms": [
    {
      "term": "Fall 2025",
      "program": "Masters Degree",
      "plan": "Computer Science Major",
      "courses": [
        {
          "code": "CECS 524",
          "title": "Adv Topics in Programming Lang",
          "attempted_units": 3.0,
          "earned_units": 3.0,
          "grade": "B",
          "points": 9.0
        },
        {
          "code": "CECS 528",
          "title": "Adv Analysis of Algorithms",
          "attempted_units": 3.0,
          "earned_units": 3.0,
          "grade": "A",
          "points": 12.0
        },
        {
          "code": "CECS 543",
          "title": "Advanced Software Engineering",
          "attempted_units": 3.0,
          "earned_units": 3.0,
          "grade": "A",
          "points": 12.0
        }
      ],
      "totals": {
        "term": {
          "gpa": 3.667,
      

In [34]:
import pandas as pd

def courses_to_dataframe(parsed: dict) -> pd.DataFrame:
    """
    Flatten a parsed transcript into one row per course.

    Student and term fields are repeated on every course row.

    Args:
        parsed: Dict with optional "student" and "terms" entries, as produced
            by `parse_transcript_text`.

    Returns:
        A DataFrame with a fixed set of columns (student_name, student_id,
        term, program, plan, course_code, course_title, attempted_units,
        earned_units, grade, points). The columns are present even when there
        are no courses, so downstream column access does not fail on an
        empty transcript.
    """
    columns = [
        "student_name",
        "student_id",
        "term",
        "program",
        "plan",
        "course_code",
        "course_title",
        "attempted_units",
        "earned_units",
        "grade",
        "points",
    ]

    student = parsed.get("student", {})
    rows = []

    for term in parsed.get("terms", []):
        term_name = term.get("term")
        program = term.get("program")
        plan = term.get("plan")

        for course in term.get("courses", []):
            rows.append({
                "student_name": student.get("name"),
                "student_id": student.get("student_id"),
                "term": term_name,
                "program": program,
                "plan": plan,
                "course_code": course.get("code"),
                "course_title": course.get("title"),
                "attempted_units": course.get("attempted_units"),
                "earned_units": course.get("earned_units"),
                "grade": course.get("grade"),
                "points": course.get("points"),
            })

    # Passing `columns` fixes the schema; without it an empty `rows` list
    # produces a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)
# Build the per-course table from the parsed transcript and display it.
df = courses_to_dataframe(parsed)
df

Unnamed: 0,student_name,student_id,term,program,plan,course_code,course_title,attempted_units,earned_units,grade,points
0,Krish Snehalkumar Patel,34002668,Fall 2025,Masters Degree,Computer Science Major,CECS 524,Adv Topics in Programming Lang,3.0,3.0,B,9.0
1,Krish Snehalkumar Patel,34002668,Fall 2025,Masters Degree,Computer Science Major,CECS 528,Adv Analysis of Algorithms,3.0,3.0,A,12.0
2,Krish Snehalkumar Patel,34002668,Fall 2025,Masters Degree,Computer Science Major,CECS 543,Advanced Software Engineering,3.0,3.0,A,12.0
3,Krish Snehalkumar Patel,34002668,Spring 2026,Masters Degree,Computer Science Major,CECS 551,Adv Artificial Intelligence,3.0,0.0,,0.0
4,Krish Snehalkumar Patel,34002668,Spring 2026,Masters Degree,Computer Science Major,CECS 553,Machine Vision 3.000 0.000 0.000,,,,
5,Krish Snehalkumar Patel,34002668,Spring 2026,Masters Degree,Computer Science Major,CECS 575,Obj Oriented Analysis & Design,3.0,0.0,,0.0


In [35]:
def term_totals_to_dataframe(parsed: dict) -> pd.DataFrame:
    """
    Flatten every term's totals into one row per (term, totals type).

    Args:
        parsed: Dict with an optional "terms" list whose items may carry a
            "totals" mapping of totals type -> payload.

    Returns:
        A DataFrame with the columns term, totals_type, gpa, attempted,
        earned, gpa_units and points. The columns are present even when no
        totals were parsed.
    """
    columns = ["term", "totals_type", "gpa", "attempted", "earned", "gpa_units", "points"]
    rows = []

    for term in parsed.get("terms", []):
        term_name = term.get("term")

        for totals_type, totals in term.get("totals", {}).items():
            rows.append({
                "term": term_name,
                "totals_type": totals_type,  # term / combined / cumulative
                "gpa": totals.get("gpa"),
                "attempted": totals.get("attempted"),
                "earned": totals.get("earned"),
                "gpa_units": totals.get("gpa_units"),
                "points": totals.get("points"),
            })

    # Passing `columns` fixes the schema; without it an empty `rows` list
    # produces a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)
# Build the per-term totals table from the parsed transcript and display it.
df_totals = term_totals_to_dataframe(parsed)
df_totals

Unnamed: 0,term,totals_type,gpa,attempted,earned,gpa_units,points
0,Fall 2025,term,3.667,9.0,9.0,9.0,33.0
1,Fall 2025,combined,3.667,9.0,9.0,9.0,33.0
2,Fall 2025,cumulative,3.667,9.0,9.0,9.0,33.0
3,Fall 2025,transfer_cumulative,,0.0,0.0,0.0,0.0
4,Fall 2025,combined_cumulative,3.667,9.0,9.0,9.0,33.0
5,Spring 2026,term,0.0,9.0,0.0,0.0,0.0
6,Spring 2026,combined,0.0,9.0,0.0,0.0,0.0
7,Spring 2026,cumulative,3.667,18.0,9.0,9.0,33.0
8,Spring 2026,transfer_cumulative,,0.0,0.0,0.0,0.0
9,Spring 2026,combined_cumulative,3.667,18.0,9.0,9.0,33.0


In [36]:
def career_totals_to_dataframe(parsed: dict) -> pd.DataFrame:
    """
    Turn the career-totals mapping into one row per totals type.

    Args:
        parsed: Dict with an optional "career_totals" mapping of
            totals type -> payload.

    Returns:
        A DataFrame with the columns totals_type, gpa, attempted, earned,
        gpa_units and points. The columns are present even when the
        transcript had no career-totals section.
    """
    columns = ["totals_type", "gpa", "attempted", "earned", "gpa_units", "points"]
    rows = []

    for totals_type, totals in parsed.get("career_totals", {}).items():
        rows.append({
            "totals_type": totals_type,
            "gpa": totals.get("gpa"),
            "attempted": totals.get("attempted"),
            "earned": totals.get("earned"),
            "gpa_units": totals.get("gpa_units"),
            "points": totals.get("points"),
        })

    # Passing `columns` fixes the schema; without it an empty `rows` list
    # produces a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)
# Build the career totals table from the parsed transcript and display it.
df_career = career_totals_to_dataframe(parsed)
df_career

Unnamed: 0,totals_type,gpa,attempted,earned,gpa_units,points
0,cumulative,3.667,18.0,9.0,9.0,33.0
1,transfer_cumulative,,0.0,0.0,0.0,0.0
2,combined_cumulative,3.667,18.0,9.0,9.0,33.0
