In [22]:
import pandas as pd
import PyPDF2
import re
import os

# --- Advanced Stats Parsing Section ---
# Input PDF and the CSV that the parsed table is written to
pdf_path = r"C:\Users\jonla\NBA_Playoffs_Series_Predictor\Data\Advanced_Stats_25.pdf"
csv_path = r"C:\Users\jonla\NBA_Playoffs_Series_Predictor\Data\Advanced_Stats_25.csv"

# Header row of the output CSV, in the order the columns are written
columns = [
    "Team",
    "W",
    "PIE",
    "eFG%",
    "OREB%",
    "DREB%",
    "TS%",
    "OffRtg",
    "DefRtg",
]

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Read a PDF with PyPDF2 and return the text of all its pages.

    Each page that yields any text contributes that text followed by a
    newline; pages for which ``extract_text()`` returns nothing are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts as one string.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        Exception: For any other failure, with the original error's text
            appended to "Error reading PDF: ".
    """
    try:
        # Extraction happens while the file is still open.
        with open(pdf_path, "rb") as pdf_file:
            page_texts = [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
        extracted = "".join(chunk + "\n" for chunk in page_texts if chunk)
        print("Text extracted with PyPDF2 successfully.")
        return extracted
    except FileNotFoundError:
        raise FileNotFoundError(f"PDF file not found at {pdf_path}")
    except Exception as err:
        raise Exception(f"Error reading PDF: {err}")

# Process text to extract team data using sequential block parsing
def process_text(text):
    """Parse PyPDF2-extracted advanced-stats text into a per-team DataFrame.

    The text is treated as one token per non-blank line. A team block starts
    at an integer rank (1-30) that is immediately followed by a non-numeric
    token, and runs until the next such rank (or the end of the text). Inside
    a block the layout is: rank, team-name words, then 19 stat tokens in the
    header order GP, W, L, MIN, OffRtg, DefRtg, NetRtg, AST%, AST/TO,
    AST Ratio, OREB%, DREB%, REB%, TOV%, eFG%, TS%, PACE, PIE, POSS.

    Tokens that follow the 19 stats inside a block (page footers, repeated
    headers, trailing text after the last team) are ignored instead of
    causing the whole team to be discarded.

    Args:
        text: Raw text as returned by ``extract_text_from_pdf``.

    Returns:
        A DataFrame with columns Team, W, PIE, eFG%, OREB%, DREB%, TS%,
        OffRtg and DefRtg, one row per accepted team. Stat columns are
        converted with ``pd.to_numeric(errors="coerce")``, so values that
        cannot be parsed become NaN.

    Side effects:
        Prints debugging information (text preview, detected ranks, block
        sizes, processed teams, skipped blocks, DataFrame head).
    """
    # Print extracted text for debugging
    print("\nExtracted text (first 1000 characters):\n", text[:1000])

    # The row built in process_block is hard-wired to exactly these columns,
    # so the header list lives here rather than in a notebook-level global
    # that another cell could overwrite.
    output_columns = ["Team", "W", "PIE", "eFG%", "OREB%", "DREB%", "TS%", "OffRtg", "DefRtg"]
    stats_per_team = 19
    # Smallest block that is still processed (unchanged from the original
    # lower bound): rank + team-name tokens + stats.
    min_block_tokens = 22
    max_rank = 30

    numeric_token = re.compile(r'-?\d+(\.\d+)?')
    rank_token = re.compile(r'\d+')

    def is_numeric(token):
        # Note: values with thousands separators such as "8,267" are NOT numeric here.
        return numeric_token.fullmatch(token) is not None

    # Split text into lines and clean
    lines = [line.strip() for line in text.split('\n') if line.strip()]

    # Define NBA team keywords for validation
    nba_team_keywords = [
        "Cavaliers", "Celtics", "Thunder", "Nuggets", "Knicks", "Grizzlies", "Kings",
        "Timberwolves", "Pacers", "Bucks", "Lakers", "Rockets", "Suns", "Pistons",
        "Clippers", "Warriors", "Hawks", "Mavericks", "Spurs", "Bulls", "Heat",
        "Trail Blazers", "76ers", "Jazz", "Pelicans", "Raptors", "Magic", "Nets",
        "Hornets", "Wizards", "LA", "Los", "Angeles", "York", "City", "State", "Antonio", "Orleans", "Golden"
    ]

    # Initialize data storage
    structured_data = []
    skipped_rows = []

    def process_block(block, rank):
        # Only a lower bound is enforced: the final block always extends to
        # the end of the document, so an upper bound would drop a valid team
        # whenever anything follows its stats.
        if len(block) < min_block_tokens:
            skipped_rows.append(f"Skipped block for rank {rank}: 'Incomplete' (tokens: {block[:10]}, total tokens: {len(block)})")
            return

        # Team name = every token after the rank up to the first numeric one
        name_end = 1
        while name_end < len(block) and not is_numeric(block[name_end]):
            name_end += 1
        team_name = " ".join(block[1:name_end]).strip()

        # Validate team name (substring match against the keyword list)
        if not team_name or not any(keyword in team_name for keyword in nba_team_keywords):
            skipped_rows.append(f"Skipped block for rank {rank}: 'Invalid team name: {team_name}' (tokens: {block[:10]}, total tokens: {len(block)})")
            return

        stats = block[name_end:name_end + stats_per_team]
        # Pad short stat runs so the fixed indices below never raise
        stats.extend([None] * (stats_per_team - len(stats)))

        # Map stats to required columns (indices follow the header order)
        structured_data.append([
            team_name,        # Team
            stats[1],         # W
            stats[17],        # PIE
            stats[14],        # eFG%
            stats[10],        # OREB%
            stats[11],        # DREB%
            stats[15],        # TS%
            stats[4],         # OffRtg
            stats[5]          # DefRtg
        ])
        print(f"Processed team: {team_name}, rank: {rank}, tokens count: {len(block)}")

        ignored = len(block) - name_end - stats_per_team
        if ignored > 0:
            print(f"  Ignored {ignored} trailing token(s) after the stats for rank {rank}")

    # Find all rank positions. Header words never match the integer pattern,
    # so no separate skip list is needed. The last line cannot be a rank
    # because a rank must be followed by a team-name token.
    rank_indices = [
        (int(token), pos)
        for pos, token in enumerate(lines[:-1])
        if rank_token.fullmatch(token)
        and 1 <= int(token) <= max_rank
        and not is_numeric(lines[pos + 1])
    ]

    print(f"Found {len(rank_indices)} rank indices: {rank_indices}")

    # Each block ends where the next rank starts; the last one ends at EOF
    block_ends = [pos for _, pos in rank_indices[1:]] + [len(lines)]
    for (rank, start), end in zip(rank_indices, block_ends):
        block = lines[start:end]
        print(f"Rank {rank} block size: {len(block)}")
        process_block(block, rank)

    # Create DataFrame
    df = pd.DataFrame(structured_data, columns=output_columns)

    # Clean numeric columns
    for col in output_columns[1:]:
        cleaned = df[col].str.replace(' .', '.', regex=False)  # Fix formatting like "121 .0"
        cleaned = cleaned.str.replace(',', '', regex=False)    # Remove thousands separators
        df[col] = pd.to_numeric(cleaned, errors="coerce")

    print(f"Extracted {len(df)} teams")
    print("Skipped rows:", skipped_rows)
    print("\nFirst few rows of DataFrame:")
    print(df.head())

    return df

# Save to CSV
def save_to_csv(df, csv_path):
    """Write ``df`` to ``csv_path`` without the index and report the outcome.

    Failures are not raised: the error is printed and the function returns
    normally, so callers cannot tell success from failure by the return value.

    Args:
        df: DataFrame to write.
        csv_path: Destination file path.
    """
    try:
        df.to_csv(csv_path, index=False)
        outcome = f"Saved data to {csv_path} with shape {df.shape}"
    except Exception as err:
        outcome = f"Error saving to {csv_path}: {err}"
    print(outcome)

# Run the extraction and processing
try:
    # Pipeline: PDF -> raw text -> per-team DataFrame -> CSV on disk.
    # `text` and `df` stay at notebook level so they can be inspected afterwards.
    text = extract_text_from_pdf(pdf_path)
    df = process_text(text)
    save_to_csv(df, csv_path)
except Exception as err:
    # Any failure is reported as a single line; the cell itself does not raise.
    print(f"Error in Advanced Stats Parsing: {err}")

Text extracted with PyPDF2 successfully.

Extracted text (first 1000 characters):
 TEAM
 
GP
 
W
 
L
 
MIN
 
OffRtg
 
DefRtg
 
NetRtg
 
AST%
 
AST/TO
 
AST
 
Ratio
 
OREB%
 
DREB%
 
REB%
 
TOV%
 
eFG%
 
TS%
 
PACE
 
PIE
 
POSS
 
1
 
Cleveland
 
Cavaliers
 
82
 
64
 
18
 
3951.0
 
121.0
 
111.8
 
9.2
 
63.1
 
2.13
 
19.6
 
29.6
 
70.0
 
50.5
 
13.1
 
57.8
 
60.7
 
100.31
 
54.6
 
8,267
 
2
 
Boston
 
Celtics
 
82
 
61
 
21
 
3966.0
 
119.5
 
110.1
 
9.4
 
62.8
 
2.20
 
19.0
 
29.1
 
71.7
 
50.6
 
12.2
 
56.1
 
59.1
 
96.59
 
54.2
 
7,979
 
3
 
Oklahoma
 
City
 
Thunder
 
82
 
68
 
14
 
3941.0
 
119.2
 
106.6
 
12.7
 
60.2
 
2.29
 
18.9
 
28.1
 
70.4
 
49.6
 
11.6
 
56.0
 
59.3
 
100.90
 
56.2
 
8,286
 
4
 
Denver
 
Nuggets
 
82
 
50
 
32
 
3971.0
 
118.9
 
115.1
 
3.8
 
68.3
 
2.17
 
21.1
 
31.1
 
70.9
 
52.0
 
14.0
 
57.3
 
60.4
 
100.67
 
52.9
 
8,328
 
5
 
New
 
York
 
Knicks
 
82
 
51
 
31
 
3976.0
 
117.3
 
113.3
 
4.0
 
63.4
 
2.07
 
19.6
 
30.5
 
71.0
 
50.8
 
13.4
 
55.6
 
58.9


In [24]:
import pandas as pd
# Re-load the CSV written by the parsing cell to check what was saved.
advanced_stats_csv = r"C:\Users\jonla\NBA_Playoffs_Series_Predictor\Data\Advanced_Stats_25.csv"
df = pd.read_csv(advanced_stats_csv)

# Table contents, then (rows, columns), then the parsed team names.
print(df)
print(df.shape)
team_names = df['Team'].tolist()
print(team_names)

                      Team   W   PIE  eFG%  OREB%  DREB%   TS%  OffRtg  DefRtg
0      Cleveland Cavaliers  64  54.6  57.8   29.6   70.0  60.7   121.0   111.8
1           Boston Celtics  61  54.2  56.1   29.1   71.7  59.1   119.5   110.1
2    Oklahoma City Thunder  68  56.2  56.0   28.1   70.4  59.3   119.2   106.6
3           Denver Nuggets  50  52.9  57.3   31.1   70.9  60.4   118.9   115.1
4          New York Knicks  51  52.7  55.6   30.5   71.0  58.9   117.3   113.3
5        Memphis Grizzlies  48  52.4  55.4   33.0   70.2  58.8   117.2   112.6
6         Sacramento Kings  40  50.2  54.8   29.4   72.8  58.2   115.9   115.3
7   Minnesota Timberwolves  49  52.4  55.4   30.0   70.9  58.8   115.7   110.8
8           Indiana Pacers  50  52.1  56.2   25.4   70.5  59.4   115.4   113.3
9          Milwaukee Bucks  48  51.3  56.8   23.5   72.5  59.8   115.1   112.7
10      Los Angeles Lakers  50  51.1  55.7   27.3   70.2  59.3   115.0   113.8
11         Houston Rockets  52  51.5  52.3   36.3   