<a href="https://colab.research.google.com/github/kinugasa-hirata/claude-code-docker/blob/main/pdfreader_20250724.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install PyPDF2

import pandas as pd
import re
from google.colab import files
import PyPDF2
import io
from typing import Dict, List, Any
import numpy as np

class CMMDataParser:
    """Turn the text of a CMM measurement report (PDF) into structured data.

    Parsed measurement elements accumulate in ``measurement_data`` (a list of
    dicts, one per circle/plane/line element) and coordinate-system header
    information accumulates in ``coordinate_system_data``.  The parsed data
    can be viewed as pandas DataFrames or exported to CSV files through the
    Google Colab ``files`` helper.
    """

    # Line patterns, tried in this order; the first one that matches wins.
    # In the circle pattern the side (内側/外側) is optional *together with*
    # its leading whitespace, so a stripped header line that carries no side
    # is still recognised as a circle.
    _LINE_PATTERNS = {
        'circle': re.compile(r'(円\d+|基準円\d+)\s+円\(最小二乗法\)\s+点数\s+\((\d+)\)(?:\s+(内側|外側))?'),
        'plane': re.compile(r'(平面\d+|.*平面)\s+平面\(最小二乗法\)\s+点数\s+\((\d+)\)'),
        'line': re.compile(r'(.*線)\s+直線\(最小二乗法\)\s+点数\s+\((\d+)\)'),
        'coordinate_value': re.compile(r'([XYZ]-値_.*?|[XYZ])\s+([XYZ])\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)'),
        'diameter': re.compile(r'D\s+([\d.]+)\s+([\d.]+)'),
        'statistics': re.compile(r'S=\s+([\d.]+)\s+Min=\([^)]+\)\s+([-\d.]+)\s+Max=\([^)]+\)\s+([-\d.]+)\s+形状=\s+([\d.]+)'),
    }

    # Pattern names that open a new measurement element.
    _ELEMENT_TYPES = ('circle', 'plane', 'line')

    # Column order of the tolerance-analysis table (kept even when empty).
    _TOLERANCE_COLUMNS = [
        'Element_Name',
        'Element_Type',
        'Coordinate',
        'Axis',
        'Deviation',
        'Upper_Tolerance',
        'Lower_Tolerance',
        'Within_Tolerance',
        'Tolerance_Utilization_%',
        'Status',
    ]

    def __init__(self):
        self.measurement_data = []
        self.coordinate_system_data = {}
        self.reference_elements = {}

    def upload_file(self):
        """Prompt for a file upload in Google Colab.

        Returns:
            ``(filename, file_content)`` for the first uploaded file, or
            ``None`` when nothing was uploaded.
        """
        print("Please upload your CMM measurement file (PDF format):")
        uploaded = files.upload()

        if not uploaded:
            print("No file uploaded!")
            return None

        # Only the first uploaded file is used.
        filename = next(iter(uploaded))
        file_content = uploaded[filename]

        print(f"File '{filename}' uploaded successfully!")
        return filename, file_content

    def extract_pdf_text(self, file_content):
        """Extract the text of every page of a PDF.

        Args:
            file_content: raw bytes of the PDF file.

        Returns:
            The page texts, each followed by a newline, joined into one
            string; ``None`` if the PDF could not be read.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
            # A page without extractable text must not discard the text of
            # all the other pages, so a missing result counts as empty.
            return "".join(
                (page.extract_text() or "") + "\n" for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"Error extracting PDF text: {e}")
            return None

    def parse_measurement_line(self, line):
        """Classify a single report line.

        Args:
            line: one line of extracted report text.

        Returns:
            ``{'type': <pattern name>, 'match': <re.Match>}`` for the first
            pattern that matches, or ``None`` when no pattern matches.
        """
        for pattern_name, pattern in self._LINE_PATTERNS.items():
            match = pattern.search(line)
            if match:
                return {'type': pattern_name, 'match': match}
        return None

    @staticmethod
    def _new_element(element_type, match):
        """Build the record for a freshly opened measurement element."""
        groups = match.groups()
        return {
            'name': groups[0],
            'type': element_type,
            'point_count': int(groups[1]) if len(groups) >= 2 else None,
            # Only the circle pattern has a third (side) group.
            'side': groups[2] if len(groups) >= 3 else None,
            'coordinates': {},
            'statistics': {},
            'tolerances': {}
        }

    @staticmethod
    def _record_value(element, value_type, match):
        """Store a coordinate/diameter/statistics line on *element*.

        Raises:
            ValueError: if a captured token is not a valid number.  The new
                dict is built completely before it is assigned, so nothing is
                stored in that case.
        """
        if value_type == 'coordinate_value':
            element['coordinates'][match.group(1)] = {
                'axis': match.group(2),
                'measured': float(match.group(3)),
                'reference': float(match.group(4)),
                'upper_tol': float(match.group(5)),
                'lower_tol': float(match.group(6)),
                'deviation': float(match.group(7))
            }
        elif value_type == 'diameter':
            element['diameter'] = {
                'measured': float(match.group(1)),
                'reference': float(match.group(2))
            }
        elif value_type == 'statistics':
            element['statistics'] = {
                'std_dev': float(match.group(1)),
                'min_value': float(match.group(2)),
                'max_value': float(match.group(3)),
                'form_error': float(match.group(4))
            }

    def parse_cmm_data(self, text):
        """Parse the whole report text into ``measurement_data``.

        A circle/plane/line header opens a new element; the coordinate,
        diameter and statistics lines that follow are attached to it.  Lines
        containing ``基本座標系`` are recorded as the coordinate-system name and
        the following lines are scanned for datum entries.

        Args:
            text: report text as returned by ``extract_pdf_text``.
        """
        lines = text.split('\n')
        current_element = None

        for i, raw_line in enumerate(lines):
            line = raw_line.strip()
            if not line:
                continue

            parsed_line = self.parse_measurement_line(line)

            if parsed_line:
                line_type = parsed_line['type']
                match = parsed_line['match']

                if line_type in self._ELEMENT_TYPES:
                    # A header closes the element collected so far.
                    if current_element:
                        self.measurement_data.append(current_element)
                    current_element = self._new_element(line_type, match)
                elif current_element:
                    try:
                        self._record_value(current_element, line_type, match)
                    except ValueError:
                        # The numeric character classes also accept tokens
                        # such as "-" or "1.2.3"; skip that one line instead
                        # of aborting the whole parse.
                        print(f"Skipping malformed measurement line: {line}")

            # Check for coordinate system information
            if '基本座標系' in line:
                self.coordinate_system_data['name'] = line
                # Datum entries are looked for in the next nine lines.
                for following in lines[i + 1:i + 10]:
                    datum_line = following.strip()
                    if 'ﾃﾞｰﾀﾑ' in datum_line:
                        self.coordinate_system_data.setdefault('datums', []).append(datum_line)

        # Add the last element
        if current_element:
            self.measurement_data.append(current_element)

    def get_all_elements(self):
        """Return the parsed elements, coordinate-system data and element count."""
        return {
            'measurement_data': self.measurement_data,
            'coordinate_system': self.coordinate_system_data,
            'total_elements': len(self.measurement_data)
        }

    @staticmethod
    def _base_row(element):
        """Return the identifying columns shared by all DataFrame rows."""
        return {
            'Element_Name': element['name'],
            'Type': element['type'],
            'Point_Count': element.get('point_count', 'N/A'),
            'Side': element.get('side', 'N/A')
        }

    @staticmethod
    def _statistics_columns(element):
        """Return the statistics columns of *element* (empty if it has none)."""
        if 'statistics' not in element:
            return {}
        stats = element['statistics']
        return {
            'Std_Dev': stats.get('std_dev'),
            'Min_Value': stats.get('min_value'),
            'Max_Value': stats.get('max_value'),
            'Form_Error': stats.get('form_error')
        }

    @staticmethod
    def _within_tolerance(coord_data):
        """Return True when the deviation lies inside [lower_tol, upper_tol]."""
        return coord_data['lower_tol'] <= coord_data['deviation'] <= coord_data['upper_tol']

    @staticmethod
    def _tolerance_utilization(coord_data):
        """Return |deviation| as a percentage of the larger tolerance magnitude.

        With both tolerances equal to zero the ratio is undefined; the result
        is then 0.0 for a zero deviation and infinity otherwise, instead of a
        ZeroDivisionError.
        """
        limit = max(abs(coord_data['upper_tol']), abs(coord_data['lower_tol']))
        deviation = abs(coord_data['deviation'])
        if limit == 0:
            return 0.0 if deviation == 0 else float('inf')
        return round(deviation / limit * 100, 2)

    def create_summary_dataframe(self) -> pd.DataFrame:
        """Create a DataFrame with one row per measurement element.

        Each coordinate contributes ``<name>_Measured``, ``<name>_Reference``
        and ``<name>_Deviation`` columns to its element's row.
        """
        summary_data = []

        for element in self.measurement_data:
            row = self._base_row(element)

            for coord_name, coord_data in element.get('coordinates', {}).items():
                row[f'{coord_name}_Measured'] = coord_data['measured']
                row[f'{coord_name}_Reference'] = coord_data['reference']
                row[f'{coord_name}_Deviation'] = coord_data['deviation']

            if 'diameter' in element:
                row['Diameter_Measured'] = element['diameter']['measured']
                row['Diameter_Reference'] = element['diameter']['reference']

            row.update(self._statistics_columns(element))
            summary_data.append(row)

        return pd.DataFrame(summary_data)

    def create_detailed_dataframe(self) -> pd.DataFrame:
        """Create a DataFrame with one row per coordinate of every element.

        Elements without coordinates still produce a single row that holds
        only their element-level columns.
        """
        detailed_data = []

        for element in self.measurement_data:
            base_info = self._base_row(element)
            base_info.update(self._statistics_columns(element))

            if 'diameter' in element:
                diameter = element['diameter']
                base_info.update({
                    'Diameter_Measured': diameter['measured'],
                    'Diameter_Reference': diameter['reference'],
                    'Diameter_Deviation': diameter['measured'] - diameter['reference']
                })

            coordinates = element.get('coordinates')
            if not coordinates:
                detailed_data.append(base_info)
                continue

            for coord_name, coord_data in coordinates.items():
                row = base_info.copy()
                row.update({
                    'Coordinate_Name': coord_name,
                    'Axis': coord_data['axis'],
                    'Measured_Value': coord_data['measured'],
                    'Reference_Value': coord_data['reference'],
                    'Upper_Tolerance': coord_data['upper_tol'],
                    'Lower_Tolerance': coord_data['lower_tol'],
                    'Deviation': coord_data['deviation'],
                    'Within_Tolerance': self._within_tolerance(coord_data)
                })
                detailed_data.append(row)

        return pd.DataFrame(detailed_data)

    def create_tolerance_dataframe(self) -> pd.DataFrame:
        """Create the tolerance-analysis table, one row per coordinate.

        The columns are always present, even when no coordinate was parsed,
        so callers can filter on ``Status`` without hitting a KeyError.
        """
        tolerance_data = []

        for element in self.measurement_data:
            for coord_name, coord_data in element.get('coordinates', {}).items():
                within_tolerance = self._within_tolerance(coord_data)
                tolerance_data.append({
                    'Element_Name': element['name'],
                    'Element_Type': element['type'],
                    'Coordinate': coord_name,
                    'Axis': coord_data['axis'],
                    'Deviation': coord_data['deviation'],
                    'Upper_Tolerance': coord_data['upper_tol'],
                    'Lower_Tolerance': coord_data['lower_tol'],
                    'Within_Tolerance': within_tolerance,
                    'Tolerance_Utilization_%': self._tolerance_utilization(coord_data),
                    'Status': 'PASS' if within_tolerance else 'FAIL'
                })

        return pd.DataFrame(tolerance_data, columns=self._TOLERANCE_COLUMNS)

    def export_to_csv(self, export_type='summary', filename_prefix='CMM_Data'):
        """Export measurement data to CSV file(s) and download them in Colab.

        The coordinate-system table is exported as well whenever
        coordinate-system data was parsed, regardless of ``export_type``.

        Args:
            export_type: 'summary', 'detailed', or 'both'
            filename_prefix: prefix for the CSV filename

        Returns:
            List of the CSV filenames that were written.
        """
        from google.colab import files
        import datetime

        # Generate timestamp for unique filenames
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        exported_files = []

        tables = []
        if export_type in ('summary', 'both'):
            tables.append(('Summary', self.create_summary_dataframe))
        if export_type in ('detailed', 'both'):
            tables.append(('Detailed', self.create_detailed_dataframe))

        for label, build_dataframe in tables:
            df = build_dataframe()
            csv_filename = f"{filename_prefix}_{label}_{timestamp}.csv"
            # utf-8-sig writes a BOM in front of the (Japanese) text.
            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')

            print(f"{label} data exported to: {csv_filename}")
            print(f"{label} contains {len(df)} rows and {len(df.columns)} columns")

            files.download(csv_filename)
            exported_files.append(csv_filename)

        # Export coordinate system info if available
        if self.coordinate_system_data:
            coord_sys_data = [{
                'Type': 'Coordinate System',
                'Name': self.coordinate_system_data.get('name', 'N/A'),
                'Description': 'Primary coordinate system'
            }]

            for number, datum in enumerate(self.coordinate_system_data.get('datums', []), start=1):
                coord_sys_data.append({
                    'Type': 'Datum',
                    'Name': f'Datum_{number}',
                    'Description': datum
                })

            coord_sys_df = pd.DataFrame(coord_sys_data)
            coord_sys_filename = f"{filename_prefix}_CoordinateSystem_{timestamp}.csv"
            coord_sys_df.to_csv(coord_sys_filename, index=False, encoding='utf-8-sig')

            print(f"Coordinate system data exported to: {coord_sys_filename}")
            files.download(coord_sys_filename)
            exported_files.append(coord_sys_filename)

        return exported_files

    def export_tolerance_analysis(self, filename_prefix='CMM_ToleranceAnalysis'):
        """Export the tolerance analysis to CSV and download it in Colab.

        Args:
            filename_prefix: prefix for the CSV filename

        Returns:
            Name of the CSV file that was written.
        """
        from google.colab import files
        import datetime

        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

        tolerance_df = self.create_tolerance_dataframe()
        tolerance_filename = f"{filename_prefix}_{timestamp}.csv"
        tolerance_df.to_csv(tolerance_filename, index=False, encoding='utf-8-sig')

        pass_count = int((tolerance_df['Status'] == 'PASS').sum())
        fail_count = int((tolerance_df['Status'] == 'FAIL').sum())

        print(f"Tolerance analysis exported to: {tolerance_filename}")
        print(f"Total measurements: {len(tolerance_df)}")
        print(f"PASS: {pass_count}")
        print(f"FAIL: {fail_count}")

        files.download(tolerance_filename)
        return tolerance_filename

    def display_results(self):
        """Print an overview of the parsed data.

        Returns:
            The summary DataFrame whose first ten rows were printed.
        """
        print("\n=== CMM Measurement Data Analysis ===")
        print(f"Total elements found: {len(self.measurement_data)}")

        # Count elements per type, in order of first appearance.
        type_counts = {}
        for element in self.measurement_data:
            element_type = element['type']
            type_counts[element_type] = type_counts.get(element_type, 0) + 1

        print("\nElement types:")
        for element_type, count in type_counts.items():
            print(f"  {element_type}: {count}")

        # Display coordinate system info
        if self.coordinate_system_data:
            print("\nCoordinate System:")
            print(f"  Name: {self.coordinate_system_data.get('name', 'N/A')}")
            if 'datums' in self.coordinate_system_data:
                print(f"  Datums: {len(self.coordinate_system_data['datums'])}")

        # Create and display summary DataFrame
        df = self.create_summary_dataframe()
        print("\n=== Summary DataFrame ===")
        print(df.head(10))  # Show first 10 rows

        # Display export options
        print("\n=== Export Options ===")
        print("Available export functions:")
        print("1. parser.export_to_csv('summary') - Export summary data")
        print("2. parser.export_to_csv('detailed') - Export detailed data")
        print("3. parser.export_to_csv('both') - Export both summary and detailed")
        print("4. parser.export_tolerance_analysis() - Export tolerance analysis")

        return df

# Main execution function
def main():
    """Run the interactive CMM parsing workflow.

    Uploads a PDF, extracts its text, parses it, prints an overview and
    optionally exports CSV files chosen by the user.

    Returns:
        A dict with the keys 'parser', 'dataframe', 'all_elements' and
        'raw_text', or ``None`` when no file was uploaded or no text could be
        extracted.
    """
    import unicodedata

    print("CMM Data Parser - Carl Zeiss CALYPSO Report Analyzer")
    print("=" * 50)

    # Initialize parser
    parser = CMMDataParser()

    # Upload file
    file_data = parser.upload_file()
    if not file_data:
        return None

    filename, file_content = file_data

    # Extract text from PDF
    print("Extracting text from PDF...")
    text = parser.extract_pdf_text(file_content)
    if not text:
        print("Failed to extract text from PDF!")
        return None

    # Parse the measurement data
    print("Parsing measurement data...")
    parser.parse_cmm_data(text)

    # Display results
    df = parser.display_results()

    # Return all elements
    all_elements = parser.get_all_elements()

    print("\n=== Parser Complete ===")
    print(f"Successfully parsed {all_elements['total_elements']} measurement elements")

    def ask(prompt):
        # The prompts are Japanese and show full-width digits, so answers may
        # be typed with an IME as full-width characters ("１", "ｙ").  NFKC
        # folds those to their ASCII forms before they are compared.
        return unicodedata.normalize('NFKC', input(prompt)).strip()

    # Ask user if they want to export data
    print("\n=== Export Data ===")
    export_choice = ask("データをcsvにエクスポートしますか?「はい」はy、「いいえ」はnを入力ください。 (y/n): ").lower()

    if export_choice in ('y', 'yes'):
        print("\nExport options:")
        print("1. 要約のみ")
        print("2. 詳細データ")
        print("3. 要約と詳細データ")
        print("4. 公差解析")
        print("5. 全てのエクスポート")

        choice = ask("データ出力オプションを１から５の番号で入力の上、エンターキーを押してください: ")

        # Export steps to run for each menu entry, in order.
        export_steps = {
            '1': (lambda: parser.export_to_csv('summary'),),
            '2': (lambda: parser.export_to_csv('detailed'),),
            '3': (lambda: parser.export_to_csv('both'),),
            '4': (parser.export_tolerance_analysis,),
            '5': (lambda: parser.export_to_csv('both'), parser.export_tolerance_analysis),
        }

        try:
            steps = export_steps.get(choice)
            if steps is None:
                print("Invalid choice. Skipping export.")
            else:
                for step in steps:
                    step()
        except Exception as e:
            # A failed export must not discard the parsed result.
            print(f"Error during export: {e}")

    return {
        'parser': parser,
        'dataframe': df,
        'all_elements': all_elements,
        'raw_text': text
    }

def export_example_usage():
    """Print sample commands for exporting data once main() has been run."""
    examples = (
        ("# Export summary data only:", "result['parser'].export_to_csv('summary')"),
        ("# Export detailed data only:", "result['parser'].export_to_csv('detailed')"),
        ("# Export both summary and detailed:", "result['parser'].export_to_csv('both')"),
        ("# Export tolerance analysis:", "result['parser'].export_tolerance_analysis()"),
        ("# Custom filename prefix:", "result['parser'].export_to_csv('both', 'MyProject_CMM')"),
    )

    print("\n=== Export Function Examples ===")
    print("After running main(), you can use these commands:")
    print()
    # Each example is a comment line, the command itself, then a blank line.
    for heading, command in examples:
        print(heading)
        print(command)
        print()

# Run the parser when this cell is executed
if __name__ == "__main__":
    # Install required packages if not already installed

    try:
        import PyPDF2
    except ImportError:
        print("Installing PyPDF2...")
        !pip install PyPDF2
        import PyPDF2

    # Show export examples
    export_example_usage()

    # Run the main function
    result = main()

    # Store result in a global variable for easy access
    if result:
        globals()['cmm_result'] = result
        print(f"\n=== Access Your Data ===")
        print("Your parsed data is stored in 'cmm_result'")
        print("Access it using:")
        print("- cmm_result['parser'] - Parser object with all methods")
        print("- cmm_result['dataframe'] - Summary DataFrame")
        print("- cmm_result['all_elements'] - Complete parsed data")
        print("- cmm_result['raw_text'] - Extracted PDF text")


=== Export Function Examples ===
After running main(), you can use these commands:

# Export summary data only:
result['parser'].export_to_csv('summary')

# Export detailed data only:
result['parser'].export_to_csv('detailed')

# Export both summary and detailed:
result['parser'].export_to_csv('both')

# Export tolerance analysis:
result['parser'].export_tolerance_analysis()

# Custom filename prefix:
result['parser'].export_to_csv('both', 'MyProject_CMM')

CMM Data Parser - Carl Zeiss CALYPSO Report Analyzer
Please upload your CMM measurement file (PDF format):


Saving D001_elements.pdf to D001_elements.pdf
File 'D001_elements.pdf' uploaded successfully!
Extracting text from PDF...
Parsing measurement data...

=== CMM Measurement Data Analysis ===
Total elements found: 71

Element types:
  plane: 4
  circle: 63
  line: 4

Coordinate System:
  Name: 20190521支持板D          基本座標系
  Datums: 5

=== Summary DataFrame ===
  Element_Name    Type  Point_Count  Side  Std_Dev  Min_Value  Max_Value  \
0          平面1   plane            6  None   0.0700    -0.0744     0.0602   
1        基準円28  circle            8    内側   0.0071    -0.0088     0.0116   
2        基準円27  circle            8    内側   0.0047    -0.0062     0.0062   
3          ロハ線    line            3  None   0.0070    -0.0057     0.0029   
4           円1  circle            4    内側   0.0127    -0.0064     0.0064   
5           円2  circle            4    内側   0.0079    -0.0040     0.0039   
6           円3  circle            4    内側   0.0163    -0.0082     0.0082   
7           円4  circle           

KeyboardInterrupt: Interrupted by user