In [59]:
import io
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import ImageReader
from reportlab.lib import colors
from datetime import datetime
from reportlab.platypus import Table, TableStyle

In [60]:
# Version tag of the AutoStats report generator; it is printed on the report cover page.
auto_version = "alpha_1.0"

def generate_pdf_report(df, output_file="report.pdf", df_name="Dataset", logo_path="logo.png"):
    """
    Generate a PDF report describing the numeric columns of ``df``.

    Layout:
    - Cover page with logo, metadata, and library version info
    - One page per numeric column: a statistics table followed by either a
      bar chart (fewer than 10 unique values) or a horizontal box plot with
      staggered outlier annotations (10 or more unique values)
    - A final summary page

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. Non-numeric columns are skipped in the per-column pages.
    output_file : str
        Path of the PDF file to write.
    df_name : str
        Display name printed on the cover page.
    logo_path : str or None
        Image drawn on the cover page. If it cannot be loaded, the text
        "[Logo Not Found]" is drawn instead. A falsy value omits the logo.

    Returns
    -------
    None
        The report is written to ``output_file`` and a confirmation is printed.
    """
    # Function-scope import: only needed to read the installed reportlab version.
    import reportlab

    c = canvas.Canvas(output_file, pagesize=letter)
    high_cardinality_cols = []
    low_missing_cols = []
    version_info = {
        'pandas': pd.__version__,
        'numpy': np.__version__,
        # Report the version that is actually installed rather than a fixed string.
        'reportlab': reportlab.__version__,
    }
    total_rows = len(df)
    # Take a single timestamp so the date and time lines can never disagree.
    generated_at = datetime.now()

    # --- Cover page ---
    c.setFont("Helvetica-Bold", 18)
    c.drawCentredString(300, 750, "Data Analysis Report")
    c.drawCentredString(300, 720, f"Generated using AutoStats {auto_version}")

    if logo_path:
        try:
            logo = ImageReader(logo_path)
            c.drawImage(logo, 225, 560, width=150, height=150, preserveAspectRatio=True)
        except Exception:
            # Best effort: a missing or unreadable logo must not abort the report.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            c.drawString(250, 570, "[Logo Not Found]")

    # Metadata
    c.setFont("Helvetica", 12)
    metadata = [
        f"DataFrame Name: {df_name}",
        f"Analysis Date: {generated_at.strftime('%Y-%m-%d')}",
        f"Analysis Time: {generated_at.strftime('%H:%M:%S')}",
        "",
        "Library Versions:",
        *[f"{lib}: {ver}" for lib, ver in version_info.items()]
    ]

    y_pos = 500
    for line in metadata:
        c.drawString(100, y_pos, line)
        y_pos -= 20

    c.showPage()

    # --- One page per numeric column ---
    for column in df.columns:
        series = df[column]
        if not pd.api.types.is_numeric_dtype(series):
            continue

        c.setFont("Helvetica-Bold", 16)
        c.drawString(100, 750, "Column Analysis")
        c.setFont("Helvetica-Bold", 14)
        c.drawString(100, 725, f"Column: {column}")

        y_position = 690

        # Basic statistics
        stats = series.describe()
        nulls = series.isna().sum()
        uniques = series.nunique()
        missingness = nulls / total_rows if total_rows > 0 else 0
        is_high_cardinality = uniques >= 10

        if is_high_cardinality:
            high_cardinality_cols.append(column)
        if missingness < 0.5:
            low_missing_cols.append(column)

        # Statistics table: full describe() output for high-cardinality columns,
        # only count/mean/std otherwise.
        stats_data = [
            ["Statistic", "Value"],
            ["Missing Values", nulls],
            ["Missingness", f"{missingness:.2%}"],
            ["Unique Values", uniques]
        ]
        stats_data += [
            [stat.capitalize(), f"{value:.2f}"]
            for stat, value in stats.items()
            if is_high_cardinality or stat in ('count', 'mean', 'std')
        ]

        table = Table(stats_data, colWidths=[120, 80])
        table.setStyle(TableStyle([
            ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
            ('BACKGROUND', (0,0), (-1,0), colors.lightgrey),
            ('TEXTCOLOR', (0,0), (-1,0), colors.black),
            ('ALIGN', (0,0), (-1,-1), 'LEFT'),
            ('FONTSIZE', (0,0), (-1,-1), 10),
            ('BOX', (0,0), (-1,-1), 1, colors.black),
            ('GRID', (0,0), (-1,-1), 0.5, colors.grey)
        ]))

        table.wrapOn(c, 400, 200)
        table.drawOn(c, 100, y_position - table._height - 20)
        y_position -= table._height + 40

        # --- Visualization ---
        fig = plt.figure(figsize=(8, 4))
        outliers_text = ""
        buffer = io.BytesIO()
        try:
            if not is_high_cardinality:
                value_counts = series.value_counts().sort_index()
                bars = plt.bar(value_counts.index.astype(str), value_counts.values)

                # Print the count on top of each bar.
                for bar in bars:
                    height = bar.get_height()
                    plt.text(bar.get_x() + bar.get_width()/2., height,
                             f'{int(height)}',
                             ha='center', va='bottom', fontsize=8)

                plt.title(f'Value Distribution: {column}')
            else:
                clean_data = series.dropna()
                if not clean_data.empty:
                    # Outliers are values outside the 1.5 * IQR fences.
                    q1 = clean_data.quantile(0.25)
                    q3 = clean_data.quantile(0.75)
                    iqr = q3 - q1
                    lower_bound = q1 - 1.5 * iqr
                    upper_bound = q3 + 1.5 * iqr
                    outliers = clean_data[(clean_data < lower_bound) | (clean_data > upper_bound)]

                    # Horizontal box plot
                    plt.boxplot(clean_data, vert=False)
                    plt.title(f'Box Plot: {column}\n({len(outliers)} Outliers Detected)')
                    plt.yticks([])

                    if not outliers.empty:
                        # Iterate positionally over (label, value) pairs: a label lookup
                        # such as outliers[idx] returns a Series when index labels repeat.
                        displayed_outliers = [
                            f"Index: {idx}, Value: {val:.2f}"
                            for idx, val in outliers.iloc[:15].items()
                        ]
                        outliers_text = "Key Outliers:\n" + "\n".join(displayed_outliers)

                        if len(outliers) > 15:
                            outliers_text += f"\n(Showing first 15 of {len(outliers)} outliers)"

                        # Label every outlier with its index, alternating between two
                        # heights so neighbouring labels are staggered.
                        for i, (idx, val) in enumerate(outliers.items()):
                            y_offset = 1.1 + (0.1 * (i % 2))
                            plt.annotate(
                                f'{idx}',
                                xy=(val, 1),
                                xytext=(val, y_offset),
                                textcoords='data',
                                arrowprops=dict(arrowstyle='->', color='red', lw=0.5),
                                fontsize=6,
                                ha='center',
                                va='bottom',
                                rotation=45
                            )
                else:
                    plt.text(0.5, 0.5, 'No valid data points', ha='center', va='center')

            plt.tight_layout()

            # Render the plot into an in-memory PNG.
            fig.savefig(buffer, format='png', bbox_inches='tight', dpi=150)
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

        buffer.seek(0)
        img = ImageReader(buffer)

        # Scale the image to fit 400pt wide and the space left above y=100.
        max_img_height = y_position - 100
        img_width, img_height = img.getSize()
        scaling_factor = min(400 / img_width, max_img_height / img_height)
        scaled_width = img_width * scaling_factor
        scaled_height = img_height * scaling_factor

        img_y = y_position - scaled_height
        c.drawImage(img, 100, img_y, width=scaled_width, height=scaled_height)

        # Outlier listing under the plot (only produced for box plots).
        if outliers_text:
            text_y = img_y - 20
            text = c.beginText(100, text_y)
            text.setFont("Helvetica", 8)
            text.setLeading(9)
            for line in outliers_text.split('\n'):
                text.textLine(line)
                # Stop once the line just written is below y=100.
                if text_y < 100:
                    break
                text_y -= 9
            c.drawText(text)

        c.showPage()

    # --- Summary page ---
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, 750, "Summary Statistics")

    summary_data = [
        ["Metric", "Count"],
        ["Total Columns", len(df.columns)],
        ["High-Cardinality Columns", len(high_cardinality_cols)],
        ["Acceptable-Missingness Columns", len(low_missing_cols)]
    ]

    summary_table = Table(summary_data, colWidths=[200, 100])
    summary_table.setStyle(TableStyle([
        ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
        ('BACKGROUND', (0,0), (-1,0), colors.lightgrey),
        ('GRID', (0,0), (-1,-1), 1, colors.black)
    ]))
    summary_table.wrapOn(c, 400, 200)
    summary_table.drawOn(c, 100, 650)

    c.showPage()
    c.save()
    print(f"Report generated: {output_file}")

In [61]:
# df = pd.read_excel('raw.xlsx', 'raw')
# df['side'] = df['side'].astype('str')
# generate_pdf_report(df, df_name='berlin_edema_data')

In [62]:
# Load the 'raw' sheet, show the dtype summary of the 'side' column,
# then store that column as strings.
df = pd.read_excel('raw.xlsx', sheet_name='raw')
df['side'].info()
df['side'] = df['side'].astype(str)

<class 'pandas.core.series.Series'>
RangeIndex: 45 entries, 0 to 44
Series name: side
Non-Null Count  Dtype 
--------------  ----- 
45 non-null     object
dtypes: object(1)
memory usage: 488.0+ bytes


In [63]:
# Frequency of each distinct value in the 'side' column.
df["side"].value_counts()

side
OS    25
OD    20
Name: count, dtype: int64

In [None]:
# Version tag of the AutoStats report generator; it is printed on the report cover page.
auto_version = "alpha_1.0"

def _pdf_coerce_numeric_columns(df):
    """Return a copy of ``df`` in which columns that parse as numbers are numeric.

    A column is replaced when ``pd.to_numeric(..., errors='coerce')`` yields at
    least one non-NaN value; entries that fail to parse become NaN in that case.
    The caller's DataFrame is left untouched.
    """
    result = df.copy()
    for col in result.columns:
        converted = pd.to_numeric(result[col], errors='coerce')
        if not converted.isna().all():
            result[col] = converted
    return result


def _pdf_draw_cover(c, df_name, logo_path, version_info):
    """Draw the cover page (title, optional logo, metadata) and finish the page."""
    c.setFont("Helvetica-Bold", 18)
    c.drawCentredString(300, 750, "Data Analysis Report")
    c.drawCentredString(300, 720, f"Generated using AutoStats {auto_version}")

    if logo_path:
        try:
            logo = ImageReader(logo_path)
            c.drawImage(logo, 225, 560, width=150, height=150, preserveAspectRatio=True)
        except Exception:
            # Best effort: a missing or unreadable logo must not abort the report.
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            c.drawString(250, 570, "[Logo Not Found]")

    # Take a single timestamp so the date and time lines can never disagree.
    generated_at = datetime.now()
    c.setFont("Helvetica", 12)
    metadata = [
        f"DataFrame Name: {df_name}",
        f"Analysis Date: {generated_at.strftime('%Y-%m-%d')}",
        f"Analysis Time: {generated_at.strftime('%H:%M:%S')}",
        "",
        "Library Versions:",
        *[f"{lib}: {ver}" for lib, ver in version_info.items()]
    ]

    y_pos = 500
    for line in metadata:
        c.drawString(100, y_pos, line)
        y_pos -= 20

    c.showPage()


def _pdf_draw_page_header(c, column):
    """Draw the 'Column Analysis' heading and the column name at the top of a page."""
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, 750, "Column Analysis")
    c.setFont("Helvetica-Bold", 14)
    c.drawString(100, 725, f"Column: {column}")


def _pdf_draw_stats_table(c, stats_data, y_position):
    """Draw the two-column statistics table below ``y_position``; return the y left for content under it."""
    table = Table(stats_data, colWidths=[120, 80])
    table.setStyle(TableStyle([
        ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
        ('BACKGROUND', (0,0), (-1,0), colors.lightgrey),
        ('TEXTCOLOR', (0,0), (-1,0), colors.black),
        ('ALIGN', (0,0), (-1,-1), 'LEFT'),
        ('FONTSIZE', (0,0), (-1,-1), 10),
        ('BOX', (0,0), (-1,-1), 1, colors.black),
        ('GRID', (0,0), (-1,-1), 0.5, colors.grey)
    ]))
    table.wrapOn(c, 400, 200)
    table.drawOn(c, 100, y_position - table._height - 20)
    return y_position - (table._height + 40)


def _pdf_embed_figure(c, fig, y_position):
    """Render ``fig`` to an in-memory PNG, draw it below ``y_position``, and return the image's bottom y."""
    buffer = io.BytesIO()
    try:
        fig.tight_layout()
        fig.savefig(buffer, format='png', bbox_inches='tight', dpi=150)
    finally:
        # Always release the figure, even if layout or saving failed.
        plt.close(fig)
    buffer.seek(0)
    img = ImageReader(buffer)

    # Scale to fit 400pt wide and the space left above y=100.
    max_img_height = y_position - 100
    img_width, img_height = img.getSize()
    scaling_factor = min(400 / img_width, max_img_height / img_height)
    scaled_width = img_width * scaling_factor
    scaled_height = img_height * scaling_factor

    img_y = y_position - scaled_height
    c.drawImage(img, 100, img_y, width=scaled_width, height=scaled_height)
    return img_y


def _pdf_plot_box_with_outliers(column, clean_data, max_listed=15):
    """Draw a horizontal box plot on the current figure and return the outlier listing text ('' if none).

    Outliers are the values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
    """
    q1 = clean_data.quantile(0.25)
    q3 = clean_data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    outliers = clean_data[(clean_data < lower_bound) | (clean_data > upper_bound)]

    plt.boxplot(clean_data, vert=False)
    plt.title(f'Box Plot: {column}\n({len(outliers)} Outliers Detected)')
    plt.yticks([])

    if outliers.empty:
        return ""

    # Iterate positionally over (label, value) pairs: a label lookup such as
    # outliers[idx] returns a Series when index labels repeat.
    listed = [
        f"Index: {idx}, Value: {val:.2f}"
        for idx, val in outliers.iloc[:max_listed].items()
    ]
    outliers_text = "Key Outliers:\n" + "\n".join(listed)
    if len(outliers) > max_listed:
        outliers_text += f"\n(Showing first {max_listed} of {len(outliers)} outliers)"

    # Label every outlier with its index, alternating between two heights so
    # neighbouring labels are staggered.
    for i, (idx, val) in enumerate(outliers.items()):
        y_offset = 1.1 + (0.1 * (i % 2))
        plt.annotate(
            f'{idx}',
            xy=(val, 1),
            xytext=(val, y_offset),
            textcoords='data',
            arrowprops=dict(arrowstyle='->', color='red', lw=0.5),
            fontsize=6,
            ha='center',
            va='bottom',
            rotation=45
        )
    return outliers_text


def _pdf_draw_outlier_text(c, outliers_text, img_y):
    """Write the outlier listing under the image whose bottom edge is at ``img_y``."""
    text_y = img_y - 20
    text = c.beginText(100, text_y)
    text.setFont("Helvetica", 8)
    text.setLeading(9)
    for line in outliers_text.split('\n'):
        text.textLine(line)
        # Stop once the line just written is below y=100.
        if text_y < 100:
            break
        text_y -= 9
    c.drawText(text)


def _pdf_categorical_page(c, series, column, max_values=10):
    """Draw one page for a categorical/discrete column: counts table plus a bar chart."""
    _pdf_draw_page_header(c, column)

    total_count = len(series)
    nulls = series.isna().sum()
    missingness = nulls / total_count if total_count > 0 else 0
    value_counts = series.value_counts(dropna=True)
    total_non_missing = value_counts.sum()

    stats_data = [
        ["Statistic", "Value"],
        ["Missing Values", nulls],
        ["Missingness", f"{missingness:.2%}"],
        ["Total Count", total_count],
        ["Non-Missing Count", total_non_missing],
        ["Unique Values", len(value_counts)],
        ["Value Counts", ""]
    ]
    # List only the most frequent values so the table stays on the page.
    for value, count in value_counts.head(max_values).items():
        percentage = (count / total_non_missing) * 100 if total_non_missing != 0 else 0
        stats_data.append([f"  {value}", f"{count} ({percentage:.1f}%)"])
    if len(value_counts) > max_values:
        stats_data.append([f"  ... (showing top {max_values})", f"{len(value_counts) - max_values} more"])

    y_position = _pdf_draw_stats_table(c, stats_data, 690)

    # Count plot over all distinct values (same counts as the table above).
    fig = plt.figure(figsize=(8,4))
    try:
        bars = plt.bar(value_counts.index.astype(str), value_counts.values)
        plt.title(f'Value Distribution: {column}')
        plt.xticks(rotation=45)
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                     f'{int(height)}',
                     ha='center', va='bottom', fontsize=8)
    except Exception:
        plt.close(fig)
        raise

    _pdf_embed_figure(c, fig, y_position)
    c.showPage()


def _pdf_continuous_page(c, series, column):
    """Draw one page for a continuous column: describe() table, box plot and outlier listing."""
    _pdf_draw_page_header(c, column)

    total_count = len(series)
    nulls = series.isna().sum()
    missingness = nulls / total_count if total_count > 0 else 0

    stats_data = [
        ["Statistic", "Value"],
        ["Missing Values", nulls],
        ["Missingness", f"{missingness:.2%}"],
        ["Unique Values", series.nunique()]
    ]
    stats_data += [[stat.capitalize(), f"{value:.2f}"]
                   for stat, value in series.describe().items()]

    y_position = _pdf_draw_stats_table(c, stats_data, 690)

    fig = plt.figure(figsize=(8,4))
    outliers_text = ""
    try:
        clean_data = series.dropna()
        if not clean_data.empty:
            outliers_text = _pdf_plot_box_with_outliers(column, clean_data)
        else:
            plt.text(0.5, 0.5, 'No valid data points', ha='center', va='center')
    except Exception:
        plt.close(fig)
        raise

    img_y = _pdf_embed_figure(c, fig, y_position)
    if outliers_text:
        _pdf_draw_outlier_text(c, outliers_text, img_y)
    c.showPage()


def generate_pdf_report_v2(df, output_file="report.pdf", df_name="Dataset", logo_path="logo.png"):
    """
    Generate a PDF report covering every column of ``df``.

    Layout:
    - Cover page with logo, metadata, and library version info
    - Columns are categorised as categorical (non-numeric), discrete
      (numeric, fewer than 10 unique values) or continuous (numeric, 10 or more)
    - Box plot with outlier listing for continuous columns, count plot for the others
    - A statistics table suited to each column type
    - A final summary page with the number of columns per category

    Columns are first converted to numeric where at least one value parses as
    a number; values that fail to parse in such a column become NaN and are
    reported as missing. The conversion is done on a copy, so the DataFrame
    passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse.
    output_file : str
        Path of the PDF file to write.
    df_name : str
        Display name printed on the cover page.
    logo_path : str or None
        Image drawn on the cover page. If it cannot be loaded, the text
        "[Logo Not Found]" is drawn instead. A falsy value omits the logo.

    Returns
    -------
    None
        The report is written to ``output_file`` and a confirmation is printed.
    """
    # Function-scope import: only needed to read the installed reportlab version.
    import reportlab

    # Work on a converted copy so the caller's DataFrame is never mutated.
    df = _pdf_coerce_numeric_columns(df)
    total_rows = len(df)

    c = canvas.Canvas(output_file, pagesize=letter)

    # Categorize columns
    categorical_cols = []
    discrete_cols = []
    continuous_cols = []
    for col in df.columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            categorical_cols.append(col)
        elif df[col].nunique() < 10:
            discrete_cols.append(col)
        else:
            continuous_cols.append(col)

    # Columns with less than 50% missing values.
    low_missing_cols = [
        col for col in df.columns
        if (df[col].isna().sum() / total_rows if total_rows > 0 else 0) < 0.5
    ]
    version_info = {
        'pandas': pd.__version__,
        'numpy': np.__version__,
        # Report the version that is actually installed rather than a fixed string.
        'reportlab': reportlab.__version__,
    }

    _pdf_draw_cover(c, df_name, logo_path, version_info)

    # Main content: one page per column, in DataFrame order.
    for column in df.columns:
        if column in categorical_cols or column in discrete_cols:
            _pdf_categorical_page(c, df[column], column)
        else:
            _pdf_continuous_page(c, df[column], column)

    # Summary page
    c.setFont("Helvetica-Bold", 16)
    c.drawString(100, 750, "Summary Statistics")

    summary_data = [
        ["Metric", "Count"],
        ["Total Columns", len(df.columns)],
        ["Categorical Columns", len(categorical_cols)],
        ['Discrete columns', len(discrete_cols)],
        ['Continuous columns', len(continuous_cols)],
        ["Acceptable-Missingness Columns", len(low_missing_cols)]
    ]

    summary_table = Table(summary_data, colWidths=[200, 100])
    summary_table.setStyle(TableStyle([
        ('FONTNAME', (0,0), (-1,0), 'Helvetica-Bold'),
        ('BACKGROUND', (0,0), (-1,0), colors.lightgrey),
        ('GRID', (0,0), (-1,-1), 1, colors.black)
    ]))
    summary_table.wrapOn(c, 400, 200)
    summary_table.drawOn(c, 100, 600)

    c.showPage()
    c.save()
    print(f"Report generated: {output_file}")


In [79]:
# Build the v2 report for the loaded DataFrame and write it to report3.pdf.
generate_pdf_report_v2(df, output_file="report3.pdf")

Report generated: report3.pdf
