In [19]:
!pip install fpdf



# Importing required libraries


In [20]:

import tempfile
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fpdf import FPDF

# Function to perform the following tasks
1.   List all the columns with missing values.
2.   Categorize the columns by data type and print them.
3.   List all duplicate columns: a) remove them, b) print the columns before and after.
4.   List the constant columns: a) remove them, b) print the columns before and after.
5.   Create box plots to visualise the outliers of all the numeric columns.
6.   Create charts for any 6 columns to show their distributions.







In [37]:
class PDFReport(FPDF):
    """FPDF document that carries a fixed report title and a page-number footer.

    ``header`` and ``footer`` override the FPDF hooks of the same name; per the
    FPDF documentation the library invokes them itself for every page, so they
    are not meant to be called directly.
    """

    def header(self):
        """Write the centred, bold report title followed by a 5-unit gap."""
        title = "Data Analysis Report"
        self.set_font("Arial", size=12, style="B")
        self.cell(0, 10, title, ln=True, align="C")
        self.ln(5)

    def footer(self):
        """Write the centred ``Page N`` label near the bottom of the page."""
        page_label = f"Page {self.page_no()}"
        # Negative y is measured from the bottom edge of the page in FPDF.
        self.set_y(-15)
        self.set_font("Arial", size=8)
        self.cell(0, 10, page_label, align="C")

def _pdf_text(value):
    """Return ``value`` as a string limited to Latin-1 characters.

    The core "Arial" font used by the report can only encode Latin-1, so any
    other character is replaced with ``?`` instead of crashing the PDF writer.
    """
    return str(value).encode("latin-1", "replace").decode("latin-1")


def _join_columns(columns):
    """Format an iterable of column labels (not necessarily strings) as ``a, b, c``."""
    return ", ".join(map(str, columns))


def _section_heading(pdf, title):
    """Write a bold 12pt section heading and switch back to the 10pt body font."""
    pdf.set_font("Arial", style="B", size=12)
    pdf.cell(0, 10, title, ln=True)
    pdf.set_font("Arial", size=10)


def _body_text(pdf, text):
    """Write body text that wraps at the right margin instead of running off the page."""
    pdf.multi_cell(0, 10, _pdf_text(text), align="L")


def _add_chart(pdf, image_path, title, caption, draw):
    """Render one chart to ``image_path`` and place it under ``caption`` in the PDF.

    ``draw`` is a callable that receives the matplotlib ``Axes`` and plots on it.
    The figure is always closed, even when plotting or saving fails.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        draw(ax)
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(image_path)
    finally:
        plt.close(fig)
    pdf.cell(0, 10, _pdf_text(caption), ln=True)
    # Omitting ``y`` uses FPDF's flowing mode: the image is placed at the current
    # position and the cursor is advanced past it, so no manual spacing is needed.
    pdf.image(str(image_path), x=15, w=170)


def gennerate_report(data_path, output_file, random_state=None):
    """Analyse a CSV file and write the findings to a PDF report.

    The report contains six sections: columns with missing values, columns
    grouped by data type, duplicate columns (removed), constant columns
    (removed), a box plot per numeric column, and a distribution chart for up
    to six randomly chosen columns.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file, passed to ``pandas.read_csv``.
    output_file : str
        Destination of the generated PDF.
    random_state : int, optional
        Seed for the random choice of columns in section 6. The default
        ``None`` keeps the selection unseeded.

    Returns
    -------
    None
        The PDF is written to ``output_file`` and a confirmation is printed.
    """
    df = pd.read_csv(data_path)

    pdf = PDFReport()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Section 1: Columns with Missing Values
    _section_heading(pdf, "1. Columns with Missing Values")
    missing_values = df.isnull().sum()
    missing_columns = missing_values[missing_values > 0]
    if missing_columns.empty:
        _body_text(pdf, "No missing values found.")
    else:
        for col, val in missing_columns.items():
            _body_text(pdf, f"{col}: {val} missing values")
    pdf.ln(10)

    # Section 2: Categorized Columns
    # "number" covers every numeric dtype (not only int64/float64); everything
    # else is reported as categorical, matching how section 6 charts it.
    _section_heading(pdf, "2. Categorized Columns")
    numerical_columns = df.select_dtypes(include="number").columns.tolist()
    categorical_columns = df.select_dtypes(exclude="number").columns.tolist()
    pdf.cell(0, 10, "Numerical Columns:", ln=True)
    _body_text(pdf, _join_columns(numerical_columns) if numerical_columns else "None")
    pdf.cell(0, 10, "Categorical Columns:", ln=True)
    _body_text(pdf, _join_columns(categorical_columns) if categorical_columns else "None")
    pdf.ln(10)

    # Section 3: Columns with Duplicates
    # Duplicates are detected by content (identical values in every row) and
    # removed with the same positional mask, so exactly the detected columns go.
    if df.empty:
        # DataFrame.duplicated() returns an empty result for an empty frame,
        # which would not line up with the columns.
        duplicate_mask = np.zeros(df.shape[1], dtype=bool)
    else:
        duplicate_mask = df.T.duplicated().to_numpy()
    duplicate_columns = df.columns[duplicate_mask].tolist()
    _section_heading(pdf, "3. Columns with Duplicates")
    if duplicate_columns:
        _body_text(pdf, f"Before Removal: {_join_columns(duplicate_columns)}")
        df = df.loc[:, ~duplicate_mask]
        _body_text(pdf, f"After Removal: {_join_columns(df.columns)}")
    else:
        _body_text(pdf, "No duplicate columns found.")
    pdf.ln(10)

    # Section 4: Constant Columns
    constant_columns = [col for col in df.columns if df[col].nunique() == 1]
    _section_heading(pdf, "4. Constant Columns")
    if constant_columns:
        _body_text(pdf, f"Before Removal: {_join_columns(constant_columns)}")
        # Non-inplace drop: ``df`` may be a slice of the original frame.
        df = df.drop(columns=constant_columns)
        _body_text(pdf, f"After Removal: {_join_columns(df.columns)}")
    else:
        _body_text(pdf, "No constant columns found.")
    pdf.ln(10)

    # Columns may have been removed above, so refresh the numeric list.
    numerical_columns = df.select_dtypes(include="number").columns.tolist()
    pdf.add_page()

    # Chart images live in a temporary directory with index-based file names,
    # so column names containing path separators cannot break the file path and
    # no PNG files are left behind. The directory is kept until the PDF is saved.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_dir = Path(tmp_dir)

        # Section 5: Boxplots for Numerical Columns (one chart per page)
        _section_heading(pdf, "5. Boxplots for Numerical Columns")
        for index, col in enumerate(numerical_columns):
            if index:
                pdf.add_page()
            _add_chart(
                pdf,
                image_dir / f"boxplot_{index}.png",
                title=f"Boxplot of {col}",
                caption=f"Boxplot for {col}:",
                draw=lambda ax, col=col: sns.boxplot(x=df[col], ax=ax),
            )

        # Section 6: Charts for 6 Random Columns (one chart per page)
        if numerical_columns:
            # The last boxplot fills the current page; start the section on a new one.
            pdf.add_page()
        _section_heading(pdf, "6. Charts for 6 Random Columns")
        n_charts = min(6, df.shape[1])
        if n_charts:
            sample_columns = df.sample(
                n=n_charts, axis=1, random_state=random_state
            ).columns
        else:
            sample_columns = []
        for index, col in enumerate(sample_columns):
            if index:
                pdf.add_page()
            if col in numerical_columns:
                draw = lambda ax, col=col: sns.histplot(data=df, x=col, kde=True, ax=ax)
            else:
                draw = lambda ax, col=col: sns.countplot(data=df, x=col, ax=ax)
            _add_chart(
                pdf,
                image_dir / f"distribution_{index}.png",
                title=f"Distribution of {col}",
                caption=f"Distribution of {col}:",
                draw=draw,
            )

        # Save PDF
        pdf.output(output_file)

    print(f"Report generated: {output_file}")


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
generate_report = gennerate_report



# Provide the input data path

In [39]:
# @title add path to input data file and name of report

data_path = 'DS_Python_Assignment.csv' #@param {type:"string"}
output_file = 'data_analysis.pdf' #@param {type:"string"}
# Read the CSV at `data_path`, analyse it, and write the PDF report to `output_file`.
gennerate_report(data_path, output_file)

Report generated: data_analysis.pdf
