In [1]:
import fitz  # PyMuPDF
import pandas as pd
import re
import matplotlib.pyplot as plt
import os


In [3]:

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        str: The text of all pages joined in page order; an empty string
        when the document has no pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even if extracting a page raises part-way through.
    with fitz.open(pdf_path) as document:
        # Join once instead of growing a string page by page.
        return "".join(
            document.load_page(page_num).get_text()
            for page_num in range(len(document))
        )

# Function to parse marks from the extracted text
def parse_marks(text):
    """Extract (serial number, marks) integer pairs from raw text.

    Every occurrence of an unsigned integer followed by whitespace and a
    second, optionally negative, integer is treated as one record. The
    whitespace may include newlines, so a pair can span two lines.

    Args:
        text: The text to scan.

    Returns:
        list[tuple[int, int]]: ``(srlno, marks)`` pairs in the order they
        appear in ``text``; empty when nothing matches.
    """
    pair_re = re.compile(r"(\d+)\s+(-?\d+)")
    return [
        (int(found.group(1)), int(found.group(2)))
        for found in pair_re.finditer(text)
    ]

# Function to create a dataframe from marks data and city
def create_dataframe(marks_data, city):
    """Build a per-city marks table.

    Args:
        marks_data: Iterable of ``(srlno, marks)`` pairs.
        city: Value written to the ``City`` column of every row.

    Returns:
        pandas.DataFrame: Columns ``Srlno``, ``Marks`` and ``City``.
    """
    return pd.DataFrame(marks_data, columns=["Srlno", "Marks"]).assign(City=city)

# Function to process all PDFs in a folder
def process_pdfs(folder_path):
    """Parse every ``.pdf`` file in a folder into one combined marks table.

    The file name without its extension is used as the city name for all
    rows extracted from that file.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        pandas.DataFrame: Columns ``Srlno``, ``Marks`` and ``City`` with a
        fresh 0..n-1 index. When the folder holds no ``.pdf`` files an empty
        frame with those three columns is returned.
    """
    all_data = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".pdf"):
            continue
        city = os.path.splitext(filename)[0]  # Use filename as city name
        pdf_path = os.path.join(folder_path, filename)
        marks_data = parse_marks(extract_text_from_pdf(pdf_path))
        all_data.append(create_dataframe(marks_data, city))

    if not all_data:
        # pd.concat raises ValueError on an empty list, so hand back an
        # empty table with the expected schema instead.
        return pd.DataFrame(columns=["Srlno", "Marks", "City"])
    return pd.concat(all_data, ignore_index=True)

def plot_histograms(high_mean_csv, low_std_csv, output_folder='figures'):
    """Save histograms of the per-city summary statistics as PNG files.

    Two figures are written into ``output_folder``: a histogram of the
    ``mean`` column of ``high_mean_csv`` and a histogram of the ``std``
    column of ``low_std_csv``.

    Args:
        high_mean_csv: CSV path; must contain a ``mean`` column.
        low_std_csv: CSV path; must contain a ``std`` column.
        output_folder: Directory for the PNG files, created if missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Both files are read before any figure is drawn.
    high_mean_data = pd.read_csv(high_mean_csv)
    low_std_data = pd.read_csv(low_std_csv)

    # (table, column, bar colour, title, x-axis label, output file name)
    plot_specs = (
        (
            high_mean_data, 'mean', 'skyblue',
            'Distribution of Mean Marks for Cities with High Mean',
            'Mean Marks',
            "high_mean_marks_distribution.png",
        ),
        (
            low_std_data, 'std', 'salmon',
            'Distribution of Standard Deviation for Cities with Low Std Dev',
            'Standard Deviation',
            "low_std_dev_distribution.png",
        ),
    )

    for table, column, bar_color, title, x_label, file_name in plot_specs:
        plt.figure(figsize=(12, 6))
        # The column is looked up only now, one figure at a time.
        plt.hist(table[column], bins=20, color=bar_color, alpha=0.7, edgecolor='black')
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel('Number of Cities')
        plt.grid(True)
        plt.savefig(os.path.join(output_folder, file_name))
        plt.close()

    print(f"Plots saved to the '{output_folder}' folder.")


In [3]:

# Folder that holds the PDFs to analyse
pdf_folder = "pdfs"

# Build the combined marks table from every PDF and persist it
df = process_pdfs(pdf_folder)
df.to_csv("city_marks_data.csv", index=False)
print("Data saved to city_marks_data.csv")

# Per-city summary statistics of the marks (output of describe()), persisted as well
city_group = df.groupby('City')['Marks'].describe()
city_group.to_csv("city_marks_analysis.csv")
print("City-wise marks analysis saved to city_marks_analysis.csv")

# Example of further analysis: detecting potential malpractice.
# Cities whose mean is above the 0.95 quantile of all city means
mean_cutoff = city_group['mean'].quantile(0.95)
high_mean_cities = city_group.loc[city_group['mean'] > mean_cutoff]
high_mean_cities.to_csv("high_mean_cities.csv")
print("Cities with unusually high mean marks saved to high_mean_cities.csv")

# Cities whose standard deviation is below the 0.05 quantile of all city std devs
std_cutoff = city_group['std'].quantile(0.05)
low_std_cities = city_group.loc[city_group['std'] < std_cutoff]
low_std_cities.to_csv("low_std_cities.csv")
print("Cities with unusually low standard deviation of marks saved to low_std_cities.csv")


Data saved to city_marks_data.csv
City-wise marks analysis saved to city_marks_analysis.csv
Cities with unusually high mean marks saved to high_mean_cities.csv
Cities with unusually low standard deviation of marks saved to low_std_cities.csv


In [None]:
# Function to read marks data for a specific city from the corresponding PDF
def read_marks_for_city(pdf_folder, city_name):
    """Parse the marks records from ``<pdf_folder>/<city_name>.pdf``.

    Args:
        pdf_folder: Directory expected to contain the city's PDF.
        city_name: File name stem of the PDF.

    Returns:
        list: Whatever ``parse_marks`` returns for the PDF's text, or an
        empty list (after printing a notice) when the file does not exist.
    """
    pdf_path = os.path.join(pdf_folder, f"{city_name}.pdf")

    # Guard clause: report the missing file and bail out early.
    if not os.path.exists(pdf_path):
        print(f"PDF for city {city_name} not found.")
        return []

    return parse_marks(extract_text_from_pdf(pdf_path))

# Function to plot histograms with layers
def _plot_city_layers(cities, pdf_folder, label_suffix, title, output_path):
    """Overlay one marks histogram per city on a single figure and save it."""
    plt.figure(figsize=(12, 6))

    plotted_any = False
    for city in cities:
        records = read_marks_for_city(pdf_folder, city)
        # read_marks_for_city yields (Srlno, Marks) pairs; only the marks
        # belong in the histogram, so the serial numbers are dropped here.
        marks = [mark for _, mark in records]
        if marks:
            plt.hist(marks, bins=20, alpha=0.5, label=f"{city} ({label_suffix})")
            plotted_any = True

    plt.title(title)
    plt.xlabel('Marks')
    plt.ylabel('Frequency')
    if plotted_any:
        # Only draw a legend when at least one labelled histogram exists.
        plt.legend()
    plt.grid(True)
    plt.savefig(output_path)
    plt.close()


# Function to plot histograms with layers
def plot_histograms_with_layers(high_mean_csv, low_std_csv, pdf_folder, output_folder='figures'):
    """Save layered per-city marks histograms for the flagged cities.

    For each city listed in the ``City`` column of the two CSV files, the
    city's PDF in ``pdf_folder`` is re-read and its marks are drawn as a
    semi-transparent histogram. Cities whose PDF is missing or yields no
    marks are skipped. Two PNG files are written to ``output_folder``:
    ``high_mean_marks_distribution_layers.png`` and
    ``low_std_dev_distribution_layers.png``.

    Args:
        high_mean_csv: CSV path with a ``City`` column (high-mean cities).
        low_std_csv: CSV path with a ``City`` column (low-std-dev cities).
        pdf_folder: Directory containing ``<City>.pdf`` files.
        output_folder: Directory for the PNG files, created if missing.
    """
    # Create the output folder if it does not exist
    os.makedirs(output_folder, exist_ok=True)

    # Load the city names from CSV files
    high_mean_data = pd.read_csv(high_mean_csv)
    low_std_data = pd.read_csv(low_std_csv)

    _plot_city_layers(
        high_mean_data['City'],
        pdf_folder,
        "High Mean",
        'Marks Distribution for Cities with High Mean Marks',
        os.path.join(output_folder, "high_mean_marks_distribution_layers.png"),
    )
    _plot_city_layers(
        low_std_data['City'],
        pdf_folder,
        "Low Std Dev",
        'Marks Distribution for Cities with Low Std Dev',
        os.path.join(output_folder, "low_std_dev_distribution_layers.png"),
    )

    print(f"Layered histograms saved to the '{output_folder}' folder.")

# Example usage: re-read the city PDFs from the same folder the marks were
# extracted from (pdf_folder is set in the analysis cell above); a placeholder
# path would make every city lookup fail and leave the figures empty.
plot_histograms_with_layers('high_mean_cities.csv', 'low_std_cities.csv', pdf_folder)


In [5]:
# Example usage: histogram the summary statistics; the PNG files are written to the 'csv' folder
plot_histograms(
    high_mean_csv='high_mean_cities.csv',
    low_std_csv='low_std_cities.csv',
    output_folder='csv',
)

Plots saved to the 'csv' folder.
