# Data Exploration Notebook

## Introduction

This notebook provides comprehensive exploratory data analysis (EDA) for deepfake detection datasets. We'll analyze:

1. Dataset structure and organization
2. Class distribution (real vs fake)
3. Image statistics (resolution, aspect ratio, file formats)
4. Face detection success rates
5. Dataset quality metrics

## Dataset Description

The dataset contains images labeled as either "real" or "fake" (deepfake). We'll explore various aspects to understand the data distribution and quality.


In [None]:
import sys
from pathlib import Path

# Resolve the project root (dfprojectv2) exactly once.
# A notebook kernel normally defines no `__file__`, so the usual case is the
# fallback: the parent of the current working directory (this assumes the
# notebook is run from a direct sub-folder of the project). When the code is
# executed as a script, `__file__` exists and the root is taken as two levels
# above that file instead.
if "__file__" in globals():
    project_root = Path(__file__).resolve().parents[1]
else:
    project_root = Path.cwd().parent

# Put the root first on sys.path; the membership test keeps re-runs of this
# cell from inserting the same entry again.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print("Project root added:", project_root)


Project root added: c:\Users\HP\Documents\QUEST\dfprojectv2


In [None]:
# Import required libraries
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from tqdm import tqdm
import json

# Add project root to path
project_root = Path().resolve().parent.parent
sys.path.append(str(project_root))

# Import EDA modules
from eda.data_analyzer import DataAnalyzer
from eda.visualization import EDAVisualizer

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("Set2")
%matplotlib inline

print("Libraries imported successfully!")


Libraries imported successfully!


In [None]:
# Sanity check: import the three EDA classes straight from the `eda` package root.
from eda import (
    DataAnalyzer,
    DataQualityChecker,
    EDAVisualizer,
)

print("EDA package imports working correctly")


EDA package imports working correctly


## 1. Dataset Configuration

Configure the dataset path and name, then initialize the analyzer and the visualizer.


In [None]:
# Configure dataset path.
# The location is machine-specific, so it can be overridden without editing
# the notebook by setting the DEEPFAKE_DATASET_PATH environment variable;
# the literal below is only the fallback default.
DATASET_PATH = os.environ.get(
    "DEEPFAKE_DATASET_PATH",
    r"C:\Users\HP\Documents\QUEST\deepfake project\dataset",  # Update this to your dataset path
)
DATASET_NAME = "faceforensics"  # faceforensics, celebdf, dfdc

# Report the path first, so a wrong location is visible even if building the
# analyzer below fails.
print(f"Dataset path: {DATASET_PATH}")
print(f"Dataset exists: {os.path.exists(DATASET_PATH)}")

# Initialize analyzer and visualizer
analyzer = DataAnalyzer(DATASET_PATH)
visualizer = EDAVisualizer()


Dataset path: C:\Users\HP\Documents\QUEST\deepfake project\dataset
Dataset exists: True


In [None]:
# DataAnalyzer has no `get_class_distribution` (it raises AttributeError).
# The per-class counts are the "counts" entry of the dict returned by
# `analyze_class_distribution()`, which is what the plotting call expects.
class_counts = analyzer.analyze_class_distribution()['counts']


AttributeError: 'DataAnalyzer' object has no attribute 'get_class_distribution'

In [None]:
# Interactive plot of the per-class image counts held in `class_counts`.
visualizer.plot_class_distribution(
    class_counts,
    interactive=True,
)

NameError: name 'class_counts' is not defined

## 2. Dataset Structure Analysis

Analyze the directory structure and organization of the dataset.


In [None]:
# Collect the structural summary of the dataset
structure = analyzer.analyze_dataset_structure()

print("=== Dataset Structure ===")

# Each value is looked up right before it is printed, one line per field.
total_images = structure['total_images']
print(f"Total Images: {total_images}")

class_names = structure['classes']
print(f"Classes: {class_names}")

format_counts = dict(structure['file_formats'])
print(f"File Formats: {format_counts}")

corrupted_count = len(structure['corrupted_files'])
print(f"Corrupted Files: {corrupted_count}")

# Leave the full result as the cell output
structure


## 3. Class Distribution Analysis

Analyze the distribution of real vs fake images.


In [None]:
# Analyze class distribution
class_dist = analyzer.analyze_class_distribution()

print("=== Class Distribution ===")
print(f"Total Images: {class_dist['total']}")
print(f"Real: {class_dist['counts']['real']} ({class_dist['percentages']['real']:.2f}%)")
print(f"Fake: {class_dist['counts']['fake']} ({class_dist['percentages']['fake']:.2f}%)")
print(f"Imbalance Ratio: {class_dist['imbalance_ratio']:.2f}")

# Make sure the output folder exists before a figure is saved into it
# (the statistics cell does the same for its own folder).
os.makedirs("../reports/visualizations", exist_ok=True)

# Visualize class distribution
fig = visualizer.plot_class_distribution(
    class_dist['counts'],
    save_path="../reports/visualizations/class_distribution.png",
    interactive=True
)
fig.show()


## 4. Image Statistics

Analyze image resolutions, aspect ratios, and other image properties.


In [None]:
# Analyze image statistics (sample 1000 images for speed)
img_stats = analyzer.analyze_image_statistics(sample_size=1000)

resolution_stats = img_stats['resolutions']

print("=== Image Statistics ===")
# 'mean' is a single number (it is formatted with ':.0f'), so it is printed
# once. The earlier "mean x mean" form repeated the same value as if it were
# a width and a height.
# NOTE(review): what this scalar measures (pixel count, side length, ...) is
# not visible from the notebook - confirm against DataAnalyzer.
print(f"Mean Resolution: {resolution_stats['mean']:.0f}")
print(f"Resolution Std: {resolution_stats['std']:.2f}")
print(f"Min Resolution: {resolution_stats['min']}")
print(f"Max Resolution: {resolution_stats['max']}")
print(f"Mean Aspect Ratio: {img_stats['aspect_ratios']['mean']:.2f}")

# Visualize resolution distribution (skipped when there is nothing to plot)
if resolution_stats['distribution']:
    # Make sure the output folder exists before the figure is saved into it.
    os.makedirs("../reports/visualizations", exist_ok=True)
    fig = visualizer.plot_resolution_distribution(
        resolution_stats['distribution'],
        save_path="../reports/visualizations/resolution_distribution.png",
        interactive=True
    )
    fig.show()


## 5. Dataset Quality Assessment

Calculate overall dataset quality score and get recommendations.


In [None]:
# Compute the analyzer's overall quality score for the dataset
quality = analyzer.calculate_dataset_quality_score()

print("=== Dataset Quality Score ===")
print(f"Overall Score: {quality['overall_score']:.2f}/1.0")

# One indented line per contributing factor
print("\nQuality Factors:")
for factor_name, factor_score in quality['factors'].items():
    print(f"  {factor_name}: {factor_score:.2f}")

# Numbered list of recommendations, starting at 1
print("\n=== Recommendations ===")
for position, recommendation in enumerate(quality['recommendations'], start=1):
    print(f"{position}. {recommendation}")

# Leave the full result as the cell output
quality


## 6. Save Statistics

Save all statistics for later use.


In [None]:
# Write the collected statistics to a .json file under the reports folder
stats_dir = "../reports/statistics"
os.makedirs(stats_dir, exist_ok=True)  # create the folder on first run

stats_file = f"{stats_dir}/dataset_stats_{DATASET_NAME}.json"
analyzer.save_statistics(stats_file)

print("Statistics saved successfully!")


## 7. Automated EDA Report Generation

Generate a comprehensive EDA report using the automated report generator.


In [None]:
# Build the automated EDA report for the configured dataset
from eda.report_generator import generate_eda_report

# Options handed to the generator, gathered in one place
report_options = {
    "dataset_path": DATASET_PATH,
    "output_format": "html",
    "output_dir": "../reports",
    "dataset_name": DATASET_NAME,
}
report_path = generate_eda_report(**report_options)

print(f"EDA report generated at: {report_path}")


## 8. Data Quality Assessment

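Assess dataset quality a second time, now with `DataQualityChecker`, which is given the dataset path directly instead of going through the `analyzer` object used in Section 5.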


In [None]:
# Run DataQualityChecker against the dataset path and summarize its findings
from eda.data_analyzer import DataQualityChecker

checker = DataQualityChecker()
quality_assessment = checker.assess_dataset(DATASET_PATH)

# Headline figures
print("=== Dataset Quality Assessment ===")
print(f"Overall Quality Score: {quality_assessment['overall_score']:.2f}/1.0")
print(f"Quality Level: {quality_assessment['quality_level']}")
print(f"Dataset Size: {quality_assessment['dataset_size']} images")
print(f"Class Balance Ratio: {quality_assessment['class_balance']:.2f}")
print(f"Corrupted Files: {quality_assessment['corrupted_files']}")

# One line per quality factor
print("\n=== Quality Factors ===")
for factor_name, factor_score in quality_assessment['quality_factors'].items():
    print(f"{factor_name}: {factor_score:.2f}")

# Numbered list of recommendations, starting at 1
print("\n=== Recommendations ===")
for position, recommendation in enumerate(quality_assessment['recommendations'], start=1):
    print(f"{position}. {recommendation}")

# Leave the full result as the cell output
quality_assessment
