In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import shutil
import logging

In [None]:
import cv2
import dlib
from skimage import io

## Paths

In [None]:
# Folder holding the photographs to analyse. The location can be overridden
# without editing the notebook by setting the PHOTO_DIRECTORY environment
# variable; the original local path is kept as the default.
directory = os.environ.get(
    "PHOTO_DIRECTORY",
    "/Users/kyriakospapadopoulos/Desktop/University/Big Blue Data Academy/Personal/Projects/API_Projects/Reddit/Photograph_Analysis/photographs_top_1000",
)
# CSV that the OpenCV pass creates and the Dlib pass later reads and updates.
df_path = os.path.join(directory, "face_detection_results.csv")

# Face identification with OpenCV’s Haar Cascade

## Summary of OpenCV’s Haar Cascade
---
**OpenCV’s Haar Cascade** is a popular object detection algorithm that is primarily used for face detection in images and video streams. It is based on the concept of Haar-like features and uses a cascade of classifiers to detect objects in a given image.

### Key Concepts:

- **Haar-like Features**:
  - **Description**:
    - Haar-like features are digital image features used in object recognition. These features are named after the Haar wavelet and are essentially rectangular patterns that represent the contrast between adjacent regions of an image.
    - For example, a Haar-like feature could capture the difference in brightness between the eyes and the upper cheeks in a face, or between the bridge of the nose and the eyes.
  - **Calculation**:
    - The value of a Haar-like feature is calculated by subtracting the sum of pixel values in one rectangular region of the image from the sum of pixel values in another adjacent rectangular region. These rectangular regions can vary in size and position to capture different types of features.
    - The algorithm evaluates a large number of these features at different scales and locations in the image to detect objects.
  
- **Integral Image**:
  - **Description**:
    - The integral image is a data structure that allows for the rapid calculation of pixel sums over rectangular regions. It is a key component of the Haar Cascade algorithm because it speeds up the computation of Haar-like features.
  - **Calculation**:
    - Each entry of the integral image stores the sum of all pixel values above and to the left of (and including) that pixel. As a result, the sum of pixel values within any rectangular area of the original image can be obtained from just four lookups, i.e. in constant time, regardless of the size of the rectangle.
    - This efficiency is crucial for real-time object detection, as it allows the algorithm to quickly compute the sum of pixel values over multiple regions at different scales.

- **Cascade of Classifiers**:
  - **Description**:
    - The Haar Cascade algorithm uses a series of classifiers that are arranged in stages, known as a cascade. Each stage contains a number of weak classifiers, which are simple decision rules based on Haar-like features.
    - A weak classifier is considered "weak" because it alone is not very accurate, but when combined with other weak classifiers in a stage, it contributes to a strong decision-making process.
  - **Working**:
    - During detection, the algorithm scans the image and evaluates the Haar-like features in a given region. If a region passes through all the stages of the cascade, it is classified as containing the object (e.g., a face). If it fails at any stage, the region is immediately discarded.
    - This cascading structure allows the algorithm to quickly eliminate regions of the image that do not contain the object of interest, significantly speeding up the detection process.
    - The first stages of the cascade are designed to reject the majority of non-object regions quickly, while later stages become more complex and focus on refining the detection.

### What Haar Cascade Does:

- **Face Detection**:
  - Haar Cascade is most commonly used for detecting faces in images. It is pre-trained on large datasets of face images and can accurately detect faces across different scales and orientations.

- **Other Object Detection**:
  - While face detection is its primary use, Haar Cascade can also be trained to detect other objects, such as eyes, license plates, and even specific animals. However, the accuracy and performance for these tasks may vary depending on the complexity of the object.

- **Real-Time Detection**:
  - Due to its efficiency, Haar Cascade can be used for real-time object detection in applications such as video surveillance, facial recognition systems, and interactive user interfaces.

### Limitations:

- **Sensitivity to Lighting Conditions**:
  - Haar Cascade can be sensitive to changes in lighting and may struggle with detecting objects in low-light or overly bright environments.

- **Accuracy**:
  - While fast, Haar Cascade may not be as accurate as more modern object detection algorithms, especially when it comes to detecting smaller objects or objects with complex backgrounds.

- **False Positives**:
  - The algorithm may sometimes detect objects that are not present (false positives), especially in cluttered or complex images.
---

In [None]:
# Function to detect faces using OpenCV's Haar Cascades
def detect_face_opencv(image_path):
    """Report whether OpenCV's frontal-face Haar Cascade finds a face in an image.

    Parameters
    ----------
    image_path : str
        Path of the image file to inspect.

    Returns
    -------
    bool
        True if at least one face is detected. False if no face is detected
        or if the file could not be decoded as an image.
    """
    # Load the pre-trained Haar Cascade classifier for face detection
    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

    # Read the image. cv2.imread does not raise on a missing or corrupt file;
    # it returns None, which would make cvtColor fail and abort a whole batch.
    image = cv2.imread(image_path)
    if image is None:
        logging.warning("Could not read image, treating it as 'no face detected': %s", image_path)
        return False

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert the image to grayscale

    # Adjust the scaleFactor and minNeighbors for reduced sensitivity (fewer false positives)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=8)

    # Return True if at least one face is detected, otherwise False
    return len(faces) > 0

### 'detect_face_opencv' function breakdown
---
- **`face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')`**:
  - This line loads the pre-trained Haar Cascade classifier for face detection from OpenCV. The `'haarcascade_frontalface_default.xml'` file contains the trained data for detecting frontal faces.

- **`image = cv2.imread(image_path)`**:
  - The image is read from the specified `image_path` using OpenCV's `cv2.imread` function. The image is loaded as a color image.

- **`gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)`**:
  - The loaded color image is converted to grayscale using `cv2.cvtColor`. Face detection with Haar Cascades is typically performed on grayscale images because it simplifies the computation and improves performance.

- **`faces = face_cascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=8)`**:
  - The `detectMultiScale` method is applied to the grayscale image to detect faces. The `scaleFactor` parameter controls how much the image size is reduced at each image scale, and `minNeighbors` specifies how many neighbors each candidate rectangle should have to retain it. Higher values of `scaleFactor` and `minNeighbors` reduce sensitivity, which typically leads to fewer false positives at the cost of missing some faces.

- **`return len(faces) > 0`**:
  - The function returns `True` if at least one face is detected in the image (i.e., `len(faces)` is greater than 0). If no faces are detected, it returns `False`.
---

In [None]:
# Function to analyze images and create a new DataFrame with face detection results
def analyze_faces_with_opencv(directory, save_interval=100):
    """Run Haar Cascade face detection on every image in a folder and save a CSV.

    The results are written to ``face_detection_results.csv`` inside
    ``directory`` with two columns: ``'File Name'`` and
    ``'OpenCV’s Haar Cascade results'`` (True/False).

    Parameters
    ----------
    directory : str
        Folder containing the images; also where the CSV is written.
    save_interval : int, optional
        A checkpoint of the CSV is written each time the position in the
        directory listing is a non-zero multiple of this value.

    Returns
    -------
    None
    """
    columns = ['File Name', 'OpenCV’s Haar Cascade results']
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')  # Adjust based on your file types
    output_path = os.path.join(directory, "face_detection_results.csv")

    # Rows are collected in a plain list and turned into a DataFrame only when
    # saving: DataFrame.append was removed in pandas 2.0 and copied the whole
    # frame on every call.
    records = []

    # os.listdir returns entries in arbitrary order; sorting keeps the CSV
    # row order reproducible between runs.
    filenames = sorted(os.listdir(directory))

    # Process each image in the directory
    for idx, filename in enumerate(tqdm(filenames, desc="Running OpenCV Face Detection")):
        # Compare extensions case-insensitively so files such as "IMG_1.JPG" are not skipped
        if filename.lower().endswith(image_extensions):
            image_path = os.path.join(directory, filename)

            if os.path.exists(image_path):
                face_detected = detect_face_opencv(image_path)
                records.append({'File Name': filename, 'OpenCV’s Haar Cascade results': face_detected})

        if idx % save_interval == 0 and idx != 0:
            # Save progress periodically
            pd.DataFrame(records, columns=columns).to_csv(output_path, index=False)

    # Save the final results (header only when no image was found)
    pd.DataFrame(records, columns=columns).to_csv(output_path, index=False)
    print("OpenCV face detection complete. Results saved to face_detection_results.csv")

### 'analyze_faces_with_opencv' function breakdown
---
- **`df = pd.DataFrame(columns=['File Name', 'OpenCV’s Haar Cascade results'])`**:
  - This line initializes a new pandas DataFrame with two columns: `'File Name'` to store the name of each image file and `'OpenCV’s Haar Cascade results'` to store the face detection results (True or False).

- **Loop through each image in the directory**:
  - The function iterates over all files in the specified `directory` using `tqdm` to display a progress bar. The loop processes only files with image extensions such as `.jpg`, `.jpeg`, `.png`, and `.bmp`.

- **`image_path = os.path.join(directory, filename)`**:
  - For each image file, the full path is constructed by joining the directory path with the filename.

- **`if os.path.exists(image_path): face_detected = detect_face_opencv(image_path)`**:
  - If the image file exists at the specified path, the function calls `detect_face_opencv(image_path)` to determine if a face is detected in the image. The result (True or False) is returned.

- **`df = df.append({'File Name': filename, 'OpenCV’s Haar Cascade results': face_detected}, ignore_index=True)`**:
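  - The filename and face detection result are appended as a new row to the DataFrame. The `ignore_index=True` parameter makes pandas discard the index of the appended row and assign the next sequential integer label instead, so the result keeps a clean 0, 1, 2, … index. Note that `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so this call only works with older pandas versions.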

- **Periodically save progress**:
  - Every `save_interval` iterations, the current state of the DataFrame is saved to a CSV file (`face_detection_results.csv`) in the specified directory. This ensures that progress is saved periodically, reducing the risk of data loss in case of an interruption.

- **Final save**:
  - After all images have been processed, the final version of the updated DataFrame is saved to `face_detection_results.csv`. A message is printed to confirm that the OpenCV face detection analysis is complete and the results have been saved.
---

In [None]:
# Run the OpenCV pass over the photo folder; this (re)creates face_detection_results.csv there.
analyze_faces_with_opencv(directory=directory)

# Portrait identification with Dlib’s HOG + SVM model

## Summary of Dlib’s HOG + SVM Face Detector
---
**Dlib’s HOG + SVM Face Detector** is a widely-used method for detecting faces in images. It combines Histogram of Oriented Gradients (HOG) for feature extraction with a Support Vector Machine (SVM) classifier to identify faces. This method is particularly effective in detecting faces with various orientations and under different lighting conditions.

### Key Concepts:

- **Histogram of Oriented Gradients (HOG)**:
  - **Description**:
    - HOG is a feature descriptor used in computer vision to capture the structure and shape of objects within an image. It works by dividing the image into small regions (cells) and computing the gradient direction and magnitude within each cell.
  - **Gradient Calculation**:
    - The gradient at each pixel is computed by finding the difference in intensity between adjacent pixels. The orientation of the gradient is determined by the direction in which the intensity changes the most.
    - These gradients are then binned into orientation histograms, where each bin represents a specific range of angles. The histograms from all cells are concatenated to form a feature vector that describes the overall shape and structure of the object.
  - **Robustness**:
    - HOG is particularly robust to variations in lighting and small changes in the appearance of the object, making it effective for face detection across different environments.

- **Support Vector Machine (SVM)**:
  - **Description**:
    - SVM is a supervised learning algorithm used for classification tasks. It works by finding the hyperplane that best separates the data points of different classes in a high-dimensional space.
  - **Face Classification**:
    - In the context of face detection, the SVM classifier is trained on HOG features extracted from face and non-face images. The classifier learns to distinguish between these two classes based on the patterns in the HOG features.
    - During detection, the SVM classifier evaluates the HOG features of regions in the image and determines whether they correspond to a face or not.

### What Dlib’s HOG + SVM Face Detector Does:

- **Face Detection**:
  - Dlib’s HOG + SVM face detector is used to identify faces in images. It works well in detecting faces that are front-facing or slightly turned and can handle various lighting conditions.
  
- **Portrait Detection**:
  - This method can be extended to detect portraits by analyzing the proportion of the image occupied by a detected face. A face that takes up a significant portion of the image is likely to be a portrait, which can be useful in filtering and categorizing images based on their content.

- **Real-Time Detection**:
  - While not as fast as some deep learning-based methods, Dlib’s HOG + SVM detector is efficient enough for real-time applications on reasonably powerful hardware.

### Limitations:

- **Scale and Orientation Sensitivity**:
  - Although Dlib’s HOG + SVM face detector is robust, it may struggle with extreme variations in face orientation or scale. For instance, it may not perform as well on images where faces are at sharp angles or very small relative to the image size.

- **Speed**:
  - Compared to more modern deep learning methods, Dlib’s HOG + SVM can be slower, especially when processing high-resolution images or large datasets.

- **False Positives**:
  - Like many face detection algorithms, this method can sometimes produce false positives, detecting faces where none exist.
---

In [None]:
# Initialize the face detector from Dlib.
# It is created once here as a module-level object and reused by
# `detect_portrait_dlib`, which calls it as `detector(image, 1)`.
detector = dlib.get_frontal_face_detector()

In [None]:
# Function to detect portraits using Dlib's HOG + SVM face detector
def detect_portrait_dlib(image_path, face_min_proportion=0.05):
    """Decide whether an image is a portrait using Dlib's frontal face detector.

    An image counts as a portrait when at least one detected face bounding
    box covers at least ``face_min_proportion`` of the image area.

    Parameters
    ----------
    image_path : str
        Path of the image file to inspect.
    face_min_proportion : float, optional
        Minimum face-box area divided by image area (default 0.05, i.e. 5%).

    Returns
    -------
    bool
        True if some detected face is large enough, otherwise False
        (including when no face is detected or the image has zero area).
    """
    image = io.imread(image_path)

    # Files with an alpha channel load as (H, W, 4) or (H, W, 2) arrays, which
    # the Dlib detector rejects; keep only the colour / gray channels.
    if image.ndim == 3 and image.shape[2] == 4:
        image = image[:, :, :3]
    elif image.ndim == 3 and image.shape[2] == 2:
        image = image[:, :, 0]
    # Slicing yields a non-contiguous view; hand the detector a contiguous array.
    image = np.ascontiguousarray(image)

    height, width = image.shape[:2]
    image_area = height * width  # same for every face, so computed once
    if image_area == 0:
        return False  # nothing to analyse; also avoids dividing by zero below

    # Detect faces in the image (the 1 asks the detector to upsample the image once)
    faces = detector(image, 1)

    for face in faces:
        # Calculate the area of the detected face
        face_area = (face.right() - face.left()) * (face.bottom() - face.top())

        # Check if the face occupies a significant portion of the image (portrait)
        if face_area / image_area >= face_min_proportion:
            return True  # Considered a portrait

    return False  # No face or insignificant face size detected

### 'detect_portrait_dlib' function breakdown
---
- **`image_path`**: This parameter represents the file path of the image that will be analyzed.
- **`face_min_proportion`**: This optional parameter specifies the minimum proportion of the image's area that a detected face must occupy to be considered a portrait. The default value is `0.05`, meaning the face must occupy at least 5% of the total image area to be considered a portrait.
- **`io.imread(image_path)`**: This line loads the image from the specified file path using `io.imread`, typically from the `skimage` library. The image is loaded as an array of pixel values.
- **`image.shape[:2]`**: The shape of the image array is used to extract the height and width of the image. These dimensions are needed to calculate the total area of the image and to determine the relative size of any detected faces.
- **`detector(image, 1)`**: This line applies Dlib's face detector to the loaded image. The `1` parameter indicates that the image should be upsampled once before detection, which can help in detecting smaller faces. The `detector` function returns a list of detected faces, where each face is represented by a bounding box.
- **Loop**: The function iterates over each detected face in the `faces` list. For each face, the area is calculated to determine if it qualifies as a portrait.
    - **`face_area`**: The area of the detected face is calculated using the coordinates of the bounding box around the face. `face.right()` and `face.left()` give the x-coordinates of the right and left edges of the bounding box, while `face.bottom()` and `face.top()` give the y-coordinates of the bottom and top edges.
    - **`image_area`**: The total area of the image is calculated by multiplying its height by its width.
    - **Proportion Check**: The function checks whether the detected face occupies at least the specified proportion of the image area (`face_min_proportion`). If it does, the function returns `True`, indicating that the image contains a portrait.
- **Return False**: If no face meets the minimum size requirement, the function returns `False`, indicating that the image does not contain a portrait.
---

In [None]:
# Function to analyze images and update the DataFrame with portrait detection results
def analyze_portraits_with_dlib(directory, df_path, save_interval=100):
    """Add Dlib portrait-detection results to the existing results CSV.

    Reads the CSV at ``df_path``, runs ``detect_portrait_dlib`` on every file
    listed in its ``'File Name'`` column that exists inside ``directory``, and
    stores the outcome in the ``'Dlib’s HOG + SVM results'`` column. The CSV is
    rewritten periodically during the run and once more at the end.

    Parameters
    ----------
    directory : str
        Folder containing the image files.
    df_path : str
        Path of the CSV to read and overwrite.
    save_interval : int, optional
        A checkpoint is written whenever the row label is a non-zero multiple
        of this value.

    Raises
    ------
    FileNotFoundError
        If ``df_path`` does not exist.
    """
    result_column = 'Dlib’s HOG + SVM results'

    # Guard clause: nothing can be updated without the existing results file.
    if not os.path.exists(df_path):
        raise FileNotFoundError(f"The specified DataFrame file does not exist: {df_path}")
    results = pd.read_csv(df_path)

    # Start every row at False unless a previous run already created the column.
    if result_column not in results.columns:
        results[result_column] = False

    row_count = len(results)
    progress = tqdm(results.iterrows(), total=row_count, desc="Running Dlib Portrait Detection")

    for row_label, record in progress:
        candidate_path = os.path.join(directory, record['File Name'])

        # Rows whose file is missing keep whatever value they already had.
        if os.path.exists(candidate_path):
            results.at[row_label, result_column] = detect_portrait_dlib(candidate_path)

        # Periodic checkpoint so an interrupted run keeps most of its work.
        if row_label % save_interval == 0 and row_label != 0:
            results.to_csv(df_path, index=False)

    # Final write with every processed row included.
    results.to_csv(df_path, index=False)
    print(f"Dlib portrait detection complete. Results saved to {df_path}")

### 'analyze_portraits_with_dlib' function breakdown
---
- **`directory`**: This parameter specifies the directory containing the images to be analyzed.

- **`df_path`**: This is the file path to the existing DataFrame (CSV file) where the results will be saved.

- **`save_interval`**: This optional parameter defines how often the DataFrame should be saved during processing, with the default value set to 100 images.

- **Load the existing DataFrame**: 
  - The function checks if the specified DataFrame file exists at `df_path`. If it does, the DataFrame is loaded using `pd.read_csv(df_path)`. If the file is not found, a `FileNotFoundError` is raised.

- **Add a new column for portrait detection results**: 
  - The function checks if the DataFrame already contains a column named `'Dlib’s HOG + SVM results'`. If not, it adds this column and initializes it with `False` for all rows.

- **Process each image in the DataFrame**: 
  - The function iterates over each row in the DataFrame using `tqdm` to provide a progress bar. For each row, it constructs the full image path using the directory and the `'File Name'` column from the DataFrame.

- **Check if the image exists**: 
  - If the image file exists at the constructed path, the function calls `detect_portrait_dlib(image_path)` to determine if a portrait is detected in the image.

- **Update the DataFrame with the detection result**: 
  - The result (`True` or `False`) is stored in the `'Dlib’s HOG + SVM results'` column for the corresponding row in the DataFrame.

- **Save progress periodically**: 
  - The function saves the updated DataFrame to `df_path` after every `save_interval` images have been processed.

- **Final save**: 
  - Once all images have been processed, the function saves the final version of the updated DataFrame to `df_path` and prints a completion message.
---

In [None]:
# Run the Dlib pass; it reads the CSV produced by the OpenCV pass and updates it in place.
analyze_portraits_with_dlib(directory=directory, df_path=df_path)

### File gathering function to make model evaluation easier

In [None]:
def copy_files_to_new_folder(df, directory, destination_base_dir, folder_name=None):
    """Copy every file listed in ``df['File Name']`` into a sub-folder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'File Name'`` column naming the files to copy.
    directory : str
        Folder the files are copied from.
    destination_base_dir : str
        Folder in which the destination sub-folder is created.
    folder_name : str, optional
        Name of the destination sub-folder. When omitted, a non-empty
        string/path-like ``df.name`` attribute is used if present, otherwise
        ``"new_folder"``.

    Returns
    -------
    None
        Missing source files are reported with a printed message and skipped.
    """
    # Determine the folder name: use provided name, DataFrame's name attribute, or default to "new_folder"
    if folder_name:
        target_name = folder_name
    else:
        # On a DataFrame with a column called "name", `df.name` is that column
        # (a Series), whose truth value is ambiguous and raises ValueError.
        # Only a real string / path-like attribute is accepted as a folder name.
        attr_name = getattr(df, 'name', None)
        if isinstance(attr_name, (str, os.PathLike)) and attr_name:
            target_name = attr_name
        else:
            target_name = "new_folder"
    destination_folder = os.path.join(destination_base_dir, target_name)

    # Create the folder if it doesn't exist
    os.makedirs(destination_folder, exist_ok=True)

    # Iterate over each file name in the DataFrame
    for filename in df["File Name"]:
        source_path = os.path.join(directory, filename)
        destination_path = os.path.join(destination_folder, filename)

        # Copy the file to the new folder
        if os.path.exists(source_path):
            shutil.copy(source_path, destination_path)
        else:
            print(f"File not found: {source_path}")

    print(f"Files have been copied to {destination_folder}")

In [None]:
# copy_files_to_new_folder(df = dlib_true, directory=directory, destination_base_dir=directory, folder_name='dlib_true')