In [1]:
%pip install torch 
%pip install PIL
%pip install facenet-pytorch
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install tqdm

Defaulting to user installation because normal site-packages is not writeable
Collecting torch
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting filelock (from torch)
  Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.3-py3-none-any.whl.metadata (5.1 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata

In [3]:
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from PIL import Image

# Initialize MTCNN for face detection
mtcnn = MTCNN(keep_all=True)

# Load pre-trained FaceNet model
resnet = InceptionResnetV1(pretrained='casia-webface').eval()

# Load an image containing faces
img = Image.open('../../../../../cds-vis-data/newspapers/GDL/GDL-1798-02-05-a-p0001.jpg') #0 faces
img_2 = Image.open('../../../../../cds-vis-data/newspapers/GDL/GDL-1987-08-21-a-p0012.jpg') #1 face 
img_3 =  Image.open('../../../../../cds-vis-data/newspapers/GDL/GDL-1997-08-09-a-p0021.jpg') #4 faces


# Detect faces in the image
boxes, _ = mtcnn.detect(img)

  0%|          | 0.00/111M [00:00<?, ?B/s]

In [4]:
print(boxes)

None


In [5]:

#img_2 contains 1 face and returns 1 list with 1 of 4 numbers: [[594.2079467773438 1372.92529296875 678.6680908203125 1482.1539306640625]]
#img_3 contains 4 faces and return 1 list with 4 list 

#to get the number of faces access the first element in boxes
boxes.shape[0]

AttributeError: 'NoneType' object has no attribute 'shape'

In [4]:
#with tqdm


import os
from tqdm import tqdm 
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from PIL import Image
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Define folder path
folderpath = "../newspapers_2/"

# Initialize dictionaries to store results:
faces_rawcount_per_decade = {}  # Store raw counts of images with faces per decade for each newspaper
perc_pages_with_faces_per_decade = {}  # Store percentage of pages with faces per decade for each newspaper
pages_per_decade = {}

# Function to extract decade from filename:
def extract_decade(filename):
    parts = filename.split("-")
    year = int(parts[1])
    decade = (year // 10) * 10
    return decade

def get_num_faces(file_path):
    mtcnn = MTCNN(keep_all=True)    
    resnet = InceptionResnetV1(pretrained='casia-webface').eval()
    img = Image.open(file_path)
    boxes, _ = mtcnn.detect(img)
    if boxes is not None:
        num_faces = boxes.shape[0]  # Count the number of detected faces
    else:
        num_faces = 0  # No faces detected
    return num_faces

# Iterate through each subfolder (newspaper) in folderpath:
for newspaper_folder in sorted(os.listdir(folderpath)):
    # Construct the path to the newspaper we are looping through 
    full_path = os.path.join(folderpath, newspaper_folder)

    # Check if the path full_path exist as a directory
    if os.path.isdir(full_path):
        # Extract the name of the newspaper from the folder path
        newspaper_name = newspaper_folder

        # Initialize dictionaries for the current newspaper
        pages_per_decade[newspaper_name] = {}
        faces_rawcount_per_decade[newspaper_name] = {}
        perc_pages_with_faces_per_decade[newspaper_name] = {}

        # Initialize counter for pages with faces for the current decade
        pages_with_faces = 0 #this is for every newpaper

        # Get list of files in the current newspaper folder
        files = sorted([filename for filename in os.listdir(full_path) if os.path.isfile(os.path.join(full_path, filename)) and filename.endswith(".jpg")])

        # Use tqdm for progress visualization
        for filename in tqdm(files, desc=newspaper_name):
            # Construct the full file path
            file_path = os.path.join(full_path, filename)

            # Extract the decade from the filename
            decade = extract_decade(filename)

            # Increment the count of pages for the current decade
            pages_per_decade[newspaper_name][decade] = pages_per_decade[newspaper_name].get(decade, 0) + 1

            # Get number of faces
            num_faces = get_num_faces(file_path)

            # Increment the counter if at least one face is detected on this page
            if num_faces > 0:
                pages_with_faces += 1

            # Update the count of faces for the current decade
            faces_rawcount_per_decade[newspaper_name][decade] = faces_rawcount_per_decade[newspaper_name].get(decade, 0) + num_faces

            # Update the percentage for the current decade
            perc_pages_with_faces_per_decade[newspaper_name][decade] = (pages_with_faces / pages_per_decade[newspaper_name][decade]) * 100

    # Create DataFrame from perc_pages_with_faces_per_decade
    df = pd.DataFrame.from_dict(perc_pages_with_faces_per_decade[newspaper_name], orient='index', columns=['Percentage'])

    # Create DataFrame from faces_rawcount_per_decade
    df_2 = pd.DataFrame.from_dict(faces_rawcount_per_decade[newspaper_name], orient='index', columns=['count'])

    # Merge the two dataframes by index
    done = pd.merge(df, df_2, left_index=True, right_index=True)
    done['newspaper'] = newspaper_name
    
    # Save the dataframe to CSV
    done.to_csv(f'../out/{newspaper_name}_data.csv')


#merging all dfs
csv_files = [f for f in os.listdir("../out/") if f.endswith('.csv')]
print(csv_files)
dfs = []
for csv in csv_files:
    df = pd.read_csv(os.path.join("../out", csv))
    dfs.append(df)
final_df = pd.concat(dfs, ignore_index=True)
final_df.rename(columns={'Unnamed: 0': 'decade'}, inplace=True)
final_df


#plotting all dfs togerher
plt.figure(figsize = (15, 10))
sns.relplot(data = final_df, kind = "line", x = "decade", y = "Percentage", hue = "newspaper")
plt.xticks(rotation = 45, fontsize = 8)
plt.title('Percentage of pages with faces per pecade', fontsize = 12)
plt.show
plt.savefig("../out/plot_of_percentages.png")



GDL:   0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0.00/111M [00:00<?, ?B/s]

GDL: 100%|██████████| 8/8 [02:18<00:00, 17.36s/it]
IMP: 100%|██████████| 8/8 [02:24<00:00, 18.10s/it]
JDG: 100%|██████████| 8/8 [02:45<00:00, 20.72s/it]


to do:
- 1 problematic file (import to test folder)
- Rewrite to py
- Make run, setup, requirenments 
- Test on small dataset: if running
- Delete newsfolder_2, bigger machine, run on full 
