In [42]:
import os
from PIL import Image
import pandas as pd
import numpy as np
import math

PATH_TO_DATA = r'/Users/hadjermohabeddine/projet_modelisation/corpus_lipade'

image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp']
# Association between directories and their files
files_dict = {path: files for path,_,files in os.walk(PATH_TO_DATA, topdown=False) if os.path.isdir(path) and files}
print('number of directories that contains files',len(files_dict.keys()))
# Number of images in each file
image_counts = {path: sum(1 for file in files if os.path.splitext(file.lower())[1] in image_extensions) 
                for path, files in files_dict.items()}
# Filtering the files and keeping the files that contains images
filtered_image_counts = {path: count for path, count in image_counts.items() if count > 0}
print('number of directories that contains images',len(filtered_image_counts.keys()))
# Creating a dataframe that contains the path of each file and the number of images in it
df = pd.DataFrame(list(filtered_image_counts.items()), columns=['path', 'number_of_images'])
# Adding a column that contains the list of images in each file
images_files = {path: [file for file in files if os.path.splitext(file.lower())[1] in image_extensions] 
                for path, files in files_dict.items() if path in filtered_image_counts}
df['images_files'] = df['path'].map(images_files)
display(df)
print('number of images',df['number_of_images'].sum())
# Printing the number of images that doesn't contain the word 'pages' in their path
print('number of images that are not journal pages',df[~df['path'].str.contains('pages')]['number_of_images'].sum())

number of directories that contains files 363
number of directories that contains images 215


Unnamed: 0,path,number_of_images,images_files
0,/Users/hadjermohabeddine/projet_modelisation/c...,19,[1_Chérau_28 nov 1911_Henni_atrocités turco-ar...
1,/Users/hadjermohabeddine/projet_modelisation/c...,335,"[btv1b53231709q_1.jpg, btv1b531126179_1.jpg, b..."
2,/Users/hadjermohabeddine/projet_modelisation/c...,228,"[pho_irak2K247143_75_02.jpg, pho_irak2K247143_..."
3,/Users/hadjermohabeddine/projet_modelisation/c...,108,"[pho_2K247178_48_01.jpg, pho_2K247178_35_01.jp..."
4,/Users/hadjermohabeddine/projet_modelisation/c...,162,"[pho_2K247178_48_01.jpg, pho_2K247178_36.jpg, ..."
...,...,...,...
210,/Users/hadjermohabeddine/projet_modelisation/c...,12,"[np_surlevif_19160219_03.jpg, np_surlevif_1916..."
211,/Users/hadjermohabeddine/projet_modelisation/c...,33,"[np_surlevif_19160415_11_02.jpg, np_surlevif_1..."
212,/Users/hadjermohabeddine/projet_modelisation/c...,12,"[np_surlevif_19160415_11.jpg, np_surlevif_1916..."
213,/Users/hadjermohabeddine/projet_modelisation/c...,476,"[thedailytelegram_19111114_P7_01.jpg, bismarck..."


number of images 10629
number of images that are not journal pages 6187


In [43]:
# Reading the test set
df_testset = pd.read_excel(os.path.join(PATH_TO_DATA,'donnees_IS.xlsx'))
df_testset.dropna()
test_data = df_testset.stack().to_list()
print('number of test images',len(test_data))
# obtenir le nombre de classes dans le test set
df_list = df_testset.values.tolist()
print('number of classes in the test set',len(df_list))
# getting only the classes that have more than one image
df_list = [x for x in df_list if len(x)>1]
print('number of classes in the test set that have more than one image',len(df_list))
# getting rid of the nan values
cleaned_list = [[value for value in sublist if type(value)!=float] for sublist in df_list]

number of test images 292
number of classes in the test set 73
number of classes in the test set that have more than one image 73


In [44]:
import os
from PIL import Image
import pandas as pd
import numpy as np

# Association between directories and their files
df['test_images_found'] = np.zeros(len(df))
path_images_dataset = list()
for img in test_data:
    found_rows = df[df['images_files'].apply(lambda files: any(img in file for file in files))]
    
    if not found_rows.empty:
        for _, row in found_rows.iterrows():
            path_images_dataset.append(os.path.join(row['path'],img+'.jpg'))
            #print(f"Path: {os.path.join(row['path'],img+'.jpg')} - Image: {img}")
            df.loc[df['path'] == row['path'], 'test_images_found'] += 1
            
    else:
        print(f"{img} not found in the DataFrame.")
# Test images and their position in the files
display(df[df['test_images_found'] > 0])


thephere_19111104_P5_00_03 not found in the DataFrame.
thephere_19111104_P5_00_01 not found in the DataFrame.
Minnesotskénoviny_19111214_P7_01 not found in the DataFrame.
thephere_19111104_P5_00_05 not found in the DataFrame.
thephere_19111104_P5_00_02 not found in the DataFrame.
Minnesotskénoviny_19111207_P7_01 not found in the DataFrame.
Chérau__29 nov 1911_Tripoli not found in the DataFrame.
1_Chérau_28 nov 1911_Henni_atrocités turco-arabes (8) not found in the DataFrame.
Analogue_Mémoire diplo italien_CAD Courneuve_NS Turquie_vol435_008 not found in the DataFrame.
matin_19111208 not found in the DataFrame.
matin_19111208_02 not found in the DataFrame.
1_Chérau_28 nov 1911_Henni_atrocités turco-arabes (5) not found in the DataFrame.
4_Chérau_8 ou 9 dec 1911_Tripoli_pendaison notable (4) not found in the DataFrame.
Analogue_Coll Schill_carte postale_pendaison des 14_6 dec 1911 (2) not found in the DataFrame.
3_Chérau_6 dec 1911_Tripoli_pendaison des 14 (2) not found in the 

Unnamed: 0,path,number_of_images,images_files,test_images_found
1,/Users/hadjermohabeddine/projet_modelisation/c...,335,"[btv1b53231709q_1.jpg, btv1b531126179_1.jpg, b...",8.0
27,/Users/hadjermohabeddine/projet_modelisation/c...,67,"[pho_2K47160_25_01.jpg, pho_2K47160_27_04.jpg,...",17.0
28,/Users/hadjermohabeddine/projet_modelisation/c...,134,"[pho_2K47160_19ter.jpg, pho_2K47160_25_01.jpg,...",17.0
29,/Users/hadjermohabeddine/projet_modelisation/c...,146,"[pho_2K47161_15_02.jpg, pho_2K47161_68_02.jpg,...",6.0
213,/Users/hadjermohabeddine/projet_modelisation/c...,476,"[thedailytelegram_19111114_P7_01.jpg, bismarck...",245.0


In [54]:
import shutil

PATH_TO_TESTSET = os.path.join(PATH_TO_DATA,'test_dataset')

# Check if the directory exists
if not os.path.exists(PATH_TO_TESTSET):
    os.makedirs(PATH_TO_TESTSET)
    
for path in path_images_dataset:
    destination_path = os.path.join(PATH_TO_TESTSET, os.path.basename(path))
    if not os.path.exists(destination_path):
        shutil.copy(path, os.path.join(PATH_TO_TESTSET, os.path.basename(path)))
    
# keepin only the images that are in the test dataset in cleaned_list with the nasted structure
df_test_data = [[value+'.jpg' for value in sublist if value+'.jpg' in os.listdir(PATH_TO_TESTSET)] 
                for sublist in cleaned_list if sublist]
df_testset = pd.DataFrame(df_test_data)
df_testset.to_excel(os.path.join(PATH_TO_DATA,'donnees_IS_new.xlsx'), index=False, header=False)
display(df_testset)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,Aamulehti_19111119_P10_01.jpg,turunsanomat_19111112_P5_01.jpg,,,,,,,,,,,,,,
1,Aamulehti_19111203_P22_01.jpg,theadelaidechronicle_19120113_P30_00_03.jpg,,,,,,,,,,,,,,
2,bismarckdailytribune_19111114_P3_01.jpg,perthamboyeveningnews_19111109_P1_01.jpg,thechroniclenews_19111116_P1_01.jpg,thediamonddrill_19111125_P7_01.jpg,thepenascolajournal_19111119_P7_01.jpg,theseattlestar_19111102_P6_01.jpg,,,,,,,,,,
3,bismarckdailytribune_19111117_P1_01.jpg,DasinferessanteBlatt_19111109_P9_05.jpg,eveningbulletin_19111230_P16_01.jpg,eveningjournal_19111115_P9_01.jpg,illustracionartistica_191111113_P8_00_02.jpg,perthamboyeveningnews_19111114_P1_01.jpg,thecairobulletin_19111117_P1_01.jpg,thepenascolajournal_19111123_02.jpg,,,,,,,,
4,bismarckdailytribune_19111128_P3_01.jpg,perthamboyeveningnews_19111124_P12_01.jpg,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,illustration_19111209.jpg,,,,,,,,,,,,,,,
69,Atro turco-arabes_Bataille-syndicaliste_1914-0...,,,,,,,,,,,,,,,
70,,,,,,,,,,,,,,,,
71,DasinferessanteBlatt_19120112_P7_02.jpg,carte_postale.jpg,matin_19111226_04.jpg,anno_19120111.jpg,anno_19120106.jpg,,,,,,,,,,,


In [56]:
import os
import pandas as pd

list_file_legends = list()
dict_df_legends_images = dict()

for root, dirs, files in os.walk(PATH_TO_DATA, topdown=False):
    for file in files:
        file_path = os.path.join(root, file)

        # Vérifier si le fichier n'est pas directement dans le répertoire racine
        if os.path.dirname(file_path) != PATH_TO_DATA and file.endswith('.xlsx') and not file.startswith('~') and not '_' in file:
            list_file_legends.append(file_path)
            #print(file_path)
            df = pd.read_excel(file_path)
            
            # Data cleaning
            df = df.dropna()
            # List of possible column names for legends
            legend_column_names = ['legende', 'legendes', 'Legende','Legendes', 'Légende', 'légendes', 'légende', 'Legend', 'Legends', 'legends']
            num_photo_column_names = ['Num photo', 'Num Photo', 'num photo', 'num Photo']
            # Find the first existing legend column
            found_legend_column = next((col for col in legend_column_names if col in df.columns), None)
            found_num_photo_column = next((col for col in num_photo_column_names if col in df.columns), None)
            
            if found_legend_column:
                # Rename all legend-related columns to 'legend'
                df = df.rename(columns={col: 'legende' for col in legend_column_names if col in df.columns})
                
            if found_num_photo_column:
                # Rename all num photo related columns to 'num photo'
                df = df.rename(columns={col: 'num photo' for col in num_photo_column_names if col in df.columns})
                
            if 'Num page' in df.columns:
                # Rename all num page related columns to lowercase 'num page'
                df = df.rename(columns={'Num page': 'num page'})
                
            # Get the name of the subdirectory
            subdirectory = os.path.split(root)[-1]
            
            if 'guerreillustree' in file_path or 'surlevif' in file_path:
                for num_page,num_photo,legend in zip(df['num page'],df['num photo'], df['legende']):
                    
                    if num_page and num_page!='/' and num_photo and num_photo!='/' and legend and legend!='/':
                        # Convert the page and photo numbers to integers
                        num_page = int(num_page)
                        num_photo = int(num_photo)
                        # Add a leading zero if the number is less than 10
                        num_page = num_page if len(str(num_page)) == 2 else f'0{num_page}'
                        num_photo = num_photo if len(str(num_photo)) == 2 else f'0{num_photo}'
                        # Get the name of the parent directory
                        parent_directory = os.path.split(os.path.split(root)[0])[-1]
                        # Create the file name
                        file_name = f'photos/jpg/np_{parent_directory}_{subdirectory}_{num_page}_{num_photo}.jpg'
                        full_path = os.path.join(root, file_name)
                        # Check if the file exists and if it's not in the test set
                        if os.path.exists(full_path) and file_name not in test_data:
                            # Add the image path and its legend to the dictionary
                            dict_df_legends_images[full_path] = legend
                        else:
                            print(f'File does not exist: {full_path}')
  
            elif 'fonds_forbin' in file_path:
                for num_page,num_photo,legend in zip(df['num page'],df['num photo'], df['legende']):                    
                    if num_page and num_page!='/' and num_photo and num_photo!='/' and legend and legend!='/':
                        # Convert the page and photo numbers to integers
                        num_page = int(num_page)
                        num_photo = int(num_photo)
                        # Add a leading zero if the number is less than 10
                        num_page = num_page if len(str(num_page)) == 2 else f'0{num_page}'
                        num_photo = num_photo if len(str(num_photo)) == 2 else f'0{num_photo}'
                        # Get the name of the parent directory
                        parent_directory = os.path.split(os.path.split(root)[0])[-1]
                        # Create the file name
                        file_name = f'photos/jpg/pho_{subdirectory}_{num_page}_{num_photo}.jpg'
                        full_path = os.path.join(root, file_name)
                        # Check if the file exists and if it's not in the test set
                        if os.path.exists(full_path) and file_name not in test_data:
                            dict_df_legends_images[full_path] = legend
                        else:
                            print(f'File does not exist: {full_path}')
                                

print('number of images with labels',len(dict_df_legends_images.keys()))


File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_07_03.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_31_03.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_32_04.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_35_03.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_65_03.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_71_02.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K47161/photos/jpg/pho_2K47161_72_01.jpg
File does not exist: /Users/hadjermohabeddine/projet_modelisat

In [4]:
# Creating a dataframe that contains the path of each image and its legend
df_clip = pd.DataFrame(list(dict_df_legends_images.items()), columns=['path', 'legend'])

3746


In [60]:
import os
import shutil

# Define the path for the training set folder
training_set_folder = 'training_set'

# Create the training set folder if it doesn't exist
os.makedirs(training_set_folder, exist_ok=True)

# Iterate through each row in the df_clip dataframe
for image_path in df_clip['path']:
    
    # Ensure the image file exists before copying
    if os.path.exists(image_path):
        # Define the destination path in the training set folder with a new file name
        destination_path = os.path.join(training_set_folder, os.path.basename(image_path))
        if not os.path.exists(destination_path):
            # Copy the image file to the training set folder
            shutil.copy(image_path, destination_path)
            print(f'Image copied: {image_path} -> {destination_path}')
    else:
        print(f'Image not found: {image_path}')

Image copied: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K24795/photos/jpg/pho_2K24795_04_01.jpg -> training_set/pho_2K24795_04_01.jpg
Image copied: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K24795/photos/jpg/pho_2K24795_04_02.jpg -> training_set/pho_2K24795_04_02.jpg
Image copied: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K24795/photos/jpg/pho_2K24795_06_01.jpg -> training_set/pho_2K24795_06_01.jpg
Image copied: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K24795/photos/jpg/pho_2K24795_06_02.jpg -> training_set/pho_2K24795_06_02.jpg
Image copied: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K24795/photos/jpg/pho_2K24795_07_01.jpg -> training_set/pho_2K24795_07_01.jpg
Image copied: /Users/hadjermohabeddine/projet_modelisation/corpus_lipade/fonds_forbin/2K24795/photos/jpg/pho_2K24795_07_02.jpg -> training_set/pho_2K24795_07_02.jpg
Image copi

In [61]:
# Add a column that contains the image name
df_clip['image_name'] = [os.path.basename(path) for path in df_clip['path']]
# save the dataframe as a xlsx file
columns_to_save = ['image_name', 'legend']


In [62]:
def text_cleaning(text):
  """
    Cleans the input text by replacing newline characters with spaces,
    converting the text to lowercase, and stripping leading and trailing whitespaces.

    Parameters:
    - text (str): The input text to be cleaned.

    Returns:
    str: The cleaned text with newline characters replaced, converted to lowercase, and stripped.
  """
  text = text.replace("\n", " ")
  text = text.lower()
  text = text.strip()
  return text

# clean the legend column
df_clip['legend'] = df_clip['legend'].apply(text_cleaning)
display(df_clip)

Unnamed: 0,path,legend,image_name
0,/Users/hadjermohabeddine/projet_modelisation/c...,forbin delagement,pho_2K24795_04_01.jpg
1,/Users/hadjermohabeddine/projet_modelisation/c...,forbin delagement,pho_2K24795_04_02.jpg
2,/Users/hadjermohabeddine/projet_modelisation/c...,forbin,pho_2K24795_06_01.jpg
3,/Users/hadjermohabeddine/projet_modelisation/c...,forbin,pho_2K24795_06_02.jpg
4,/Users/hadjermohabeddine/projet_modelisation/c...,forbin kara borun (salonique) the french sailo...,pho_2K24795_07_01.jpg
...,...,...,...
3741,/Users/hadjermohabeddine/projet_modelisation/c...,a 155 m/m rimailho piece in the snow that has ...,np_surlevif_19160415_08_05.jpg
3742,/Users/hadjermohabeddine/projet_modelisation/c...,"a trench upset by the struggle, and hence the ...",np_surlevif_19160415_08_06.jpg
3743,/Users/hadjermohabeddine/projet_modelisation/c...,command levels,np_surlevif_19160415_11_01.jpg
3744,/Users/hadjermohabeddine/projet_modelisation/c...,underground work,np_surlevif_19160415_11_02.jpg


In [63]:
from langdetect import detect
from googletrans import Translator

def translate_to_english(text):
  """
    Translates the input text from French to English using the Google Translate API.

    Parameters:
    - text (str): The input text to be translated.

    Returns:
    str: The translated text in English.

    Note:
    The function detects the language of the input text, and if it is already in English,
    the original text is returned without translation.

    External Dependencies:
    - The function relies on the `detect` and `Translator` functions from the `langdetect` and `googletrans` libraries.
      Make sure to install these dependencies before using this function.
  """
  # Detect the language of the input text
  detected_language = detect(text)

  if detected_language == 'en':
    # Text is already in English
    return text
  else:
    translator = Translator()
    translation = translator.translate(text, src='fr', dest='en')
    # print('text fr', text)
    # print('text en', translation.text)
    return translation.text
  
df_clip['legend'] = df_clip['legend'].apply(translate_to_english)
display(df_clip) 

Unnamed: 0,path,legend,image_name
0,/Users/hadjermohabeddine/projet_modelisation/c...,Forbin Delament,pho_2K24795_04_01.jpg
1,/Users/hadjermohabeddine/projet_modelisation/c...,Forbin Delament,pho_2K24795_04_02.jpg
2,/Users/hadjermohabeddine/projet_modelisation/c...,forbin,pho_2K24795_06_01.jpg
3,/Users/hadjermohabeddine/projet_modelisation/c...,forbin,pho_2K24795_06_02.jpg
4,/Users/hadjermohabeddine/projet_modelisation/c...,forbin kara borun (salonique) the french sailo...,pho_2K24795_07_01.jpg
...,...,...,...
3741,/Users/hadjermohabeddine/projet_modelisation/c...,a 155 m/m rimailho piece in the snow that has ...,np_surlevif_19160415_08_05.jpg
3742,/Users/hadjermohabeddine/projet_modelisation/c...,"a trench upset by the struggle, and hence the ...",np_surlevif_19160415_08_06.jpg
3743,/Users/hadjermohabeddine/projet_modelisation/c...,command levels,np_surlevif_19160415_11_01.jpg
3744,/Users/hadjermohabeddine/projet_modelisation/c...,Underground Work,np_surlevif_19160415_11_02.jpg


In [64]:
# save the dataframe as a xlsx file
df_clip[columns_to_save].to_excel(os.path.join(training_set_folder, 'df_training_translated.xlsx'))