# Image preprocessing for predicting popularity of Instagram posts!
I start by resizing the images to 224x224, as this is the input size best suited to the ResNet50 model I will use. This choice is based on the intrinsic popularity assessment paper and the Zhang paper.

In [20]:
import os
from PIL import Image
import numpy as np
import json

# Paths to your dataset folders
data_dir = 'D:/DSS Thesis/Data/Instagram_alternative_brand/extracted_img_fi02/img_fi02/img_resized'
output_dir = 'D:/DSS Thesis/Data/Instagram_alternative_brand/image_pre_processing/output'
post_info_path = 'D:/DSS Thesis/Data/Instagram_alternative_brand/post_info.txt'

# Load the post metadata: one tab-separated record per line.
with open(post_info_path, 'r') as file:
    lines = file.readlines()

# Collect every image filename referenced by any post. A set gives O(1)
# membership checks when the image directory is scanned afterwards.
image_filenames = set()
for line in lines:
    parts = line.strip().split('\t')
    if parts == ['']:
        # Blank line (e.g. trailing newline padding): nothing to parse.
        continue
    # The fifth column holds a list of filenames written with single quotes;
    # swap them for double quotes so the list parses as JSON.
    # NOTE(review): this breaks on a filename containing an apostrophe.
    filenames = json.loads(parts[4].replace("'", '"'))
    image_filenames.update(filenames)

# Batching
batch_size = 15000  # maximum number of images accumulated before a batch is written
# Batch numbering starts after 10, so the first file written here is batch 11.
# NOTE(review): presumably batches 1-10 were produced by an earlier run - confirm.
batch_count = 10
image_data = []
filepaths = []


def _save_batch(batch_index, images, names):
    """Write one batch of pixel arrays and the matching filenames to output_dir."""
    np.save(os.path.join(output_dir, f"data120_batch{batch_index}.npy"), np.array(images))
    np.save(os.path.join(output_dir, f"filenames_batch{batch_index}.npy"), names)


# np.save does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

# List and process the files
with os.scandir(data_dir) as entries:
    for entry in entries:
        # Only .jpg files that are referenced in the post metadata are wanted.
        if not (entry.is_file() and entry.name.endswith('.jpg') and entry.name in image_filenames):
            continue

        file_path = entry.path
        try:
            with Image.open(file_path) as image:
                # Force three channels so every array in a batch has the same shape.
                rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
                im = rgb_image.resize((224, 224), Image.LANCZOS)
                pixels = np.array(im)
        except Exception as e:
            # Best effort: an unreadable or corrupt image must not stop the run.
            print(f"Error processing image '{file_path}': {e}")
            continue  # Skip this image and move on to the next one

        # Append both lists together so pixel data and filenames stay aligned.
        image_data.append(pixels)
        filepaths.append(entry.name)

        # Save in batches. The trigger is the number of images actually
        # collected, not the position in the directory listing, so every batch
        # holds exactly batch_size images and memory use stays bounded.
        if len(image_data) >= batch_size:
            batch_count += 1
            _save_batch(batch_count, image_data, filepaths)
            image_data, filepaths = [], []

# Save any remaining images
if image_data:
    batch_count += 1
    _save_batch(batch_count, image_data, filepaths)


KeyboardInterrupt: 