In [1]:
import cv2
import os
import shutil as sh
import pandas as pd
import numpy as np
from numpy import argmax
os.chdir('/Users/iandouglas/Projects/Repos/generative-ai/')

#### Setup

Move gallery images into before and after folders so they can be accessed by the dataloader.

In [2]:
gallery_dir = 'data/image_extraction/progresspics/gallery'
galleries = os.listdir(gallery_dir)
[os.listdir(f"{gallery_dir}/{x}")for x in galleries[0:2]]

[['imgur_BMdqM_004_OmMXO2v.jpg',
  'imgur_BMdqM_003_dw0JL2M.jpg',
  'imgur_BMdqM_002_S62FoPS.jpg',
  'imgur_BMdqM_001_WDXyKXs.jpg'],
 ['imgur_zXvx0_003_DXgggtF.jpg',
  'imgur_zXvx0_002_aDrVOqE.jpg',
  'imgur_zXvx0_001_gN7WdB0.jpg',
  'imgur_zXvx0_004_xcv5yjV.jpg',
  'imgur_zXvx0_005_Xxs4PWP.jpg']]

In [16]:
before_dir = 'data/img/before'
after_dir = 'data/img/after'
[os.makedirs(x, exist_ok = True) for x in [before_dir, after_dir]]

[None, None]

In [17]:
def copy_before_after(dir_):
    filename = f"{dir_}.jpg"
    images = os.listdir(f"{gallery_dir}/{dir_}")
    if len(images) > 1:
        image_num = [int(x.split('_')[2]) for x in images]
        before = images[0]
        after = images[argmax(image_num)]
        sh.copy(f"{gallery_dir}/{dir_}/{before}",
                f"{before_dir}/{filename}")
        sh.copy(f"{gallery_dir}/{dir_}/{after}",
                f"{after_dir}/{filename}")

In [18]:
for i in galleries:
    copy_before_after(i)

In [20]:
print(len(os.listdir('data/img/before')))
len(os.listdir('data/img/after'))

8307


8307

### Splitting images

Before/after images that were downloaded as a single file need to be split. Most of the images are simply pasted together, while others have borders. If borders are detected, crop the individual images easily from them. Otherwise:

* Using a sliding window of size h x 2 (two pixels wide), find the columns that have the largest mean squared difference in hue (on greyscale).
* This is most likely the point where they created their collage and appended their two images.
* Search is narrowed to the inner 3/5 of the image.

In [22]:
def find_best_line_and_split(file_path, before_dir, after_dir):
    filename = file_path.split('/')[-1]
    id_ = filename.split('.')[0]
    
    # Read the grayscale image
    img_gray = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)

    # Window size
    # It should be as small as possible
    window_width = 2

    # Define the range for the window search (1/5 to 4/5 from the left)
    start_col = int(img_gray.shape[1] / 5)
    end_col = int(4 * img_gray.shape[1] / 5)

    # Initialize variables to store the best line information
    best_line_col1 = start_col
    best_line_col2 = start_col + window_width
    max_mse = 0 # start at no difference between columns

    # Iterate through columns within the specified range
    for col in range(start_col, end_col - window_width):
        # Extract two columns in the window
        window_cols = img_gray[:, col:col+window_width]

        # Calculate mean squared absolute difference
        mse = np.mean(np.square(np.diff(window_cols)))

        # Update best line if current mse is greater
        if mse > max_mse:
            max_mse = mse
            best_line_col1 = col
            best_line_col2 = col + window_width

    # Read the original color image to split it (don't save greyscale!)
    img_color = cv2.imread(file_path)

    # Crop the color image into "before" and "after"
    before_img = img_color[:, :best_line_col1, :]
    after_img = img_color[:, best_line_col1:, :]

    # Save the cropped color images with the original filename
    out_before = f"{before_dir}/{filename}"
    out_after = f"{after_dir}/{filename}"
    cv2.imwrite(out_before, before_img)
    cv2.imwrite(out_after, after_img)


In [33]:
img_dir = 'data/image_extraction/progresspics/url/'
url_images = [img_dir + x for x in os.listdir(img_dir)] # full paths

In [34]:
test_urls = pd.Series(url_images).sample(n = 100)

In [35]:
before_dir = 'data/image_extraction/progresspics/split_url/before/'
after_dir = 'data/image_extraction/progresspics/split_url/after/'
test_urls.apply(lambda x: find_best_line_and_split(x, before_dir, after_dir))

82295    None
25316    None
34577    None
63077    None
33927    None
         ... 
55977    None
12696    None
35312    None
25631    None
56340    None
Length: 100, dtype: object