In [1]:
import os
# Change the working directory to the parent directory; presumably so that the
# relative paths used later (such as './data/') resolve — TODO confirm.
# NOTE(review): the target is relative to the current directory, so re-running
# this cell moves up one more level each time.
os.chdir('../')

In [2]:
import pandas as pd
from pathlib import Path
from PIL import Image
import json
import random
import subprocess
from tqdm.notebook import tqdm
from ipywidgets import FloatProgress

In [3]:
# Base directory for input data, relative to the current working directory.
root_path = Path('./data/')

In [4]:
# Style catalogue: drop rows without a file name, then order by file name
# and renumber the rows from zero.
style_df = (
    pd.read_csv(root_path / 'metadata' / 'style.csv')
    .dropna(subset=['File_name'])
    .sort_values('File_name')
    .reset_index(drop=True)
)

# Content catalogue: same cleaning steps as the style catalogue.
content_df = (
    pd.read_csv(root_path / 'metadata' / 'content.csv')
    .dropna(subset=['File_name'])
    .sort_values('File_name')
    .reset_index(drop=True)
)

# Show both shapes as the cell output.
content_df.shape, style_df.shape

((126, 8), (100, 6))

In [5]:
def cartesian_product(d):
    """
    Build a DataFrame holding every combination of the given values.

    :param d: mapping of column name -> iterable of values for that column
    :return: DataFrame with one column per key of ``d`` and one row per
        combination, numbered with a fresh default index
    """
    column_names = d.keys()
    column_values = d.values()

    combinations = pd.MultiIndex.from_product(column_values, names=column_names)
    # An otherwise empty frame carrying the combinations as its index; resetting
    # the index turns each level into an ordinary column.
    frame = pd.DataFrame(index=combinations)
    return frame.reset_index()

# Values used for the 'weight' column of every combination.
weights = [5e4]

# One row per (content file, style file, weight) combination.
combination_axes = {
    'content': content_df['File_name'],
    'style': style_df['File_name'],
    'weight': weights,
}
prod = cartesian_product(combination_axes)

# 1-based running number of each combination.
prod['index'] = list(prod.index + 1)

# Status flags, all starting out as False.
for flag_column in ('gifs', 'images', 'metadata', 'to_review'):
    prod[flag_column] = False

In [6]:
# Attach the content metadata to every combination and drop the columns that
# are not carried into the final table.
content_merged = prod.merge(content_df, left_on='content', right_on='File_name')
content_merged = content_merged.drop(
    ['Orientation', 'Resolution', 'File_name', 'Looks_good', 'Copyright link'],
    axis=1,
)
# Positional rename: the last three names apply to whatever content columns
# remain after the drop above, in their existing order.
# (Fixed the misspelled 'content_copyight' column name.)
content_merged.columns = [
    'content', 'style', 'weight', 'index',
    'gifs', 'images', 'metadata', 'to_review',
    'content_title', 'content_author', 'content_copyright',
]

# Attach the style metadata the same way.
df = content_merged.merge(style_df, left_on='style', right_on='File_name')
df = df.drop(['Resolution', 'File_name', 'Strength'], axis=1)
# Positional rename again: the last three names apply to the remaining style columns.
df.columns = [
    'content', 'style', 'weight', 'index',
    'gifs', 'images', 'metadata', 'to_review',
    'content_title', 'content_author', 'content_copyright',
    'style_title', 'style_author', 'style_copyright',
]

# Reorder so each file name is followed by its own metadata columns.
df = df[[
    'content', 'content_title', 'content_author', 'content_copyright',
    'style', 'style_title', 'style_author', 'style_copyright',
    'weight', 'index', 'gifs', 'images', 'metadata', 'to_review',
]]
# Both copyright columns are overwritten with False for every row.
df[['content_copyright', 'style_copyright']] = False

# Restore the order of the running combination number.
df = df.sort_values('index').reset_index(drop=True)

In [7]:
import boto3
# Module-level S3 client; iterate_bucket_items() reads this global.
client = boto3.client('s3')

In [8]:
def iterate_bucket_items(bucket, prefix):
    """
    Generator over the objects of an s3 bucket whose keys start with a prefix.

    Uses the module-level ``client`` and its ``list_objects_v2`` paginator.
    See http://boto3.readthedocs.io/en/latest/reference/services/s3.html#S3.Client.list_objects_v2
    for return data format
    :param bucket: name of s3 bucket
    :param prefix: key prefix passed to the listing call as ``Prefix``
    :return: dict of metadata for an object
    """
    pages = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket,
        Prefix=prefix,
    )

    for listing in pages:
        # Only pages reporting at least one key are read for 'Contents'.
        if page_is_empty := listing['KeyCount'] <= 0:
            continue
        yield from listing['Contents']

In [20]:
def check_component(component, bucket='neuralism-assets'):
    """
    Flag the rows of ``df`` for which an output object exists in s3.

    Lists every object under ``output/{component}/`` in the bucket, derives a
    number from each file name and sets ``df[component]`` to True on the rows
    whose ``'index'`` value matches one of those numbers.

    :param component: name of both the s3 sub-folder and the ``df`` column to
        update (called below with 'gifs', 'images' and 'metadata')
    :param bucket: name of s3 bucket
    :return: None; ``df`` is modified in place
    """
    prefix = f'output/{component}/'
    found_ids = set()

    # One object per row is the expected amount, so the row count serves as the
    # progress-bar total (it is only an estimate of the listing size).
    items = iterate_bucket_items(bucket=bucket, prefix=prefix)
    for item in tqdm(items, total=len(df)):
        # Keys without a '.' carry no file extension and are skipped.
        if '.' not in item['Key']:
            continue

        # File name without directories, cut at the first '.'.
        key = item['Key'].split('/')[-1].split('.')[0]
        if component == 'gifs':
            # gif names have one extra leading character before the number.
            key = key[1:]

        try:
            found_ids.add(int(key))
        except ValueError:
            # Not a numbered output file (e.g. a hidden or stray file): ignore
            # it instead of aborting the whole scan.
            continue

    # Match against df's own 'index' column (not prod's, which is a different
    # frame) and update all matching rows in a single pass.
    df.loc[df['index'].isin(found_ids), component] = True

In [21]:
# Scan each output folder in turn and update the matching flag column.
for component_name in ('gifs', 'images', 'metadata'):
    check_component(component_name)

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [53]:
# Write the final table to 'final_excel.csv' (relative to the working
# directory) without the DataFrame's row index.
df.to_csv('final_excel.csv', index=False)