In [8]:
import os
import glob
from pathlib import Path
import pandas as pd
import numpy as np

#substitute where you have all your .csv files saved
filepath = Path('G:/LZX Stuff/11-11-2022/data')

In [9]:
#lists directories to make sure you're in the right place, 
#you can skip running this cell. It just prints all the files in the dir

for filename in os.listdir(filepath):
    f = os.path.join(filepath, filename)
    print(f)

G:\LZX Stuff\11-11-2022\data\11-11-2022_1.csv
G:\LZX Stuff\11-11-2022\data\11-11-2022_2.csv
G:\LZX Stuff\11-11-2022\data\11-11-2022_3.csv
G:\LZX Stuff\11-11-2022\data\11-11-2022_4.csv
G:\LZX Stuff\11-11-2022\data\11-11-2022_5.csv


In [10]:
#make a glob of files and concatenate all .csvs into a single dataframe

all_files = glob.glob(os.path.join(filepath, "*.csv"))
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

#check the dataframe (should see 2 columns, image and prompt)
df.head()


Unnamed: 0,image,prompt
0,00005.png,"a bottle of wine sitting on top of a table, a ..."
1,00002.png,a purple box sitting on top of a snow covered ...
2,00001.png,"a computer mouse sitting on top of a table, a ..."
3,00003.png,"a close up of a clock with numbers on it, insp..."
4,00004.png,"a close up of a clock on a wall, inspired by S..."


In [11]:
#check size of dataframe
print(df.shape)

(6287, 2)


In [93]:
#splits prompt into "subject," genre", "artist," and "flavor"
#i'd made the assumption clip interrogator writes these things always in the
# order subject-genre-artist-flavors, but seems like genre/artist flip sometimes.
#not sure how to account for this.

split_df = df['prompt'].str.split(',',n=3, expand=True)
split_df = split_df.rename(columns={0: "Subject", 1: "Genre", 2:"Artist",3:"Flavor"})
print(split_df.head())



                                             Subject  \
0         a bottle of wine sitting on top of a table   
1  a purple box sitting on top of a snow covered ...   
2         a computer mouse sitting on top of a table   
3           a close up of a clock with numbers on it   
4                    a close up of a clock on a wall   

                               Genre                       Artist  \
0                  a raytraced image   inspired by Tadanori Yokoo   
1                       a screenshot     inspired by Sigmar Polke   
2                  a raytraced image               crystal cubism   
3   inspired by Stanisław Witkiewicz                    video art   
4   inspired by Stanisław Witkiewicz                    video art   

                                              Flavor  
0   video art, seapunk mecha, camera looking down...  
1   video art, seapunk mecha, old distorted camco...  
2   damaged camcorder video, purple color pallete...  
3   jsrf, vapourwave, splato

In [94]:
# grab 20% of whatever the size of the Series is
percentile = split_df['Artist'].value_counts().size/5
percentile = round(percentile)
#check your math by uncommenting the next two lines and running cell
#print(percentile)
#print(df['Artist'].value_counts().size)

#then use as a slicer to print top 20% of results
print(split_df['Artist'].value_counts()[:percentile])


 video art                           983
 inspired by Roy Newell              979
 polycount                           883
 inspired by Stanisław Witkiewicz    384
 inspired by Alfred Manessier        268
                                    ... 
 water mirrored water                  6
 inspired by Sadamichi Hirasawa        6
 cubism                                6
 inspired by Bedwyr Williams           5
 inspired by Benoit B. Mandelbrot      5
Name: Artist, Length: 62, dtype: int64


In [15]:
#Flavors are a column of lists "Flavor 1, flavor 2, flavor 3..."
#So this splits them into separate columns for each list entry
flavors = split_df['Flavor'].str.split(',', expand=True)

#Then melts it into a SINGLE column, which we can now use to see
flavors_melted = flavors.melt(var_name = 'Column', value_name = 'Flavor')

#how often each INDIVIDUAL flavor appears
flavors_melted['Flavor'].value_counts()

 video art                        2921
 2 0 1 5 live music video         1321
 aspect ratio 16:9                 911
 cartoon network stillframe        810
 polycount                         640
                                  ... 
 mechanical butterfly                1
 taken on a 1960s kodak camera       1
 retro                               1
 void manifold                       1
 film color                          1
Name: Flavor, Length: 6228, dtype: int64

In [95]:
#reduces the size of the input list to the top X% of entries
def get_top_percentile(percentile, frame, column):
    percentile = split_df[column].value_counts().size*percentile
    percentile = round(percentile)
    
    #there's gotta be a cleaner way to do this.
    new_frame = frame[column].value_counts()[:percentile].to_frame().reset_index()
    new_frame = new_frame.rename(columns={'index': column, column: 'count'})
    return new_frame

#uncomment to make sure it's working
#print(get_top_percentile(.1, flavors_melted, 'Flavor'))

                             Flavor  count
0                         video art   2921
1          2 0 1 5 live music video   1321
2                 aspect ratio 16:9    911
3        cartoon network stillframe    810
4                         polycount    640
..                              ...    ...
611   bass sound waves on circuitry     11
612                   color tearing     11
613                        rippling     11
614        purple blue color scheme     11
615          the ring is horizontal     11

[616 rows x 2 columns]


In [16]:
#determines what percentiles to take from 

pct = .1
top_subjects = get_top_percentile(pct*.75, split_df, 'Subject')
top_artists = get_top_percentile(pct*2, split_df, 'Artist')
top_genres = get_top_percentile(pct, split_df, 'Genre')
top_flavors = get_top_percentile(pct*.5, flavors_melted, 'Flavor')

print(top_flavors)

                              Flavor  count
0                          video art   2921
1           2 0 1 5 live music video   1321
2                  aspect ratio 16:9    911
3         cartoon network stillframe    810
4                          polycount    640
..                               ...    ...
303                              vhs     21
304   cinematic panavision 5384 film     21
305                 anime screenshot     21
306           sharp irregular shapes     21
307                  vhs distortions     21

[308 rows x 2 columns]


In [91]:
#okay, so what's next? get weighted random choice from each of these.

import random

def prompt_piece(dataframe, num):
    return random.choices(dataframe[dataframe.columns[0]], dataframe[dataframe.columns[1]],k = num)

def get_flavors(flavs):
    for flavor in flavs:
        return prompt_piece(top_flavors)[0]

#change the number here to set the maximum number of flavors
flavor = prompt_piece(top_flavors, 8)

#the below lines de-dupe the list of flavors. that's what's happening if you
#get fewer than the max
deduped = list(set(flavor))
flavs_out = ", ".join(deduped)


print(f'{prompt_piece(top_subjects, 1)[0]},{prompt_piece(top_artists,1)[0]},{prompt_piece(top_genres,1)[0]},',flavs_out)

a multicolored image of an abstract design, inspired by Stanisław Witkiewicz, computer graphics,  ripple,  shot with a arriflex 35 ii,  tv color test pattern,  aspect ratio 16:9,  64x64,  2 0 1 5 live music video,  red and blue color scheme,  blue color bleed


In [92]:
#this one will just pull out a list of flavors from one individual image
#this will produce slightly different results, depending on how homogenous
#the input data is

with np.printoptions(edgeitems=50):
    print(split_df["Flavor"].sample().tolist())

[' polycount, computer art, weeping tears of black oil, large triangular shapes, exploitable image, still frame from a movie, mountain water']
