In [10]:
import pandas as pd
import numpy as np
import os
import fnmatch
from datetime import datetime,timedelta  

def read_image_lst_info(srcdir):
    """Collect image files under ``srcdir`` into a DataFrame of LST-style rows.

    The folder tree is walked recursively. The name of the directory that
    directly contains an image is used as that image's class name.

    Parameters
    ----------
    srcdir : str
        Base folder to search for ``*.jpg`` files.

    Returns
    -------
    pandas.DataFrame
        One row per image, with columns in this order:

        * ``sampid``    -- running sample index, 0..N-1
        * ``classid``   -- position of the class name in the sorted list of
          distinct class names
        * ``path``      -- image path relative to ``srcdir``, written with
          forward slashes on every platform
        * ``classname`` -- name of the directory containing the image
    """

    fileexts = ['*.jpg']

    # search through source folder for sample files
    relpath = []
    subdirname = []
    for root, dirnames, filenames in os.walk(srcdir):
        # os.walk lists entries in filesystem order; sorting dirnames in
        # place (and the files below) makes the traversal, and therefore
        # sampid, reproducible from one machine to the next.
        dirnames.sort()
        matches = sorted({name
                          for ext in fileexts
                          for name in fnmatch.filter(filenames, ext)})
        if not matches:
            continue

        # basename/relpath are separator-agnostic, unlike splitting the
        # walk root on a hard-coded '\\' (which only works on Windows).
        subdir = os.path.basename(os.path.abspath(root))
        for filename in matches:
            full = os.path.join(root, filename)
            relpath.append(os.path.relpath(full, srcdir).replace(os.sep, '/'))
            subdirname.append(subdir)

    # make sample id
    sampid = np.arange(len(subdirname))

    # subdir names are used as class names; ids follow sorted name order
    classnames = sorted(set(subdirname))
    class_to_id = {name: idx for idx, name in enumerate(classnames)}
    classid = [class_to_id[name] for name in subdirname]

    # Column order is stated explicitly: the first three columns are exactly
    # the fields of an LST row (index, label, path).
    return pd.DataFrame({'sampid': sampid,
                         'classid': classid,
                         'path': relpath,
                         'classname': subdirname},
                        columns=['sampid', 'classid', 'path', 'classname'])
    
# Directory containing the image files; the names of its sub-folders are
# used as class names by read_image_lst_info.
# NOTE: this is a relative path, so the notebook must be run from the
#  directory that contains srcdir.
srcdir = './dog-breeds'

# One row per image found under srcdir: sample id, class id, path, class name.
df = read_image_lst_info(srcdir)

In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

# number of independent random train/test splits to generate
# (each split writes its own pair of LST files)
n_splits = 1 

# None = select all classes
n_classes = None 

# prefix for generated LST file names
filenameroot = 'dog_breeds'

# include all classes of image
if n_classes is None:
    filenameroot += '_all'
    df_filt = df
# include only the first specified number of classes
else:
    filenameroot += '_'+str(n_classes)
    df_filt = df[df.classid<n_classes]

# columns of an LST row, in order: sample index, class label, image path
lst_columns = ['sampid', 'classid', 'path']

# Stratified splitting keeps each class's share of samples approximately the
# same in the train and test sets. These are independent random splits, not
# K-fold partitions: test sets of different splits may overlap.
# n_splits must be passed through here, otherwise only one split is ever
# produced regardless of the setting above.
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=0)

print('Generating train/test lst files:')
for i, (train_index, test_index) in enumerate(
                    sss.split(df_filt, df_filt['classid'])):
    # select LST columns by name rather than by position
    df_train = df_filt.iloc[train_index][lst_columns]
    fname_train = filenameroot+'_fold_%d_train.lst'%(i+1)
    df_train.to_csv(fname_train, index=False, header=False, sep='\t')
    
    df_test = df_filt.iloc[test_index][lst_columns]
    fname_test = filenameroot+'_fold_%d_test.lst'%(i+1)
    df_test.to_csv(fname_test,index=False, header=False, sep='\t')
    
    print('split',i+1)
    print('  train: %d samples, %d classes, %s'%(
        df_train.shape[0], len(df_train.classid.unique()), fname_train))
    print('  test: %d samples, %d classes, %s'%(
        df_test.shape[0], len(df_test.classid.unique()), fname_test))

Generating train/test lst files:
split 1
  train: 1636 samples, 13 classes, dog_breeds_all_fold_1_train.lst
  test: 409 samples, 13 classes, dog_breeds_all_fold_1_test.lst
