In [47]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import shutil

import helper as hp

from random import sample

In [2]:
def load_dataframe(filename):
    dataframe = pd.read_csv(filename)
    dataframe.drop(columns = ['Unnamed: 0'], inplace = True)
    print('Found {} rows '.format(str(len(dataframe))))
    return dataframe

In [3]:
all_train_df = load_dataframe('full_datasets/all_train.csv')
all_train_df.head()

Found 7224612 rows 


Unnamed: 0,label,path
0,Lube,/2425/1/115_Lube_45484.jpg
1,Spencerian,/2425/1/114_Spencerian_73323.jpg
2,accommodatingly,/2425/1/113_accommodatingly_613.jpg
3,CARPENTER,/2425/1/112_CARPENTER_11682.jpg
4,REGURGITATING,/2425/1/111_REGURGITATING_64100.jpg


In [4]:
all_valid_df = load_dataframe('full_datasets/all_validation.csv')
all_valid_df.head()

Found 802734 rows 


Unnamed: 0,label,path
0,MONIKER,/2697/6/466_MONIKER_49537.jpg
1,Ecclesiastics,/2697/6/465_Ecclesiastics_24500.jpg
2,FIRESTORM,/2697/6/464_FIRESTORM_29099.jpg
3,Psi,/2697/6/463_Psi_60982.jpg
4,Repurchases,/2697/6/462_Repurchases_64997.jpg


In [5]:
all_test_df = load_dataframe('full_datasets/all_test.csv')
all_test_df.head()

Found 891927 rows 


Unnamed: 0,label,path
0,slinking,/3000/7/182_slinking_71711.jpg
1,REMODELERS,/3000/7/181_REMODELERS_64541.jpg
2,Chronographs,/3000/7/180_Chronographs_13538.jpg
3,Impeaching,/3000/7/179_Impeaching_38222.jpg
4,discombobulated,/3000/7/178_discombobulated_22063.jpg


In [12]:
def create_full_directory(df):
    directory = hp.assets_directory()
    df['full_path'] = directory + df['path']
    return df

In [28]:
train_df = all_train_df.sample(100000)
valid_df = all_valid_df.sample(20000)
test_df = all_test_df.sample(20000)

train_df = create_full_directory(train_df)
valid_df = create_full_directory(valid_df)
test_df = create_full_directory(test_df)

print('Training dataset size: ' + str(len(train_df)))
print('Validation dataset size: ' + str(len(valid_df)))
print('Test dataset size: ' + str(len(test_df)))

train_df.head()

Training dataset size: 100000
Validation dataset size: 20000
Test dataset size: 20000


Unnamed: 0,label,path,full_path
5996700,Eerier,/408/3/125_Eerier_24690.jpg,/Users/jbenavidesv/Workspace/ocr_detector/90kD...
4416676,Begets,/944/3/62_Begets_6696.jpg,/Users/jbenavidesv/Workspace/ocr_detector/90kD...
1950801,hutches,/1771/3/462_hutches_37481.jpg,/Users/jbenavidesv/Workspace/ocr_detector/90kD...
3829770,lobbyist,/1141/4/369_lobbyist_44975.jpg,/Users/jbenavidesv/Workspace/ocr_detector/90kD...
4786390,cheekiest,/814/6/233_cheekiest_12962.jpg,/Users/jbenavidesv/Workspace/ocr_detector/90kD...


In [31]:
def save_files(root, df):
    print('\n')
    paths = df['path'].values
    full_paths = df['full_path'].values
    
    print('\n')
    print('Processing ' + str(root))
    for idx in range(0, len(paths)):
        path = paths[idx]
        f_path = full_paths[idx]
        
        if idx % 500 == 0:
            print('Processed {} images of {}'.format(idx, len(paths)))
        
        dest_path = root + path
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        shutil.copy(f_path, dest_path)

In [32]:
save_files('datasets/train', train_df)
save_files('datasets/valid', valid_df)
save_files('datasets/test', test_df)





Processing demo/train
Processed 0 images of 100000
Processed 500 images of 100000
Processed 1000 images of 100000
Processed 1500 images of 100000
Processed 2000 images of 100000
Processed 2500 images of 100000
Processed 3000 images of 100000
Processed 3500 images of 100000
Processed 4000 images of 100000
Processed 4500 images of 100000
Processed 5000 images of 100000
Processed 5500 images of 100000
Processed 6000 images of 100000
Processed 6500 images of 100000
Processed 7000 images of 100000
Processed 7500 images of 100000
Processed 8000 images of 100000
Processed 8500 images of 100000
Processed 9000 images of 100000
Processed 9500 images of 100000
Processed 10000 images of 100000
Processed 10500 images of 100000
Processed 11000 images of 100000
Processed 11500 images of 100000
Processed 12000 images of 100000
Processed 12500 images of 100000
Processed 13000 images of 100000
Processed 13500 images of 100000
Processed 14000 images of 100000
Processed 14500 images of 100000
Processe

Processed 5000 images of 20000
Processed 5500 images of 20000
Processed 6000 images of 20000
Processed 6500 images of 20000
Processed 7000 images of 20000
Processed 7500 images of 20000
Processed 8000 images of 20000
Processed 8500 images of 20000
Processed 9000 images of 20000
Processed 9500 images of 20000
Processed 10000 images of 20000
Processed 10500 images of 20000
Processed 11000 images of 20000
Processed 11500 images of 20000
Processed 12000 images of 20000
Processed 12500 images of 20000
Processed 13000 images of 20000
Processed 13500 images of 20000
Processed 14000 images of 20000
Processed 14500 images of 20000
Processed 15000 images of 20000
Processed 15500 images of 20000
Processed 16000 images of 20000
Processed 16500 images of 20000
Processed 17000 images of 20000
Processed 17500 images of 20000
Processed 18000 images of 20000
Processed 18500 images of 20000
Processed 19000 images of 20000
Processed 19500 images of 20000


In [38]:
train_df = train_df.drop('full_path', axis = 1)
train_df.head()

KeyError: "['full_path'] not found in axis"

In [41]:
def extend_path(prefix, df):
    df['path'] = prefix + df['path']
    return df

In [43]:
train_df = extend_path('/datasets/train', train_df)
train_df.head()

Unnamed: 0,label,path
5996700,Eerier,/datasets/train/408/3/125_Eerier_24690.jpg
4416676,Begets,/datasets/train/944/3/62_Begets_6696.jpg
1950801,hutches,/datasets/train/1771/3/462_hutches_37481.jpg
3829770,lobbyist,/datasets/train/1141/4/369_lobbyist_44975.jpg
4786390,cheekiest,/datasets/train/814/6/233_cheekiest_12962.jpg


In [48]:
df = valid_df.copy()
df = extend_path('/datasets/valid', df)
hp.store_dataframe_to_csv(df, '/datasets/valid_df.csv', ['label', 'path'])
df.head()

AttributeError: module 'helper' has no attribute 'store_dataframe_to_csv'