In [13]:
#### 시작

# General purpose libraries
import pandas as pd
import numpy as np
import warnings
import math 
from PIL import Image
from pathlib import Path 
from multiprocessing import Pool

# Plotting libraries
import plotly 
import plotly.express as px
import plotly.graph_objects as go 
from plotly.subplots import make_subplots
import plotly.io as pio 

# Model
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K

# Metrics
from sklearn.metrics import (confusion_matrix, 
                             accuracy_score, 
                             f1_score, 
                             recall_score, 
                             precision_score)

# Setting the plotly theme
pio.templates.default = 'plotly_white'

# Filter warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Define categories and encoding 
CATS = ['E', 'S', 'SB']
CATS_TO_IDX = {v: k for k, v in enumerate(CATS)}
IDX_TO_CATS = {k: v for k, v in enumerate(CATS)}

# Define categories colors 
CATS_COLORS = ['red', 'blue', 'green']
COLOR_MAP = {k: v for k, v in zip(CATS, CATS_COLORS)}

# Define images shapes 
SHAPES = ['small', 'medium', 'large']

# Data preparation
TEST_FRAC = 0.3

# Models params
IMG_PARAMS = {
    'small':{
        'height': 69, 
        'width': 69, 
        'channels': 3, 
    }, 
    'medium':{
        'height': 227, 
        'width': 227, 
        'channels': 3
    },
    'large':{
        'height': 299, 
        'width': 299, 
        'channels': 3
    }
}
TRAIN_PATH = Path('/kaggle/working/train')
TEST_PATH = Path('/kaggle/working/test')
AUTOTUNE = tf.data.AUTOTUNE
BATCH_SIZE = 32
TRAIN_TEST_SPLIT = 0.3
EPOCHS = 30

project_path = Path('D:\galaxy')
path_69 = project_path.joinpath('images_E_S_SB_69x69_a_03')
path_227 = project_path.joinpath('images_E_S_SB_227x227_a_03')
path_299 = project_path.joinpath('images_E_S_SB_299x299_a_03')
path_csv = project_path.joinpath('3class_map_a(p).csv')



In [14]:
df = pd.read_csv(path_csv)
df.head()

Unnamed: 0.1,Unnamed: 0,dr7objid,asset_id,gz2class,total_classifications,total_votes,agreement
0,0,587732591714893851,58957,Sc+t,45,342,1.0
1,1,588009368545984617,193641,Sb+t,42,332,1.0
2,2,587732484359913515,55934,Ei,36,125,0.384527
3,3,587741723357282317,158501,Sc+t,28,218,0.766954
4,4,587738410866966577,110939,Er,43,151,0.399222


In [15]:
def itterate_folder(folder):
    img_type = '.jpg'
    files = []
    for sub_folder in folder.iterdir(): 
        for active_folder in sub_folder.iterdir(): 
            files += [f for f in active_folder.iterdir() if f.suffix == img_type]
    return files 

def build_df(files):
    path = list(map(str, files))
    asset_id = list(map(lambda x: x.stem, files))
    return pl.DataFrame(dict(path=path, asset_id=asset_id))

small_files = build_df(itterate_folder(path_69)).rename({'path': 'path_small'})
medium_files = build_df(itterate_folder(path_227)).rename({'path': 'path_medium'})
large_files = build_df(itterate_folder(path_299)).rename({'path': 'path_large'})

images_df = (
    small_files
    .join(medium_files, on='asset_id')
    .join(large_files, on='asset_id')
    .with_columns(
        target=pl.col('path_small').str.split("/").list.get(-2)
    )
    .select(['asset_id', 'target', 'path_small', 'path_medium', 'path_large'])
)

df = (
    df
    .with_columns(pl.col('asset_id').cast(pl.Utf8))
    .join(images_df, on='asset_id', how='left')
)
df.describe()

df = df.drop_nulls()
print(f'The new dataset has {df.shape[0]} lines')

NameError: name 'pl' is not defined

In [12]:
#### 전처리
def symlink_files(dests, source, shape, cat):
    #cat_idx = str(CATS_TO_IDX[cat])
    cat_idx = cat
    source_path = source.joinpath(shape).joinpath(cat_idx)
    source_path.mkdir(exist_ok=True, parents=True)
    dests = map(lambda x: Path(x), dests)
    for f in dests: 
        filename = f.name
        source = source_path.joinpath(filename)
        source.symlink_to(f)
    

def generate_infra(df, source, n_samples, test_frac=TEST_FRAC):
    train_frac = 1 - test_frac
    train_size = np.ceil(train_frac*n_samples).astype(int)
    test_size = n_samples - train_size
    folder_col = ['train']*train_size + ['test']*test_size
    for cat in CATS:
        df_res = (
            df[df['target'] == cat]
            .sort_values(by='agreement', ascending=False)
            .sample(n=n_samples, random_state=45)
            .filter(['path_small', 'path_medium', 'path_large'])
        )
        df_res['dest_folder'] = folder_col
        for folder in ['test', 'train']:
            folder_source = source.joinpath(folder)
            data = df_res[df_res['dest_folder'] == folder]
            for shape in SHAPES:
                dests = data['path_' + shape].tolist()
                symlink_files(dests, folder_source, shape, cat)
                
wk_path = Path('/kaggle/working/')
n_samples = df[df['target'] == 'SB'].shape[0]

try:
    generate_infra(df, wk_path, n_samples)
except FileExistsError:
    pass 

for shape in SHAPES:
    p = TRAIN_PATH.joinpath(shape)
    for cat in p.iterdir():
        files = [f for f in cat.iterdir()]
        l = len(files)
        s = files[0].is_symlink()
        print(f'The dataset {shape} for category {cat.name} has {l} files. Symlink is {s}')

KeyError: 'target'