In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
import torch 

In [None]:
PATH = os.path.abspath('..')

In [None]:
# get labeled data (excluding test)
label_csv = f'{PATH}/catalogs/SDSSspecgalsDR14_boada.csv'

# Number of data rows = total lines minus the header line. The context manager
# closes the file handle, which a bare open() inside list() left dangling.
with open(label_csv) as label_file:
    n = sum(1 for _ in label_file) - 1
val_idxs = get_cv_idxs(n)

# see a few
df = pd.read_csv(label_csv, index_col='objID')
df.head()

In [None]:
id_num = 1237657070629027993
os.path.isfile(f'{PATH}/images/{id_num}.jpg')

In [None]:
df.columns

### Examine some of the data

In [None]:
# Derive each object ID from its file name by dropping the extension.
# str.strip('.png') removes any of the characters '.', 'p', 'n', 'g' from both
# ends (it is not a suffix remover) and never matched the '.jpg' files anyway.
image_ids = [os.path.splitext(os.path.basename(fname))[0] for fname in glob(f'{PATH}/images/*.jpg')]
image_ids[:5]

In [None]:
from IPython.display import display
for obj in df.index[:5]:
    display(PIL.Image.open(f'{PATH}/images/{obj}.jpg'))

### Examine distributions of each class

In [None]:
classes = ['nii_6584_flux', 'h_alpha_flux', 'oiii_5007_flux', 'h_beta_flux', 'h_delta_flux', 
           'd4000', 'bptclass', 'oh_p50', 'lgm_tot_p50', 'sfr_tot_p50']

# Two rows of panels, one panel per column in `classes`. Rounding the column
# count up (rather than always adding one) avoids a pair of empty axes when the
# number of classes is even.
n_cols = max(1, (len(classes) + 1) // 2)
fig, axes = plt.subplots(nrows=2, ncols=n_cols, figsize=(12, 6), sharey=True)

for ax, col in zip(axes.flat, classes):
    values = df[col]
    # restrict the histogram to the 5th-95th percentile range (NaNs ignored)
    ax.hist(values, range=np.nanpercentile(values, [5, 95]))
    ax.set_xlabel(col)


Let's flag `bptclass` and move on.

## Make a mini training and test sample

I executed the code below to copy a bunch of images to a training set (~10000) and a test set (~5000).

```python
# make a mini training sample of ~15000 images
split_idxs = get_cv_idxs(n, val_pct=20000 / n) 

train_idxs = split_idxs[:-5000]
test_idxs  = split_idxs[-5000:]

# copy files to train-small dir, also make copy of data frame which only has 
# valid images
valid_train_idxs = []
for objid, idx in tqdm_notebook(zip(df.iloc[train_idxs].index, train_idxs), total=len(train_idxs)):
    try:
        shutil.copyfile(f'{PATH}/images/{objid}.jpg', f'{PATH}/train-small/{objid}.jpg')
        valid_train_idxs.append(idx)
    except FileNotFoundError:
        continue

# save mini-dataframe
df_train_small = df.iloc[valid_train_idxs].copy()
df_train_small.to_csv(f'{PATH}/catalogs/train-small.csv')

# do the same thing, except for test-small dataset
valid_test_idxs = []
for objid, idx in tqdm_notebook(zip(df.iloc[test_idxs].index, test_idxs), total=len(test_idxs)):
    try:
        shutil.copyfile(f'{PATH}/images/{objid}.jpg', f'{PATH}/test-small/{objid}.jpg')
        valid_test_idxs.append(idx)
    except FileNotFoundError:
        continue

# save mini-dataframe
df_test_small = df.iloc[valid_test_idxs].copy()
df_test_small.to_csv(f'{PATH}/catalogs/test-small.csv')
```

## Adapt the dataloader for using continuous variable output
Here I used some helper functions defined by @farlion from the fast.ai forums.

In [None]:
def parse_csv_multi_class_values(path_to_csv):
    """Parse filenames and per-class values from a CSV file.

    The CSV file at `path_to_csv` must start with a header row. Its first column
    holds the filenames; every remaining column represents one class, named by
    the corresponding header cell. Completely empty rows (for example a trailing
    blank line) are skipped.

    Arguments:
        path_to_csv: Path to a CSV file.

    Returns:
        a three-tuple of:
            a list of filenames
            a list of values (one list of strings per row) in the same order
            a dictionary of classes by classIndex
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(path_to_csv, newline='') as fileobj:
        reader = csv.reader(fileobj)
        header = next(reader)
        # csv.reader yields [] for blank lines; the unpacking below needs at
        # least one cell per row, so drop those rows here
        csv_lines = [row for row in reader if row]

    fnames = [fname for fname, *_ in csv_lines]
    classes = header[1:]
    values = [vals for _, *vals in csv_lines]
    idx2class = {i: c for i, c in enumerate(classes)}

    return fnames, values, idx2class

def csv_source_multi_class(folder, csv_file, suffix=''):
    """Turn a multi-column label CSV into image paths and a float32 target array.

    Arguments:
        folder: directory name that is prepended to every filename from the CSV.
        csv_file: path to the CSV file, parsed by `parse_csv_multi_class_values`.
        suffix: string appended to every filename (e.g. '.jpg').

    Returns:
        a three-tuple of:
            a list of joined paths (folder + filename + suffix)
            a numpy float32 array built from the parsed values
            the class dictionary returned by the parser
    """
    names, raw_values, idx2class = parse_csv_multi_class_values(csv_file)

    # the parser returns strings; convert them to a float32 array
    targets = np.array(raw_values).astype(np.float32)
    paths = [os.path.join(folder, name + suffix) for name in names]

    return paths, targets, idx2class

@classmethod
def from_multiclass_csv(cls, path, folder, csv_fname, bs=64, tfms=(None,None),
           val_idxs=None, suffix='', test_name=None, num_workers=8):
    """Read in images and their continuous-valued labels given as a CSV file.

    This method should be used when training image labels are given in a CSV file as opposed to
    sub-directories with label names. The first CSV column is the image name; the remaining
    columns are converted to float32 targets by `csv_source_multi_class`.

    Arguments:
        path: a root path of the data (used for storing trained models, precomputed values, etc)
        folder: a name of the folder in which training images are contained.
        csv_fname: a name of the CSV file which contains target labels.
        bs: batch size
        tfms: transformations (for data augmentations). e.g. output of `tfms_from_model`
        val_idxs: index of images to be used for validation. e.g. output of `get_cv_idxs`.
            If None, `get_cv_idxs` is called with the number of images and its default arguments.
        suffix: suffix to add to image names in CSV file (sometimes CSV only contains the file name without file
                extension e.g. '.jpg' - in which case, you can set suffix as '.jpg')
        test_name: a name of the folder which contains test images.
        num_workers: number of workers

    Returns:
        an instance of `cls` built on `FilesIndexArrayRegressionDataset` datasets
    """
    fnames, y, idx2class = csv_source_multi_class(folder, csv_fname, suffix)

    # fall back to a default validation split when none was supplied
    if val_idxs is None:
        val_idxs = get_cv_idxs(len(fnames))
    (val_fnames, trn_fnames), (val_y, trn_y) = split_by_idx(val_idxs, np.array(fnames), y)

    # test images are optional
    test_fnames = None
    if test_name:
        test_fnames = read_dir(path, test_name)

    datasets = cls.get_ds(FilesIndexArrayRegressionDataset, (trn_fnames, trn_y), (val_fnames, val_y),
                          tfms, path=path, test=test_fnames)
    class_names = list(idx2class.values())
    return cls(path, datasets, bs, num_workers, classes=class_names)

ImageClassifierData.from_multiclass_csv = from_multiclass_csv

# Train a simple network to predict `oh_p50`

In [None]:
train_label_csv = f'{PATH}/catalogs/metallicity-train-small.csv'
test_label_csv = f'{PATH}/catalogs/metallicity-test-small.csv'

#df_train_small[['oh_p50']].to_csv(train_label_csv)
#df_test_small[['oh_p50']].to_csv(test_label_csv)

df_train_small = pd.read_csv(train_label_csv, index_col='objID')
df_test_small = pd.read_csv(test_label_csv, index_col='objID')

In [None]:
# load in data with transforms
arch = resnet34
sz = 32
bs = 64

In [None]:
# Number of data rows = total lines minus the header line; the context manager
# makes sure the file handle is closed again.
with open(train_label_csv) as label_file:
    val_idxs = get_cv_idxs(sum(1 for _ in label_file) - 1)

In [None]:
def get_data(sz, bs):
    """Build the 'train-small' / 'test-small' data object.

    Arguments:
        sz: image size handed to `tfms_from_model`.
        bs: batch size handed to the data loader.

    Returns:
        the object created by `ImageClassifierData.from_multiclass_csv`.
    """
    tfms = tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.1)
    # bs must be forwarded explicitly; it was previously dropped here, so the
    # loader always ran with its default batch size of 64
    return ImageClassifierData.from_multiclass_csv(PATH, 'train-small', train_label_csv, bs=bs, tfms=tfms,
                    suffix='.jpg', val_idxs=val_idxs, test_name='test-small', 
                    num_workers=4, )

In [None]:
data = get_data(sz, bs)

~~It appears that a batch size of `16` pushes my RAM to the limits (maybe even triggering swap space?), but I don't get an out of memory error -- except when lots of apps are open.~~ Now that I'm using categorical morphology, this seems to be going much more quickly and is light on my memory usage.

In [None]:
# initialize network
learn = ConvLearner.pretrained(arch, data)

In [None]:
idx = 5 
# print the label of the same sample whose image is displayed below
print(learn.data.trn_y[idx])
PIL.Image.open(PATH + '/' + learn.data.trn_ds.fnames[idx])

In [None]:
# find learning rate
lrf=learn.lr_find()

In [None]:
learn.sched.plot()

### Start fitting the low-res images using `rmse` metric
I'm using a learning rate of 0.1.

In [None]:
def rmse(x, y):
    """Root-mean-square error: the square root of `F.mse_loss(x, y)`."""
    mean_sq_err = F.mse_loss(x, y)
    return mean_sq_err.sqrt()

In [None]:
learn = ConvLearner.pretrained(arch, data)
metrics = [rmse]

learn.crit = rmse

In [None]:
lr = 0.1
learn.fit(lr, 3, cycle_len=1)

In [None]:
learn.save(f'{sz}-small_init-train')

In [None]:
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)

Looks like loss still has a little ways to go, but we'll move on. Before training on bigger images, let's unfreeze the earlier layers and train them too.

In [None]:
# differential learning rates
learn.unfreeze()

lrs = np.array([1/100, 1/10, 1]) * lr
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save(f'{sz}-small_diff-learn-1')

From the look of it, we're still not overfitting yet. In fact, the training `rmse` is *higher* than the crossval `rmse`... Does that mean that we're nowhere near overfitting? 

In [None]:
learn.fit(lrs, 3, cycle_len=1, cycle_mult=3)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save(f'{sz}-small_diff-learn-2')

## Examining predictions using the validation set

In [None]:
learn.load(f'{sz}-small_diff-learn-2')

In [None]:
learn.set_data(get_data(sz, bs))
pred = learn.predict()

pred.shape

In [None]:
print('Validation data examples')
for i in range(5):
    p, p_val = pred[i, 0], learn.data.val_y[i, 0]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, p_val, np.abs(p-p_val)))
    display(PIL.Image.open(PATH + '/' + learn.data.val_ds.fnames[i]))

### Check test set error distribution

In [None]:
# Compute the test predictions and load the labels in this cell so that it also
# runs on a fresh kernel; both names were otherwise only assigned further down.
pred_test = learn.predict(is_test=True)
y_test = pd.read_csv(test_label_csv).oh_p50

plt.hist((pred_test[:, 0] - y_test), bins=50);

Hm, let's see if we can do better by upgrading the image sizes...

## Continuing using larger image sizes

In [None]:
# doubling up from 32
sz = 64
bs = 64

learn.set_data(get_data(sz, bs))


In [None]:
learn.lr_find()

In [None]:
learn.sched.plot()

Stick with 0.1 I guess

In [None]:
lr = 0.1

learn.freeze()
learn.fit(lr, 3)

In [None]:
learn.sched.plot_loss()

### Unfreeze early layers again

It looks like training is pretty slow so we'll want to zip things along...

In [None]:
learn.unfreeze()

lrs = np.array([1/9, 1/3, 1]) * lr
learn.fit(lrs, n_cycle=3, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save(f'32-64-small_diff-learn-1')

### Train for a really long time

In [None]:
learn.fit(lrs, n_cycle=5, cycle_len=2, cycle_mult=3)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save(f'32-64-small_diff-learn-2')

## Examine results...

### Test data set

In [None]:
pred_test, _ = learn.TTA(is_test=True)
y_test = pd.read_csv(test_label_csv).oh_p50

In [None]:
pred_test = np.mean(pred_test, axis=0)

In [None]:
plt.hist((pred_test[:, 0] - y_test), bins=50);
print('rmse = {:.3f}'.format(np.sqrt(np.mean((pred_test[:, 0] - y_test)**2))))

In [None]:
print('Test examples')
for i in range(10):
    p = pred_test[i, 0]
    y = y_test[i]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, y, np.abs(p-y)))
    display(PIL.Image.open(PATH + '/' + learn.data.test_ds.fnames[i]))

### And here's the validation distribution and examples:

In [None]:
pred_val = learn.predict()
y_val = learn.data.val_y

plt.hist(pred_val[:, 0] - y_val[:, 0], bins=50, range=[-0.5, 0.5])
plt.xlim(-0.5, 0.5)
plt.xlabel('Validation: O/H p50 rmse [dex]')

In [None]:
print('Validation examples')
for i in range(10):
    p = pred_val[i, 0]
    y = y_val[i, 0]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, y, np.abs(p-y)))
    # pred_val / y_val belong to the validation set, so show the validation
    # image (the training set file list was indexed here before)
    display(PIL.Image.open(PATH + '/' + learn.data.val_ds.fnames[i]))

### Verdict: validation set is overfitting but the test set is still underfitting

## Revisiting the training, validation, and test sets...

In [None]:
df_train_small.describe()

In [None]:
df_test_small.describe()

In [None]:
fig, axes = plt.subplots(3, 5, figsize=(12, 9))

details = ['ra', 'dec', 'z']
classes = ['nii_6584_flux', 'h_alpha_flux', 'oiii_5007_flux', 'h_beta_flux', 'h_delta_flux', 
           'd4000', 'bptclass', 'oh_p50', 'lgm_tot_p50', 'sfr_tot_p50']

for ax, col in zip(axes.flat, details+classes):
    ax.set_xlabel(col)
    ax.hist(df_train_small[df_train_small.modelMag_r < 21.0][col], histtype='bar', bins=30)
    ax.hist(df_test_small[df_test_small.modelMag_r < 21.0][col], histtype='bar', bins=30)

Well, aside from the quantities that are limited in dynamic range I think that we are okay here.

### Check if test indices are misaligned in the data loader?

In [None]:
for idx in get_cv_idxs(len(df_test_small), val_pct=5 / len(df_test_small)):
    img_name = data.test_ds.fnames[idx]
    img_base = np.int64(os.path.splitext(os.path.basename(img_name))[0])
    img = PIL.Image.open(PATH + '/' + img_name)
    display(img)
    
    print(img_base)
    print('Prediction (dataset): {:.3f}'.format(pred_test[idx, 0]))
    print('Truth: {:.3f}'.format( df_test_small.loc[img_base].oh_p50))
    print('Indexed: {:.3f}'.format(df_test_small.iloc[idx].oh_p50))
    print('-----------------------------------------------------------')

### Looks like the test indices got misaligned from the dataframe's

How did this happen? ~~I need to figure that out...~~ Turns out that the dataframe needs to be sorted by index (see section 1.4.3 below)

In [None]:
pred_test = learn.predict(is_test=True)

# reorder by image name
img_names = [np.int64(os.path.splitext(os.path.basename(img_name))[0]) for img_name in data.test_ds.fnames]
y_test = df_test_small.loc[img_names].oh_p50

plt.hist((pred_test[:, 0] - y_test), bins=50, range=[-0.5, 0.5])
plt.xlim(-0.5, 0.5)

print('rmse = {:.4f}'.format(np.sqrt(np.mean((pred_test[:, 0] - y_test)**2))))
plt.xlabel('Test: O/H p50 rmse [dex]');

### Fixing the order of the test_label_csv

In [None]:
df_test_small.index

In [None]:
arch = resnet34
val_idxs = get_cv_idxs(len(df_train_small))
data = get_data(64, 64)

img_names = [np.int64(os.path.splitext(os.path.basename(img_name))[0]) for img_name in data.test_ds.fnames]
img_names[:10]

In [None]:
sorted(df_test_small.index)[:10]

There we go! What went wrong is that the test image filenames were loaded in alphanumeric order, whereas the dataframe was not. So we just need to sort the data frame by index.

In [None]:
df_test_small.sort_index(inplace=True)
df_test_small.to_csv(test_label_csv)

In [None]:
learn = ConvLearner.pretrained(arch, data)
learn.load('32-64-small_diff-learn-2')

In [None]:
# .values returns the underlying numpy array; Series.as_matrix() has been
# removed from pandas
y_test = df_test_small.oh_p50.values
pred_test = learn.predict(is_test=True)[:, 0]
print('Test examples (again)')

for i in range(10):
    p = pred_test[i]
    y = y_test[i]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, y, np.abs(p-y)))
    display(PIL.Image.open(PATH + '/' + learn.data.test_ds.fnames[i]))

In [None]:
# histogram of the test residuals (prediction minus truth)
plt.hist(pred_test - y_test, bins=50, range=[-0.5, 0.5])
plt.xlabel('Test O/H p50 [dex]');

# root-mean-square error over the test set
print(np.sqrt(np.mean((pred_test - y_test)**2)))

## Larger batch size

In [None]:
# increase batch size
bs = 128
sz = 32

data = get_data(sz, bs)
learn = ConvLearner.pretrained(arch, data)
learn.crit = rmse



# find learning rate again
learn.lr_find()
learn.sched.plot()

In [None]:
lr = 0.1

learn.fit(lr, n_cycle=5)

In [None]:
learn.unfreeze()

lrs = np.array([1/9, 1/3, 1]) * lr
learn.fit(lrs, n_cycle=3, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_loss()

In [None]:
pred_test = learn.predict(is_test=True)[:, 0]
# .values returns the underlying numpy array; Series.as_matrix() has been
# removed from pandas
y_test = df_test_small.oh_p50.values

print('Test rmse = {:.4f}'.format(np.sqrt(np.mean((pred_test - y_test)**2))))

### Move up to 64x64

In [None]:
learn.set_data(get_data(64, bs))
learn.lr_find()
learn.sched.plot()

In [None]:
learn.freeze()
lr = 0.1
learn.fit(lr, 3)

In [None]:
lrs = lr * np.array([1/9, 1/3, 1])
learn.unfreeze()
learn.fit(lrs, n_cycle=3, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.fit(lrs, 5)
learn.sched.plot_loss()

In [None]:
learn.fit(lrs, n_cycle=3, cycle_len=1, cycle_mult=3)

## Next steps?

- Perhaps we can make a pipeline moving from 32 -> 64 -> 128, with batchsize ~ 128? 
- We can also add more data (rather than ~10^4 galaxies, throw ~10^5 at it). 
- Another option is to use ResNet50 or Resnet101.

# Use more data

In [None]:
train_label_csv = f'{PATH}/catalogs/train.csv'
df = pd.read_csv(train_label_csv, index_col=0)

val_idxs = get_cv_idxs(len(df))

### Create test-train split

```python
# randomly do ~80/20 split
split_idxs = np.arange(len(df))
np.random.shuffle(split_idxs)

train_idxs = split_idxs[:-25000]
test_idxs  = split_idxs[-25000:]

# copy files to the train dir, also make a copy of the data frame which only has
# valid images
valid_train_idxs = []
for objid, idx in tqdm_notebook(zip(df.iloc[train_idxs].index, train_idxs), total=len(train_idxs)):
    try:
        shutil.copyfile(f'{PATH}/images/{objid}.jpg', f'{PATH}/train/{objid}.jpg')
        valid_train_idxs.append(idx)
    except FileNotFoundError:
        continue

# save mini-dataframe
df_train = df.iloc[valid_train_idxs].copy()
df_train.to_csv(f'{PATH}/catalogs/train.csv')

# do the same thing, except for the test dataset
valid_test_idxs = []
for objid, idx in tqdm_notebook(zip(df.iloc[test_idxs].index, test_idxs), total=len(test_idxs)):
    try:
        shutil.copyfile(f'{PATH}/images/{objid}.jpg', f'{PATH}/test/{objid}.jpg')
        valid_test_idxs.append(idx)
    except FileNotFoundError:
        continue

# save mini-dataframe *and sort by index*
df_test = df.iloc[valid_test_idxs].copy()
df_test.sort_index(inplace=True)
df_test.to_csv(f'{PATH}/catalogs/test.csv')
```

### Initialize network

In [None]:
sz = 32
bs = 64
arch = resnet34

def get_data(sz, bs):
    """Build the full-sample data object ('images' folder, 'test' test folder).

    Arguments:
        sz: image size handed to `tfms_from_model`.
        bs: batch size handed to the data loader.

    Returns:
        the object created by `ImageClassifierData.from_multiclass_csv`.
    """
    tfms = tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.1)
    # bs must be forwarded explicitly; it was previously dropped here, so the
    # loader always ran with its default batch size of 64
    return ImageClassifierData.from_multiclass_csv(PATH, 'images', train_label_csv, bs=bs, tfms=tfms,
                    suffix='.jpg', val_idxs=val_idxs, test_name='test', num_workers=4, )

In [None]:
data = get_data(sz, bs)
learn = ConvLearner.pretrained(arch, data)

def rmse(x, y):
    """Return the root of the mean squared error between `x` and `y`."""
    squared_error = F.mse_loss(x, y)
    root = torch.sqrt(squared_error)
    return root

metrics = [rmse]
learn.crit = rmse

learn.lr_find()
learn.sched.plot()

## Early training

~~Let's be adventurous and select a high learning rate of 0.3~~ $\leftarrow$ too high, that started to diverge.

~~Let's try a learning rate of 3e-3.~~ $\leftarrow$ too low, that took over 5 epochs to make it to RMSE = 0.10.

Perhaps let's try lr=0.1 again?

In [None]:
lr = 0.1
learn.fit(lr, 5, cycle_len=1)

In [None]:
learn.sched.plot_loss(n_skip=500)

In [None]:
learn.unfreeze()

# lower the rate a little
lrs = 3e-2 * np.array([1/9, 1/3, 1])

# fit with the differential rates defined above; the scalar `lr` was passed
# here before, which left `lrs` unused
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save('32_diff-1')

In [None]:
# anneal more?
lrs = 1e-2 * np.array([1/16, 1/4, 1])
learn.fit(lrs, 3, cycle_len=1, cycle_mult=2)

In [None]:
learn.sched.plot_loss()

In [None]:
learn.save('32_diff-2')

## 64-size training

In [None]:
learn.load('32_diff-2')

In [None]:
data = get_data(64, 128)
learn.set_data(data)

learn.freeze()

lr = 0.1
learn.fit(lr, 3, cycle_len=1, cycle_mult=2)
learn.sched.plot_loss()

In [None]:
learn.save('32-64_init')

In [None]:
# unfreeze and train more
learn.unfreeze()

lrs = 0.01 * np.array([1/25, 1/5, 1])

# fit with the differential rates defined above; the scalar `lr` was passed
# here before, which left `lrs` unused
learn.fit(lrs, 5, cycle_len=1, cycle_mult=2)


In [None]:
learn.sched.plot_loss()

In [None]:
#learn.save('32-64_diff-1')
learn.load('32-64_diff-1')

## Evaluation time

### Check validation set using test-time augmentation (TTA)

In [None]:
logp, y_val = learn.TTA()

In [None]:
# average over the first axis of the TTA output to get one prediction per image
p_val = np.mean(logp, axis=0)

for i in range(10):
    p = p_val[i, 0]
    y = y_val[i, 0]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, y, np.abs(p-y)))
    # these are validation predictions, so show the validation image
    # (the test set file list was indexed here before)
    display(PIL.Image.open(PATH + '/' + learn.data.val_ds.fnames[i]))

In [None]:
# histogram of the validation residuals (prediction minus truth)
plt.hist(p_val[:,0] - y_val[:,0], bins=50, range=[-0.5, 0.5]);

# the printed quantity is a root-mean-square error, not an accuracy
print('Val rmse (with TTA) is {:.3f}'.format(np.sqrt(np.mean((p_val[:,0] - y_val[:,0])**2))))

### In the "eyes" of the resnet, which objects have the lowest or highest metallicity?

The lowest metallicity:

In [None]:
# the ten validation objects with the lowest predicted value
for [idx] in np.argsort(p_val, axis=0)[:10]:
    p = p_val[idx, 0]
    y = y_val[idx, 0]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, y, np.abs(p-y)))
    # idx indexes the validation predictions, so look the file up in val_ds
    display(PIL.Image.open(PATH + '/' + learn.data.val_ds.fnames[idx]))

In [None]:
# the ten validation objects with the highest predicted value, largest first
# (the former slice [:-10:-1] only yielded nine of them)
for [idx] in np.argsort(p_val, axis=0)[::-1][:10]:
    p = p_val[idx, 0]
    y = y_val[idx, 0]
    print('Prediction: {:.3f}, True: {:.3f}, error: {:.3f}'.format(p, y, np.abs(p-y)))
    # idx indexes the validation predictions, so look the file up in val_ds
    display(PIL.Image.open(PATH + '/' + learn.data.val_ds.fnames[idx]))

### Evaluate using test dataset

This takes a while because there are about 20000 test images! Thus I'm using `learn.predict` rather than `learn.TTA` -- even though the latter would yield slightly better results.

In [None]:
p_test = learn.predict(is_test=True)

In [None]:
test_label_csv = f'{PATH}/catalogs/test.csv'
y_test = pd.read_csv(test_label_csv).oh_p50

plt.hist((p_test[:, 0] - y_test), bins=50);
print('Test (no TTA) rmse is = {:.3f}'.format(np.sqrt(np.mean((p_test[:, 0] - y_test)**2))))