## Imports

In [1]:
# Standard library
import gc
import os
from zipfile import ZipFile

# Third-party
import numpy as np
import pandas as pd

import fastai
from fastai.vision import *
from fastai.tabular import *

from image_tabular.core import *
from image_tabular.dataset import *
from image_tabular.model import *
from image_tabular.metric import *

# The explicit imports below are kept after the wildcard imports above so that
# they continue to take precedence over any same-named wildcard export.
from datetime import date
from datetime import timedelta
from pickle import dump

import PIL
import seaborn as sb
import xgboost as xgb
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import RandomizedSearchCV
from xgboost import DMatrix

from fastai.callbacks import SaveModelCallback
print(fastai.__version__)

1.0.61


In [2]:
# Run on the first CUDA device when one is visible, otherwise fall back to CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# All input locations are resolved relative to the current working directory.
_cwd = os.getcwd()
data_dir = _cwd + '/data/'
# Alternative training image set (disabled): _cwd + "/train_images_69pix/"
image_dir = _cwd + "/train_images_128pix/"
eval_image_dir = _cwd + "/evaluation_images_128pix/"

# Training metadata, with a 'Path' column holding the full path of each image.
meta = (
    pd.read_csv(data_dir + "train_metadata.tsv", sep="\t")
    .assign(Path=lambda frame: image_dir + frame['image_name'])
)

# Evaluation metadata, prepared the same way against the evaluation image folder.
meta_test = (
    pd.read_csv(data_dir + "eval_metadata.tsv", sep="\t")
    .assign(Path=lambda frame: eval_image_dir + frame['image_name'])
)

## Build Dataloaders and Model

In [3]:
def train_imagetab(val_p, tfms, bs, size, ps, tab_in_sz, tab_layers,
                   tab_out_sz, seed=42):
    """Build a fastai ``Learner`` that regresses 'M/L' from images plus tabular features.

    Reads the module-level ``meta`` / ``meta_test`` frames, the ``image_dir`` /
    ``eval_image_dir`` paths and ``device``.

    Args:
        val_p: Fraction of ``meta`` rows held out for validation.
        tfms: Image transforms handed to ``.transform``.
        bs: Batch size of the combined image + tabular ``DataBunch``.
        size: Image size handed to ``.transform``.
        ps: Dropout value given to both ``cnn_learner`` and ``TabularModel``.
        tab_in_sz: Number of continuous tabular inputs; must equal the number
            of tabular columns used below.
        tab_layers: Hidden layer sizes of the tabular model.
        tab_out_sz: Output size of the tabular model, concatenated with the
            CNN output before the joint head.
        seed: Seed for the train/validation shuffle. A fixed seed makes repeated
            calls (e.g. rebuilding the learner at another image size and then
            resuming from a checkpoint) use the same validation rows, so the
            resumed model is not validated on rows it was trained on. Pass
            ``None`` for a different random split on every call.

    Returns:
        A ``Learner`` over the joint CNN + tabular model using ``MSELossFlat``,
        with three layer groups set up for differential learning rates.

    Raises:
        ValueError: If ``tab_in_sz`` does not match the number of tabular columns.
    """
    tab_cols = ['L_g', 'distance_Mpc', 'galsize_kpc', 'size_69', 'size_128']
    if tab_in_sz != len(tab_cols):
        raise ValueError(
            f"tab_in_sz={tab_in_sz} does not match the {len(tab_cols)} tabular columns {tab_cols}"
        )

    # Reproducible split: a private RandomState keeps the global numpy RNG untouched.
    n_rows = len(meta)
    rng = np.random.RandomState(seed)
    indices = rng.permutation(n_rows)
    train_n = round(n_rows * (1 - val_p))
    val_idx = np.sort(indices[train_n:])

    # load image data and prepare fastai LabelLists
    df = meta[['image_name', 'M/L']]

    image_data = (ImageList.from_df(df, path = image_dir)
                  .split_by_idx(val_idx)
                  .label_from_df(cols="M/L", label_cls = FloatList)
                  .transform(tfms, size=size, padding_mode = 'zeros'))

    # add test data so that we can make predictions
    test_image_data = ImageList.from_df(meta_test, path=eval_image_dir, cols="image_name")
    image_data.add_test(test_image_data)

    # tabular data: continuous columns only, split on the same validation rows
    procs = [Normalize]

    tab_data = (TabularList.from_df(meta, cat_names = [], cont_names=tab_cols, procs=procs)
                               .split_by_idx(val_idx)
                               .label_from_df(cols='M/L', label_cls = FloatList))

    # add test
    tab_data.add_test(TabularList.from_df(meta_test, cont_names=tab_cols))

    # pair up the image and tabular datasets
    integrate_train, integrate_valid, integrate_test = get_imagetabdatasets(image_data, tab_data)

    # package train, valid, and test datasets into a fastai databunch
    db = DataBunch.create(integrate_train, integrate_valid, integrate_test,
                          path=image_dir, bs=bs)

    # image normalization with imagenet_stats
    db.norm, db.denorm = normalize_funcs_image_tab(*imagenet_stats)
    db.add_tfm(db.norm)

    # cnn model for images, use Resnet50 as an example
    cnn_arch = models.resnet50

    # cnn_out_sz is the output size of the cnn model that will be concatenated with tabular model output
    cnn_out_sz = 256

    # use fastai functions to get a cnn model
    image_data_db = image_data.databunch()
    image_data_db.c = cnn_out_sz
    cnn_learn = cnn_learner(image_data_db, cnn_arch, ps=ps)
    cnn_model = cnn_learn.model

    # There are no categorical columns (cat_names is empty above); a single (0, 0)
    # entry is passed as the embedding spec.
    # NOTE(review): an empty list may work as well, but it could change the module
    # layout that existing checkpoints were saved with - confirm before changing.
    emb_szs = [(0,0)]

    # use fastai functions to get a tabular model
    tabular_model = TabularModel(emb_szs, n_cont = tab_in_sz, out_sz=tab_out_sz,
                                 layers=tab_layers, ps=ps)

    # joint head on top of the concatenated CNN and tabular outputs; its dropout
    # is fixed at 0.2 and is independent of the `ps` argument
    integrate_model = CNNTabularModel(cnn_model,
                                      tabular_model,
                                      layers = [cnn_out_sz + tab_out_sz, 32],
                                      ps=0.2,
                                      out_sz=1).to(device)

    learn = Learner(db, integrate_model, loss_func = MSELossFlat())

    # organize layer groups in order to use differential learning rates provided by fastai
    learn.layer_groups = [nn.Sequential(*flatten_model(cnn_learn.layer_groups[0])),
                          nn.Sequential(*flatten_model(cnn_learn.layer_groups[1])),
                          nn.Sequential(*(flatten_model(cnn_learn.layer_groups[2]) +
                                          flatten_model(integrate_model.tabular_model) +
                                          flatten_model(integrate_model.layers)))]

    return learn

### Begin Training

In [4]:
# Data split, augmentation and loader settings
val_p = .3          # fraction of rows held out for validation
tfms = get_transforms(do_flip=True, flip_vert=True)
size = 69           # image size passed to the transforms
bs = 128            # batch size
ps = 0.1            # dropout for the CNN learner and the tabular model

# Tabular branch dimensions
tab_in_sz = 5       # number of continuous tabular inputs
tab_layers = [2]    # hidden layer sizes
tab_out_sz = 5      # output size of the tabular model

learn = train_imagetab(
    val_p=val_p,
    tfms=tfms,
    bs=bs,
    size=size,
    ps=ps,
    tab_in_sz=tab_in_sz,
    tab_layers=tab_layers,
    tab_out_sz=tab_out_sz,
)

# Write a checkpoint after every epoch under this name.
callback_save_file = 'galileo_vision_copy_vF'
checkpoint_cb = SaveModelCallback(learn, every='epoch', monitor='valid_loss',
                                  name=callback_save_file)
learn.fit_one_cycle(20, 1e-3, callbacks=[checkpoint_cb])

epoch,train_loss,valid_loss,time
0,8.085036,7.879394,01:47
1,7.03125,6.219033,01:43
2,3.64968,2.296193,01:44
3,1.379299,0.808823,01:43
4,1.094473,0.726093,01:42
5,0.911241,0.678882,01:44
6,0.830365,0.655763,01:43
7,0.764749,0.640623,01:04
8,0.752055,0.623739,00:55
9,0.759702,0.600679,00:55


In [5]:
# Continue training from epoch 20 of a 30-epoch one-cycle schedule; the
# checkpoint callback reloads the model saved after epoch 19 before resuming.
callback_save_file = 'galileo_vision_copy_vF'
checkpoint_cb = SaveModelCallback(learn, every='epoch', monitor='valid_loss',
                                  name=callback_save_file)
learn.fit_one_cycle(30, 8e-4, start_epoch=20, callbacks=[checkpoint_cb])

Loaded galileo_vision_copy_vF_19


epoch,train_loss,valid_loss,time
20,0.673634,0.569375,01:44
21,0.667049,0.55384,01:46
22,0.615806,0.548805,01:44
23,0.614417,0.548914,01:47
24,0.646901,0.549824,01:48
25,0.59983,0.547067,00:59
26,0.636295,0.543618,00:58
27,0.647918,0.545037,01:00
28,0.640938,0.545236,00:59
29,0.633156,0.545747,00:58


In [6]:
# Continue training from epoch 30 of a 40-epoch one-cycle schedule; the
# checkpoint callback reloads the model saved after epoch 29 before resuming.
callback_save_file = 'galileo_vision_copy_vF'
checkpoint_cb = SaveModelCallback(learn, every='epoch', monitor='valid_loss',
                                  name=callback_save_file)
learn.fit_one_cycle(40, 2e-4, start_epoch=30, callbacks=[checkpoint_cb])

Loaded galileo_vision_copy_vF_29


epoch,train_loss,valid_loss,time
30,0.625011,0.545304,01:47
31,0.624,0.546702,01:50
32,0.632292,0.544366,01:51
33,0.623012,0.544203,01:51
34,0.611802,0.542878,01:48
35,0.631642,0.545104,01:51
36,0.612698,0.546785,01:57
37,0.625834,0.544963,02:00
38,0.617575,0.546885,01:55
39,0.608616,0.544432,01:52


In [10]:
# Rebuild the learner at 128px images with a smaller batch size, then resume
# training from the checkpoint saved after epoch 39.
size = 128
bs = 16
ps = 0.1

# This cell previously failed with "CUDA out of memory". Drop the reference to
# the previous learner and release cached GPU memory before the next model is
# built, so the two models do not have to fit on the GPU at the same time.
# NOTE(review): other processes using the same GPU can still cause an OOM.
learn = None
gc.collect()
torch.cuda.empty_cache()

learn = train_imagetab(val_p = val_p, tfms = tfms, bs = bs, size = size,
                       tab_in_sz = tab_in_sz, tab_layers = tab_layers,
                       tab_out_sz = tab_out_sz, ps = ps)

# train all layer groups from here on
learn.unfreeze()

callback_save_file = 'galileo_vision_copy_vF'
learn.fit_one_cycle(70, 3e-5, start_epoch = 40,
                    callbacks=[SaveModelCallback(learn, every='epoch',
                                                 monitor = 'valid_loss',
                                                 name=callback_save_file)])


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 7.93 GiB total capacity; 2.94 GiB already allocated; 5.50 MiB free; 2.96 GiB reserved in total by PyTorch)

In [None]:
# Rebuild the learner at 220px images, then resume training from the
# checkpoint saved after epoch 69.
size = 220
bs = 32
ps = .15

# Drop the reference to the previous learner and release cached GPU memory
# before the next model is built, so the two models do not have to fit on the
# GPU at the same time.
learn = None
gc.collect()
torch.cuda.empty_cache()

learn = train_imagetab(val_p = val_p, tfms = tfms, bs = bs, size = size,
                       tab_in_sz = tab_in_sz, tab_layers = tab_layers,
                       tab_out_sz = tab_out_sz, ps = ps)

# train all layer groups from here on
learn.unfreeze()

callback_save_file = 'galileo_vision_copy_vF'
learn.fit_one_cycle(100, 5e-6, start_epoch = 70,
                    callbacks=[SaveModelCallback(learn, every='epoch',
                                                 monitor = 'valid_loss',
                                                 name=callback_save_file)])


In [None]:
# make predictions for the test set (the second return value is not needed here)
preds = learn.get_preds(DatasetType.Test)[0]

# The model has a single output per row; flatten so the predictions are stored
# as a plain 1-D column regardless of a trailing singleton dimension.
meta_test['CNN_pred'] = preds.numpy().ravel()

# Save output
final_df = meta_test[['SDSS_ID', 'image_name', 'CNN_pred']]

# Make sure the target folder exists so the export cannot fail after training.
os.makedirs("output", exist_ok=True)
final_df.to_csv("output/AIDataCompetition_GalileoVision_copy_v1.tsv", sep="\t")