In [1]:
# Reload the autoreload extension and set it to mode 2, so edited modules are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import functools
import traceback
def get_ref_free_exc_info():
    """Return the current exception info after clearing its traceback frames.

    Clearing the frames drops the references to locals that the traceback
    would otherwise keep alive; the original author's note says this avoids a
    circular reference that leaves gc.collect() unable to reclaim memory.

    Returns:
        The ``(type, value, traceback)`` triple from ``sys.exc_info()``, with
        the traceback's frames cleared. When no exception is being handled the
        triple is ``(None, None, None)``.
    """
    # Imported locally: the notebook's import cell only brings in `functools`
    # and `traceback`, so `sys` is not otherwise guaranteed to be in scope.
    import sys

    exc_type, exc_val, exc_tb = sys.exc_info()
    # Walks the traceback chain; a None traceback means there is nothing to clear.
    traceback.clear_frames(exc_tb)
    return (exc_type, exc_val, exc_tb)

def gpu_mem_restore(func):
    """Decorator that clears traceback frames before an exception leaves `func`.

    Intended (per the original author) to let GPU RAM be reclaimed if CUDA out
    of memory happened, or execution was interrupted: clearing the frames
    drops the local variables that the traceback would otherwise keep alive.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper with the same signature and metadata as `func`. It returns
        whatever `func` returns, and re-raises whatever `func` raises (the
        same exception object, with its context suppressed).
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except BaseException as exc:
            # BaseException on purpose: interruption (KeyboardInterrupt) must be
            # handled too, exactly as the previous bare `except:` did.
            traceback.clear_frames(exc.__traceback__)
            # Re-raise the same instance rather than rebuilding it from its type:
            # rebuilding fails for exception classes whose constructor needs other
            # arguments, and drops attributes set on the original.
            raise exc from None
    return wrapper

In [3]:
from fastai.vision import *
from fastai.metrics import error_rate

In [4]:
# Dataset root. Defaults to the original author's location; set the CIFAR100_DIR
# environment variable to point the notebook at another copy without editing code.
import os
path = Path(os.environ.get('CIFAR100_DIR', '/home/ekami/workspace/cifar100/'))

In [5]:
# Sub-folders of the dataset root (both appear in the path.ls() listing further down).
train = path/'train'
# NOTE(review): `test` is not referenced by any later cell visible in this notebook.
test = path/'test'

In [6]:
# Base batch size: passed to the first data bunch and used as input to the 3.75x scaling shown later.
bs = 108

In [7]:
# List the dataset root to check the expected sub-folders are present.
# (`ls` is presumably added to Path by fastai — the standard-library Path has no such method.)
path.ls()

[PosixPath('/home/ekami/workspace/cifar100/test'),
 PosixPath('/home/ekami/workspace/cifar100/models'),
 PosixPath('/home/ekami/workspace/cifar100/train')]

In [8]:
# Seed NumPy before building the data (presumably so the valid_pct=0.2 split is reproducible — confirm).
np.random.seed(42)
# Initial data bunch at the base batch size `bs`: 20% validation hold-out, default
# transforms, size=224, normalized with cifar_stats.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=bs).normalize(cifar_stats)

In [9]:
class gpu_mem_restore_ctx():
    """Context manager that clears traceback frames before an exception leaves the `with` block.

    Intended (per the original author) to let GPU RAM be reclaimed if CUDA out
    of memory happened, or execution was interrupted: clearing the frames
    drops the local variables that the traceback would otherwise keep alive.

    The exception raised inside the block is re-raised unchanged (same object,
    so same type, args and attributes) with its context suppressed.
    """
    def __enter__(self): return self
    def __exit__(self, exc_type, exc_val, exc_tb):
        # Identity test, not truthiness: an exception instance may define
        # __bool__/__len__ and be falsy, and must never be swallowed.
        if exc_type is None: return False
        traceback.clear_frames(exc_tb)
        # Re-raise the same instance rather than exc_type(exc_val): rebuilding
        # fails for exception classes whose constructor needs other arguments,
        # and drops attributes set on the original.
        raise exc_val.with_traceback(exc_tb) from None

In [10]:
# Allow crashing
# NOTE(review): the intent of the note above is not clear from this cell. This learner is
# never trained: `learn` is reassigned in the "Resnet 18" section before the first fit call.
learn = create_cnn(data, models.resnet18 , metrics=accuracy)

## Resnet 18

In [11]:
# Scale the base batch size by 3.75 and truncate to an int (405 for bs = 108).
# NOTE(review): the next cell hard-codes bs=408 rather than this value — confirm which was intended.
int(bs*3.75)

405

In [12]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the first data bunch; only the batch size (408) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=408).normalize(cifar_stats)
# ResNet-18 learner on the rebuilt data, reporting accuracy; replaces the learner created earlier.
learn = create_cnn(data, models.resnet18, metrics=accuracy)

In [13]:
# One fit_one_cycle epoch; if the fit raises, gpu_mem_restore_ctx clears the traceback
# frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy
1,2.441905,1.621814,0.561000


In [14]:
# Continue training the same learner for 30 more epochs with max_lr=1e-2.
# NOTE(review): this is the only fit call in the notebook that sets max_lr explicitly.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30, max_lr=1e-2)

epoch,train_loss,valid_loss,accuracy
1,1.686131,1.220904,0.651333
2,1.388124,0.996638,0.703500
3,1.216391,0.915040,0.721250
4,1.165618,0.937397,0.716500
5,1.166939,1.005142,0.696083
6,1.167172,1.036181,0.690250
7,1.155099,1.098619,0.678833
8,1.141750,1.031859,0.692500
9,1.122253,0.973910,0.700917
10,1.078584,0.987744,0.700667
11,1.049187,0.925535,0.718917
12,1.023786,0.923716,0.723000
13,0.994283,0.950193,0.714333
14,0.978886,0.939824,0.715583
15,0.947734,0.900096,0.729167
16,0.912281,0.827112,0.747167
17,0.877508,0.794671,0.759000
18,0.848763,0.807713,0.751250
19,0.832954,0.764302,0.766583
20,0.783972,0.722574,0.776667
21,0.765427,0.726920,0.779167
22,0.740291,0.716551,0.779667
23,0.711279,0.689046,0.785083
24,0.683742,0.683559,0.788750
25,0.657032,0.672859,0.791583
26,0.634756,0.666527,0.795333
27,0.622170,0.663841,0.796250
28,0.613146,0.658232,0.797417
29,0.613548,0.658717,0.797667
30,0.612183,0.658733,0.798000


## Resnet 18 (Mixed Precision)

In [15]:
# Display the base batch size (still 108); the data cell that follows passes a literal bs instead.
bs

108

In [16]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (720) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=720).normalize(cifar_stats)

In [17]:
# ResNet-18 learner passed through to_fp16 for the mixed-precision run.
learn = to_fp16(create_cnn(data, models.resnet18, metrics=accuracy))

In [18]:
# One fit_one_cycle epoch; if the fit raises, gpu_mem_restore_ctx clears the traceback
# frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy
1,2.870623,1.839730,0.519083


In [19]:
# Continue training the same learner for 30 more epochs (no max_lr passed here).
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,2.207675,1.754365,0.537417
2,2.115157,1.612026,0.565667
3,1.955724,1.431037,0.600417
4,1.766447,1.264467,0.639083
5,1.590725,1.155779,0.663833
6,1.441305,1.066461,0.682167
7,1.334857,1.007020,0.702500
8,1.246409,0.961123,0.711833
9,1.181972,0.911834,0.725500
10,1.128325,0.881794,0.732333
11,1.078243,0.861121,0.739583
12,1.031876,0.839541,0.744167
13,1.008928,0.831407,0.744000
14,0.977578,0.809504,0.751833
15,0.945696,0.797486,0.757000
16,0.919172,0.782975,0.761917
17,0.899146,0.776590,0.766583
18,0.882939,0.767809,0.766000
19,0.856347,0.763404,0.767417
20,0.841534,0.754744,0.767250
21,0.817957,0.747933,0.771083
22,0.803184,0.746400,0.774833
23,0.791340,0.743390,0.773417
24,0.781753,0.739683,0.774750
25,0.769866,0.738879,0.776167
26,0.767070,0.737335,0.775583
27,0.755852,0.735069,0.775833
28,0.747134,0.734977,0.775833
29,0.744960,0.733925,0.776667
30,0.742359,0.735230,0.776667


## Resnet 34

In [20]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (248) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=248).normalize(cifar_stats)

In [21]:
# ResNet-34 learner reporting accuracy (the recorded output shows the resnet34 weights being downloaded).
learn = create_cnn(data, models.resnet34, metrics=accuracy)

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to /home/ekami/.torch/models/resnet34-333f7ec4.pth
100%|██████████| 87306240/87306240 [00:07<00:00, 11767995.32it/s]


In [22]:
# Train for 30 fit_one_cycle epochs; if the fit raises, gpu_mem_restore_ctx clears the
# traceback frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,4.100102,3.264899,0.285417
2,2.793747,1.978519,0.517667
3,1.910982,1.300082,0.636833
4,1.455095,0.999775,0.702500
5,1.241369,0.867122,0.738250
6,1.109905,0.796427,0.754583
7,1.048591,0.770618,0.762250
8,0.979058,0.730820,0.773833
9,0.928035,0.709470,0.780250
10,0.890598,0.684087,0.786833
11,0.864331,0.679980,0.785333
12,0.813269,0.656503,0.796083
13,0.793111,0.645735,0.799333
14,0.758749,0.637417,0.799167
15,0.733792,0.624778,0.803500
16,0.713488,0.622848,0.804250
17,0.695918,0.618520,0.805333
18,0.670478,0.600957,0.812333
19,0.655789,0.600830,0.811750
20,0.638686,0.596192,0.814667
21,0.617020,0.593554,0.813500
22,0.598800,0.587362,0.814333
23,0.588485,0.585536,0.816500
24,0.560206,0.583934,0.818000
25,0.552242,0.579802,0.818750
26,0.544388,0.576987,0.820917
27,0.544016,0.577123,0.820667
28,0.544756,0.576735,0.819333
29,0.535235,0.576290,0.820167
30,0.533571,0.576551,0.819833


## Resnet 34 (Mixed Precision)

In [23]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (496) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=496).normalize(cifar_stats)

In [24]:
# ResNet-34 learner passed through to_fp16 for the mixed-precision run.
learn = to_fp16(create_cnn(data, models.resnet34, metrics=accuracy))

In [25]:
# Train for 30 fit_one_cycle epochs; if the fit raises, gpu_mem_restore_ctx clears the
# traceback frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,4.522862,3.640730,0.200583
2,3.459070,2.374401,0.443583
3,2.469919,1.560894,0.582833
4,1.817846,1.165402,0.662000
5,1.449373,0.959060,0.714667
6,1.236690,0.863116,0.740500
7,1.113157,0.795895,0.756417
8,1.027290,0.765271,0.763417
9,0.967804,0.721662,0.774667
10,0.904266,0.705837,0.780333
11,0.865761,0.687858,0.786917
12,0.844098,0.668408,0.791083
13,0.802036,0.665113,0.791667
14,0.783970,0.649133,0.798250
15,0.745470,0.634941,0.805250
16,0.727000,0.629780,0.804333
17,0.709905,0.619154,0.805417
18,0.688805,0.614638,0.810250
19,0.673002,0.612835,0.808083
20,0.647279,0.606833,0.811917
21,0.630066,0.601341,0.813167
22,0.615739,0.599353,0.814083
23,0.600889,0.592636,0.815833
24,0.595916,0.593515,0.817667
25,0.578139,0.589500,0.815500
26,0.576346,0.590978,0.815750
27,0.571478,0.586962,0.818833
28,0.560052,0.587171,0.818000
29,0.556342,0.586621,0.818083
30,0.552511,0.587052,0.818333


## Resnet 50

In [26]:
# NOTE(review): leftover commented-out assignment; the data cell that follows passes bs=98 directly.
#bs = 512

In [27]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (98) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=98).normalize(cifar_stats)

In [28]:
# ResNet-50 learner reporting accuracy (the recorded output shows the resnet50 weights being downloaded).
learn = create_cnn(data, models.resnet50, metrics=accuracy)

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /home/ekami/.torch/models/resnet50-19c8e357.pth
100%|██████████| 102502400/102502400 [00:19<00:00, 5150001.64it/s]


In [29]:
# Train for 30 fit_one_cycle epochs; if the fit raises, gpu_mem_restore_ctx clears the
# traceback frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,2.515773,1.886690,0.541167
2,1.696823,1.202539,0.663750
3,1.346115,0.940126,0.719500
4,1.105000,0.807197,0.751083
5,1.012399,0.757759,0.765250
6,0.941379,0.734523,0.770833
7,0.891324,0.690761,0.783500
8,0.819676,0.679745,0.790583
9,0.785965,0.657922,0.794000
10,0.737616,0.631472,0.806583
11,0.716565,0.626581,0.805500
12,0.703448,0.611971,0.810333
13,0.643908,0.610946,0.809333
14,0.619197,0.596491,0.818167
15,0.588613,0.582370,0.821167
16,0.572277,0.584467,0.823833
17,0.537526,0.581618,0.819917
18,0.520785,0.573473,0.824500
19,0.479418,0.570315,0.828000
20,0.462298,0.565643,0.826917
21,0.447708,0.562472,0.830500
22,0.413717,0.560359,0.833417
23,0.376912,0.557939,0.834167
24,0.372460,0.559411,0.835167
25,0.367571,0.556380,0.837167
26,0.347707,0.554721,0.837667
27,0.334381,0.551504,0.838417
28,0.351300,0.549640,0.838250
29,0.335910,0.549918,0.837000
30,0.340140,0.553000,0.836417


## Resnet 50 (Mixed Precision)

In [30]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (164) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=164).normalize(cifar_stats)

In [31]:
# ResNet-50 learner passed through to_fp16 for the mixed-precision run.
learn = to_fp16(create_cnn(data, models.resnet50, metrics=accuracy))

In [32]:
# Train for 30 fit_one_cycle epochs; if the fit raises, gpu_mem_restore_ctx clears the
# traceback frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,2.834446,2.072671,0.511667
2,1.883764,1.311698,0.639583
3,1.395265,0.999120,0.704250
4,1.161582,0.848280,0.743583
5,1.024427,0.772155,0.759583
6,0.940368,0.736896,0.770417
7,0.892585,0.707394,0.776083
8,0.829782,0.685450,0.790000
9,0.780883,0.662696,0.798583
10,0.755269,0.638769,0.802083
11,0.695746,0.623059,0.809167
12,0.674482,0.617339,0.806833
13,0.614091,0.608190,0.814583
14,0.609434,0.598253,0.812250
15,0.561976,0.589217,0.818333
16,0.544676,0.583386,0.821833
17,0.503154,0.581094,0.821750
18,0.489639,0.572993,0.823667
19,0.469337,0.574861,0.826917
20,0.445635,0.570555,0.829500
21,0.421942,0.570226,0.830333
22,0.395235,0.558666,0.831833
23,0.376425,0.563551,0.835083
24,0.366422,0.559915,0.835250
25,0.344141,0.557956,0.835000
26,0.328442,0.559216,0.835250
27,0.322673,0.556995,0.835167
28,0.324144,0.557831,0.835250
29,0.320713,0.555050,0.836333
30,0.327241,0.557459,0.836417


## Resnet 101

In [33]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (64) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=64).normalize(cifar_stats)

In [34]:
# ResNet-101 learner reporting accuracy (the recorded output shows the resnet101 weights being downloaded).
learn = create_cnn(data, models.resnet101, metrics=accuracy)

Downloading: "https://download.pytorch.org/models/resnet101-5d3b4d8f.pth" to /home/ekami/.torch/models/resnet101-5d3b4d8f.pth
100%|██████████| 178728960/178728960 [00:29<00:00, 6127228.47it/s]


In [35]:
# Train for 30 fit_one_cycle epochs; if the fit raises, gpu_mem_restore_ctx clears the
# traceback frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,2.083457,1.451530,0.638333
2,1.321773,0.908532,0.733333
3,1.074472,0.728461,0.775750
4,0.985465,0.666169,0.793667
5,0.838859,0.618405,0.806750
6,0.803868,0.600917,0.812500
7,0.770296,0.592337,0.814750
8,0.712573,0.587856,0.818417
9,0.700638,0.557925,0.827667
10,0.656998,0.543478,0.833500
11,0.587333,0.537264,0.833167
12,0.576772,0.523525,0.844917
13,0.546539,0.511617,0.843750
14,0.531197,0.519306,0.839500
15,0.467777,0.507615,0.845667
16,0.468377,0.504536,0.848250
17,0.454281,0.497895,0.849583
18,0.420571,0.491803,0.852833
19,0.401252,0.486161,0.853583
20,0.359169,0.492218,0.853583
21,0.353262,0.479224,0.857250
22,0.317846,0.482238,0.860833
23,0.305395,0.482287,0.860000
24,0.302323,0.477879,0.861250
25,0.285679,0.477347,0.863667
26,0.265691,0.472855,0.864000
27,0.235859,0.476183,0.865417
28,0.253295,0.475753,0.864083
29,0.252339,0.476051,0.865167
30,0.238589,0.474209,0.863583


## Resnet 101 (Mixed Precision)

In [36]:
# Re-seed NumPy before rebuilding the data (presumably to keep the valid_pct=0.2 split identical — confirm).
np.random.seed(42)
# Same dataset and transforms as the other experiments; only the batch size (116) differs.
data = ImageDataBunch.from_folder(path, train=train, valid_pct=0.2,
        ds_tfms=get_transforms(), size=224, num_workers=4, bs=116).normalize(cifar_stats)

In [37]:
# ResNet-101 learner passed through to_fp16 for the mixed-precision run.
learn = to_fp16(create_cnn(data, models.resnet101, metrics=accuracy))

In [38]:
# Train for 30 fit_one_cycle epochs; if the fit raises, gpu_mem_restore_ctx clears the
# traceback frames before the error propagates.
with gpu_mem_restore_ctx():
    learn.fit_one_cycle(30)

epoch,train_loss,valid_loss,accuracy
1,2.402573,1.641406,0.603583
2,1.470369,0.982278,0.721500
3,1.102205,0.770284,0.762917
4,0.931413,0.675970,0.792833
5,0.843089,0.631572,0.804000
6,0.773870,0.607783,0.812583
7,0.747394,0.579102,0.823000
8,0.698355,0.569311,0.822667
9,0.665380,0.569775,0.824250
10,0.601234,0.540831,0.833250
11,0.583828,0.515747,0.842083
12,0.515766,0.514028,0.841167
13,0.501062,0.508455,0.844500
14,0.475348,0.509687,0.845667
15,0.463935,0.501683,0.846500
16,0.433365,0.493446,0.852083
17,0.399650,0.496941,0.854250
18,0.380342,0.498303,0.853250
19,0.350620,0.483819,0.857833
20,0.329624,0.486034,0.855667
21,0.320623,0.480329,0.858750
22,0.299029,0.477023,0.859917
23,0.264857,0.476886,0.863000
24,0.261844,0.477507,0.864083
25,0.254985,0.476712,0.864250
26,0.255878,0.476127,0.864417
27,0.231718,0.474823,0.866250
28,0.221085,0.473201,0.866667
29,0.221620,0.471682,0.865083
30,0.234261,0.476006,0.865417
