-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
152 lines (121 loc) · 6.49 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import time
import torch
from options.train_options import TrainOptions
from data import create_dataset
from models import create_model
from util.visualizer import Visualizer
import wandb
from tqdm import tqdm
from util.util import get_logger
import ipdb
if __name__ == '__main__':
    # Training entry point: builds the train/validation datasets and the model
    # from the parsed options, then for every epoch runs one optimisation pass
    # over the training loader and one no-grad evaluation pass over the
    # validation loader, logging averaged loss / PSNR / SSIM / RMSE to the
    # file logger (and to Weights & Biases when opt.use_wandb is set).
    opt = TrainOptions().parse()  # get training options
    train_logger = get_logger(opt.checkpoints_dir + '/' + opt.name + '/train.log')

    # Training data: created from opt.dataset_mode and the other options.
    train_dataset = create_dataset(opt)
    train_dataloader = train_dataset.dataloader
    train_dataset_size = len(train_dataset)  # number of images in the dataset
    batch_size = opt.batch_size

    # Validation data: the same factory is reused with the phase temporarily
    # switched to 'test'; the phase is restored before the model is created.
    opt.phase = 'test'
    val_dataset = create_dataset(opt)
    val_dataloader = val_dataset.dataloader
    opt.phase = 'train'

    model = create_model(opt)  # create a model given opt.model and other options
    train_logger.info(model.netG)
    model.setup(opt)
    model.parallelize()

    if opt.use_wandb:
        wandb.init(project=opt.project, name=opt.name)

    print('The number of training images = %d' % train_dataset_size)

    # The schedule runs n_epochs followed by n_epochs_decay epochs; this total
    # is used for every "current/total" progress message so they all agree.
    total_epochs = opt.n_epochs + opt.n_epochs_decay
    use_gpu = len(opt.gpu_ids) > 0

    total_iters = 0  # the total number of training iterations
    # Smoothed per-sample optimisation time (0.995 / 0.005 moving average).
    # It is maintained here but not reported anywhere in this script.
    optimize_time = 0.1

    for epoch in tqdm(range(opt.epoch_count, total_epochs + 1)):
        epoch_start_time = time.time()  # timer for entire epoch
        epoch_iter = 0  # training iterations in the current epoch
        train_dataset.set_epoch(epoch)

        # ------------------------------ training ------------------------------
        running_loss = 0
        running_psnr = 0
        running_ssim = 0
        running_rmse = 0
        model.train()
        for data in tqdm(train_dataloader):
            total_iters += 1
            epoch_iter += 1

            # Synchronise around the optimisation step so the wall-clock
            # measurement covers the queued GPU work as well.
            if use_gpu:
                torch.cuda.synchronize()
            optimize_start_time = time.time()
            model.set_input(data)        # unpack data from dataset and apply preprocessing
            model.optimize_parameters()  # compute losses, get gradients, update weights
            if use_gpu:
                torch.cuda.synchronize()
            optimize_time = (time.time() - optimize_start_time) / batch_size * 0.005 + 0.995 * optimize_time

            loss_D, loss_G, psnr, ssim, rmse = model.compute_metrics()
            running_loss += loss_G
            running_psnr += psnr
            running_ssim += ssim
            running_rmse += rmse

            if total_iters % opt.print_freq == 0:
                # Print the metrics of the current batch every print_freq iterations.
                message = '(epoch: %d, iters: %d, loss_D: %.6f, loss_G: %.6f, train_psnr: %.4f, train_ssim: %.4f, train_rmse: %.4f) ' % (
                    epoch, epoch_iter, loss_D, loss_G, psnr, ssim, rmse)
                print(message)
                # NOTE(review): the original indentation was lost; the wandb
                # call is assumed to share the print_freq cadence - confirm.
                if opt.use_wandb:
                    wandb.log({"train_loss_D": loss_D,
                               "train_loss_G": loss_G,
                               'train_psnr': psnr,
                               'train_ssim': ssim,
                               'train_rmse': rmse})

        # Average over the number of batches; max(..., 1) keeps an empty loader
        # from raising ZeroDivisionError (the averages are then reported as 0).
        num_train_batches = max(len(train_dataloader), 1)
        epoch_loss = running_loss / num_train_batches
        epoch_psnr = running_psnr / num_train_batches
        epoch_ssim = running_ssim / num_train_batches
        epoch_rmse = running_rmse / num_train_batches
        train_logger.info('Epoch: [{}/{}],epoch_loss: {:.6f}, train_psnr: {:.4f}, train_ssim: {:.4f},epoch_rmse:{:.4f}'.format(
            epoch, total_epochs, epoch_loss, epoch_psnr, epoch_ssim, epoch_rmse))

        # ----------------------------- validation -----------------------------
        print('validation:')
        test_running_loss = 0
        test_running_psnr = 0
        test_running_ssim = 0
        test_running_rmse = 0
        with torch.no_grad():
            model.eval()
            for data in tqdm(val_dataloader):
                model.set_input(data)  # unpack data from data loader
                model.test()           # run inference
                # The first returned value (loss_D during training) is ignored here.
                _, loss, psnr, ssim, rmse = model.compute_metrics()
                test_running_loss += loss
                test_running_psnr += psnr
                test_running_ssim += ssim
                test_running_rmse += rmse

        num_val_batches = max(len(val_dataloader), 1)
        epoch_test_loss = test_running_loss / num_val_batches
        epoch_test_psnr = test_running_psnr / num_val_batches
        epoch_test_ssim = test_running_ssim / num_val_batches
        epoch_test_rmse = test_running_rmse / num_val_batches
        train_logger.info('val:Epoch: [{}/{}],epoch_loss: {:.6f}, val_psnr: {:.4f}, val_ssim: {:.4f},test_rmse: {:.4f}'.format(
            epoch, total_epochs, epoch_test_loss, epoch_test_psnr, epoch_test_ssim, epoch_test_rmse))

        if opt.use_wandb:
            wandb.log({"epoch_train_loss": epoch_loss,
                       'epoch_train_psnr': epoch_psnr,
                       'epoch_train_ssim': epoch_ssim,
                       'epoch_train_rmse': epoch_rmse,
                       "epoch_test_loss": epoch_test_loss,
                       'epoch_test_psnr': epoch_test_psnr,
                       'epoch_test_ssim': epoch_test_ssim,
                       'epoch_test_rmse': epoch_test_rmse,
                       'epoch': epoch,
                       })

        if epoch % opt.save_epoch_freq == 0:  # cache our model every <save_epoch_freq> epochs
            print('saving the model at the end of epoch %d, iters %d' % (epoch, total_iters))
            model.save_networks('latest')
            model.save_networks(epoch)

        torch.cuda.empty_cache()
        print('End of epoch %d / %d \t Time Taken: %d sec' % (epoch, total_epochs, time.time() - epoch_start_time))
        model.update_learning_rate()  # update learning rates at the end of every epoch

    # NOTE(review): the four statements below look like leftover debugging: one
    # more optimisation step on a fresh batch, then an interactive ipdb
    # breakpoint that blocks unattended runs. They are kept to preserve the
    # existing behaviour, and their placement after the epoch loop is inferred
    # because the original indentation was lost - confirm, and remove if unneeded.
    data = next(iter(train_dataloader))
    model.set_input(data)
    model.optimize_parameters()
    ipdb.set_trace()
    print('finish training')