# demo.sh
1. Use OpenPose to detect keypoints (body joints) in the input images located in the 'input/' directory. These keypoints are written to the 'cache/' directory.
2. run MATLAB script1
3. test segmentation model using test_seg.lua and latest_2.t7
4. run matlab script2
5. test initial language encoding model using test_lang_initial.py
6. run lua script test_gan.lua (generates outputs)
7. create output folder

Files needed to implement:  
[openpose.bin](https://github.com/CMU-Perceptual-Computing-Lab/openpose)  
[script1](https://vscode.dev/github/jak-weston/WardrobeWizard/blob/main/demo.sh.ipynb#C3:L19)  
[test_seg.lua](https://vscode.dev/github/jak-weston/WardrobeWizard/blob/main/demo.sh.ipynb#C6)  
[script2](https://vscode.dev/github/jak-weston/WardrobeWizard/blob/main/demo.sh.ipynb#C8)  
[test_lang_initial.py](https://vscode.dev/github/jak-weston/WardrobeWizard/blob/main/demo.sh.ipynb#C10)  
[test_gan.lua](https://vscode.dev/github/jak-weston/WardrobeWizard/blob/main/demo.sh.ipynb#C13:L109)  

In [None]:
# Stop early if CURRENT_DIR is unset or empty: the OpenPose paths below would
# otherwise silently resolve to /input/ and /cache.
: "${CURRENT_DIR:?CURRENT_DIR must point to the directory that holds input/ and cache/}"

# 1. Detect body keypoints for every image in input/ and write them to cache/.
./build/examples/openpose/openpose.bin --image_dir "$CURRENT_DIR/input/" --no_display --write_keypoint="$CURRENT_DIR/cache"

# 2. Crop/resize the inputs around the largest detected person (script1).
matlab -nodesktop -nojvm -r "script1; exit"

# 3. Run the segmentation model; test_seg.lua reads the model path from SEG_MODEL.
SEG_MODEL=./latest_2.t7 th test_seg.lua

# 4. Refine the segmentation and encode the text descriptions (script2).
matlab -nodesktop -nojvm -r "script2; exit"

# 5. Run the language encoder.
python test_lang_initial.py

# 7. Create the output folder. This runs before step 6 because test_gan.lua
#    saves its images into ./output/; -p makes the call safe to repeat.
mkdir -p output

# 6. Generate the output images.
th test_gan.lua

# script1
MATLAB script that preprocesses the input images by resizing and cropping them to focus on the largest person detected in each image, and then saves the processed images in the cache/ directory. Additionally, it creates a text file listing the names of the processed images.

1. lists all the PNG images in the input/ directory.
2. checks if there are any input images available.
3. creates a text file named script1.txt in the cache/ directory to store the processed image names.
4. For each input image:
    * parses the YAML file containing pose information generated by OpenPose located in the cache/ directory.
    * extracts the pose coordinates of detected keypoints (body joints) using a custom function (parseYML).
    * selects the largest person in the image based on the bounding box around the pose keypoints.
    * calculates a transformation to resize and crop the image to a standardized size of 224x224 pixels, focusing on the selected person.
    * applies the transformation to the input image using imtransform MATLAB function.
    * saves the transformed image in the cache/ directory.
    * writes the filename (without extension) of the processed image to the script1.txt file (located in cache/).

## parseYML.m
reads size information and data from a YAML file, reshapes the data into a 3D array, and returns the array with dimensions consistent with MATLAB conventions.



In [None]:
% script1: crop every input image around its largest detected person.
%
% Reads   ./input/*.png and the matching ./cache/<name>_pose.yml files.
% Writes  ./cache/<name>.png (224x224 crops) and ./cache/script1.txt, which
%         lists the processed image names (without extension), one per line.
%
% NOTE: cp2tform and imtransform are kept as in the original script; newer
% MATLAB releases recommend fitgeotrans / imwarp instead.

% 1. List all the PNG images in the input/ directory.
fn = dir('./input/*.png');

% 2. Check that there is at least one input image. Only *.png files are
%    listed above, so the message names png only.
assert(~isempty(fn), 'There are no input images in ./input/ (png only).');

% 3. Create script1.txt in the cache/ directory to store the processed image names.
f = fopen('./cache/script1.txt','w');
assert(f ~= -1, 'Cannot open ./cache/script1.txt for writing.');

% 4. For each input image:
for i = 1:length(fn)
    n = fn(i).name;
    [~, stem] = fileparts(n);  % file name without its extension

    % 4.1. Parse the pose file written for this image into cache/.
    yml_fn = ['./cache/' stem '_pose.yml'];
    x = parseYML(yml_fn);

    % 4.2. Bounding box [xmin xmax ymin ymax] for each entry along the first
    %      dimension of x (one entry per detected person). Only keypoints
    %      whose third value exceeds 0.1 are used; a person without any
    %      such keypoint keeps an all-zero box.
    xdxd = zeros(size(x,1), 4);
    for j = 1:size(x,1)
        y = squeeze(x(j,:,:));
        idx = find(y(:,3) > 0.1);
        y = y(idx, :);
        if ~isempty(idx)
            xdxd(j,1) = min(y(:,1));
            xdxd(j,2) = max(y(:,1));
            xdxd(j,3) = min(y(:,2));
            xdxd(j,4) = max(y(:,2));
        end
    end

    % 4.3. Select the person whose bounding box has the largest area.
    [~, id] = max((xdxd(:,4)-xdxd(:,3)) .* (xdxd(:,2)-xdxd(:,1)));

    % Skip images without a usable detection: with no person at all id is
    % empty, and a zero-height box would divide by zero in step 4.4. A
    % skipped image is not written to script1.txt, so the later stages
    % (which are driven by that list) ignore it as well.
    if isempty(id) || xdxd(id,4) <= xdxd(id,3)
        warning('script1:noPerson', 'No usable pose found for %s; image skipped.', n);
        continue;
    end

    % 4.4. Compute where the selected box should land in the 224x224 window.
    xmin = xdxd(id,1);
    xmax = xdxd(id,2);
    ymin = xdxd(id,3);
    ymax = xdxd(id,4);
    win_size = 224;
    cx = 0.5;          % target box centre, as a fraction of the window
    cy = 0.5;
    ylen = 0.75/2;     % half-height: the box spans 75% of the window height
    xlen = ylen / (ymax - ymin) * (xmax - xmin);  % same scale horizontally
    target_xmin = win_size * (cx - xlen);
    target_xmax = win_size * (cx + xlen);
    target_ymin = win_size * (cy - ylen);
    target_ymax = win_size * (cy + ylen);

    % 4.5. Fit a similarity transform from the box corners to the target
    %      corners and apply it to the input image.
    t = cp2tform([xmin,ymin;xmax,ymin;xmin,ymax;xmax,ymax],...
        [target_xmin,target_ymin; target_xmax, target_ymin; ...
        target_xmin, target_ymax; target_xmax, target_ymax],...
        'nonreflective similarity');
    im = imread(['./input/' n]);
    im = imtransform(im2single(im), t, 'XData', [1 win_size], ...
        'YData', [1 win_size], 'XYScale', 1);

    % 4.6. Save the transformed image in the cache/ directory.
    imwrite(im, ['./cache/' n]);

    % 4.7. Record the image name (without extension) in script1.txt.
    fprintf(f, '%s\n', stem);
end

% 5. Close the script1.txt file.
fclose(f);


In [None]:
function x = parseYML(yml_file_name)
% parseYML  Read the numeric array stored in a pose .yml file.
%
%   x = parseYML(yml_file_name) reads the three-element "sizes" entry s and
%   the "data" list from the file and returns the data as an array of size
%   s(1) x s(2) x s(3).
%
%   The file is expected to hold two header lines, a line of the form
%   "sizes: [ a, b, c ]", and a "data: [ v1, v2, ... ]" list with a*b*c
%   values. Values are taken to be listed with the last dimension varying
%   fastest (this is what the reshape/permute pair below undoes).
%
%   Raises an error when the file cannot be opened, when "sizes" does not
%   have three entries, or when fewer than prod(s) values can be read.

% 1. Open the file for reading and make sure it is closed again on every
%    exit path, including the assertion failures below.
fid = fopen(yml_file_name, 'r');
assert(fid ~= -1, 'parseYML:cannotOpen', 'Cannot open %s.', yml_file_name);
closer = onCleanup(@() fclose(fid));

% 2. Skip the first two lines of the file.
fgetl(fid);
fgetl(fid);

% 3. Read the three numbers of the "sizes" entry.
s = fscanf(fid, '   sizes: [ %f, %f, %f ]');
s = s(:)';
assert(length(s) == 3, 'parseYML:badSizes', ...
    'Expected three "sizes" entries in %s.', yml_file_name);

% 4. Discard everything up to the next line break. The "data: [" tag itself
%    is consumed by the fscanf call in step 5, not here.
fgetl(fid);

% 5. Consume the start of the data list.
fscanf(fid, '   data: [ ');

% 6. Read prod(s) comma-separated values.
data = fscanf(fid, '%f, ', prod(s));
assert(numel(data) == prod(s), 'parseYML:shortData', ...
    'Expected %d values in %s but read %d.', prod(s), yml_file_name, numel(data));

% 7. MATLAB fills arrays first-dimension-first, so reshape with the sizes
%    reversed ...
x = reshape(data, s([3 2 1]));

% 8. ... and reverse the dimension order, giving size s(1) x s(2) x s(3).
x = permute(x, [3 2 1]);

end

# test_seg.lua
This script  takes the preprocessed images generated by script1, passes them through the pre-trained segmentation model, and saves the segmentation results as a MAT file.
1. loads required libraries/modules: cunn, cudnn, image, and matio (MATLAB input/output library).
2. reads the list of processed image names from the script1.txt file located in the ./cache/ directory.
3. loads a pre-trained segmentation model from the path specified in the SEG_MODEL environment variable.
4. initializes a tensor input to hold the processed images.
5. iterates over each image:
    * Loads the image using image.load.
    * Converts the image to a FloatTensor and assigns it to the corresponding index in the input tensor.
6. moves the input tensor to the GPU (the script calls `:cuda()` unconditionally, so a CUDA device is required).
7. passes the input tensor through the segmentation model to obtain the output.
8. applies softmax to the output tensor to get class probabilities.
9. saves the class probabilities to a MATLAB MAT file named test_seg.mat in the ./cache/ directory.

Look into:  
SEG_MODEL

In [None]:
-- test_seg.lua: run the segmentation model on the images listed in
-- ./cache/script1.txt and save the per-class probabilities to
-- ./cache/test_seg.mat (variable 'prob').

-- 1. Load required Torch modules
require 'cunn'
require 'cudnn'
require 'image'
local matio = require 'matio'

-- 2. Read the list of processed image names from the script1.txt file.
--    assert turns a missing file into a readable error instead of a
--    nil-index failure on the next line.
local namesFile = assert(io.open('./cache/script1.txt'))
local nameList = {}
for line in namesFile:lines() do
    nameList[#nameList + 1] = line
end
namesFile:close()
local n_images = #nameList
assert(n_images > 0, './cache/script1.txt lists no images')

-- 3. Load the segmentation model from the path given in $SEG_MODEL
local model_path = assert(os.getenv('SEG_MODEL'),
    'environment variable SEG_MODEL is not set')
local model = torch.load(model_path)

-- 4. Allocate one 3 x 224 x 224 float slot per image
local win_size = 224
local input = torch.zeros(n_images, 3, win_size, win_size):float()

-- 5. Load every cached image as a 3-channel float tensor into its slot
for i = 1, n_images do
    local im = image.load('./cache/' .. nameList[i] .. '.png', 3, 'float')
    input[{{i},{},{},{}}] = im
end

-- 6. Move the input tensor to the GPU (unconditional: a CUDA device is required)
input = input:cuda()

-- 7. Pass the whole batch through the segmentation model
local output = model:forward(input)

-- 8. Apply a spatial softmax to the output and bring the result back as floats
local prob = cudnn.SpatialSoftMax():cuda():forward(output):float()

-- 9. Save the probabilities to a MATLAB MAT file
matio.save('./cache/test_seg.mat', {prob=prob})

# script2
1. loads preprocessed image names from script1.txt and segmentation probabilities from test_seg.mat
2. rearranges the segmentation probabilities tensor (prob) to the correct dimensions
3. defines parameters window size (win_size) and the number of classes (L)
4. Loops through each image and performs post-processing on segmentation results
    * Utilizes the DenseCRF algorithm (DCRF) to refine the segmentation results (bb) based on the input image and probabilities.
    * maps the refined segmentation results to a predefined set of labels (label_assign).
    * resizes and prepares the segmentation results for further processing.
5. Language Preprocessing
    * extracts and preprocesses text descriptions associated with each image
    * maps words in descriptions to numerical codes using the predefined "map"
6. saves processed segmentation results (seg_final), language embeddings (codej) and resized segmentation maps (b_) to a MATLAB MAT file script2.mat

Look into:
DCRF.mex

In [None]:
% script2: refine the segmentation with DCRF, build the low-resolution and
% 128x128 label maps, and turn each text description into word codes.
%
% Reads   ./cache/script1.txt, ./cache/test_seg.mat, ./cache/<name>.png,
%         ./input/<name>.txt and the variable 'map' from the file map.
% Writes  ./cache/script2.mat with lr, codeJ, b_ and seg_final.

clear;
addpath('./mex');  % make the functions in ./mex (DCRF is called below) callable
load map map;      % load the variable 'map'; it is used below as map(word)

f = fopen('./cache/script1.txt','r');  % list of processed image names
assert(f ~= -1, 'Cannot open ./cache/script1.txt.');
load ./cache/test_seg.mat prob;        % segmentation probabilities
prob = permute(prob, [3,4,2,1]);       % reorder so that prob(:,:,:,i) is the slice for image i

win_size = 224;  % window size
L = 18;          % number of segmentation classes
i = 0;           % number of images processed so far

% Maps each of the L fine labels to one of 4 coarse labels (used for lr).
label_assign = [1,1,2,3,3,   3,3,3,3,3,   2,3,3,3,3,   3,3,4];
lrc = cell(10000,1);        % per-image 8x8x4 coarse maps
codeJ = cell(10000,1);      % per-image word codes
seg_final = cell(10000,1);  % per-image refined label maps

% Process every image named in script1.txt.
while ~feof(f)
    fn = fgetl(f);
    % Defensive: fgetl returns -1 at end of file and '' for a blank line;
    % neither names an image, so skip them instead of failing in imread.
    if ~ischar(fn) || isempty(strtrim(fn))
        continue;
    end
    i = i + 1;

    img = imread(['./cache/' fn '.png']);
    bb = DCRF(img, prob(:,:,:,i), L, [3 3 5 30 30 10 10 10 9 5]);  % DenseCRF refinement

    % Shift the labels by one so that they match the 1..L tests below
    % (presumably DCRF returns 0-based labels -- TODO confirm).
    bb = bb + 1;
    seg_final{i} = bb;

    % Collapse the fine labels to the 4 coarse labels.
    b_temp = bb;
    for j = 1:L
        b_temp(bb == j) = label_assign(j);
    end

    % One 8x8 downsampled indicator map per coarse label.
    lrc{i} = zeros([8,8,4]);
    for j = 1:4
        lrc{i}(:,:,j) = imresize(single(b_temp == j), [8,8]);
    end

    % Read the first line of the description and strip trailing periods.
    fid = fopen(['./input/' fn '.txt'], 'r');
    assert(fid ~= -1, 'Cannot open the description file ./input/%s.txt.', fn);
    s = fgetl(fid);
    fclose(fid);
    assert(ischar(s), 'The description file ./input/%s.txt is empty.', fn);
    s = strtrim(s);
    while ~isempty(s) && s(end) == '.'   % the isempty test keeps s(end) valid
        s = s(1:end-1);
    end

    % Split into words. Removing a period that followed a space leaves a
    % trailing blank, which strsplit turns into an empty token; drop those
    % so that map is only queried with real words.
    ss = strsplit(s, ' ');
    ss = ss(~cellfun('isempty', ss));
    for j = 1:length(ss)
        codeJ{i} = [codeJ{i} double(map(ss{j}))];  % look up the code of each word
    end
    codeJ{i} = codeJ{i}(:);  % store the codes as a column vector
end

fclose(f);
lrc = lrc(1:i);      % drop the unused preallocated cells
codeJ = codeJ(1:i);

lr = zeros(8,8,4,i);  % stack the coarse maps into one 8 x 8 x 4 x i array
for j = 1:i
    lr(:,:,:,j) = lrc{j};
end

seg_final = seg_final(1:i);

fine_size = 128;
b_ = zeros(fine_size, fine_size, 1, i);  % resized label maps, one per image
% Second mapping of the L fine labels, this time onto the values 0..6.
label_assign = [1,1,2,3,4, 4,4,4,5,5, 2,5,5,6,6, 4,3,0];

% Remap the refined labels and resize them to 128x128 (nearest neighbour
% keeps the values as labels).
for j = 1:i
    t = seg_final{j};
    v = t;
    for k = 1:L
        v(t == k) = label_assign(k);
    end
    b_(:,:,:,j) = imresize(v, [fine_size, fine_size], 'nearest');
end

% Save the results for the later stages.
save('./cache/script2.mat', 'lr', 'codeJ', 'b_', 'seg_final');
# test_lang_initial.py
The neural network consists of an RNN layer followed by several linear layers. The RNN layer processes sequential input data (language embeddings) and generates hidden states. The linear layers take the last hidden state from the RNN and produce predictions for different attributes (category, color, gender, sleeve).
1. implement the __init__ method
    * Use nn.RNN for the RNN layer and nn.Linear for the linear layers.
    * Define the dimensions for the input vocabulary (dim_voc), hidden state (dim_h), and other output dimensions (dim_cate_new, dim_color, dim_gender, dim_sleeve).
    * Specify the number of RNN layers (num_layers).
2. implement the forward method
    * Initialize the hidden state (h0) with zeros using torch.zeros.
    * Pass the input tensor (x) through the RNN layer using self.rnn.
    * Extract the last hidden state (hn2) from the RNN output.
    * Pass hn2 through the linear layers (net_cate_new, net_color, net_gender, net_sleeve) to get predictions for different attributes.
    * Return the final hidden state (hn2) along with the predictions.
3. Create Model Instance
    * Create an instance of the define_network class.
    * Move the model to the GPU using .cuda() to utilize GPU acceleration.
    * Load pre-trained weights into the model using model.load_state_dict(torch.load('rnn_latest.pth')).
    * Set the model to evaluation mode using model.eval().
4. Pass Input Through the Model:
    * Convert the input data (language embeddings) to the appropriate format (one-hot encoding).
    * Pass the input through the model by calling the model instance with the input tensor as an argument.
    * Obtain the final hidden state and predictions from the model output.
5. Save the computed final hidden states to a MATLAB MAT file for further analysis.

In [None]:
# Import necessary libraries and modules
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from scipy.io import loadmat, savemat

# Word codes produced by script2: one entry per image, read from script2.mat.
mat = loadmat('./cache/script2.mat')
codeJ = mat['codeJ']

# Sizes shared by the network definition and the inference loop below.
dim_voc = 539    # vocabulary size (length of each one-hot input vector)
dim_h = 100      # hidden-state size of the RNN
num_layers = 2   # number of stacked RNN layers
bsz = 1          # batch size: one description is encoded at a time

# Output sizes of the four attribute heads.
dim_cate_new, dim_color, dim_gender, dim_sleeve = 19, 17, 2, 4

# Define the neural network architecture
class define_network(nn.Module):
    """RNN sentence encoder with four linear attribute heads.

    The layer sizes come from the module-level constants ``dim_voc``,
    ``dim_h``, ``num_layers``, ``dim_cate_new``, ``dim_color``,
    ``dim_gender`` and ``dim_sleeve``.
    """

    def __init__(self):
        super(define_network, self).__init__()
        # nn.RNN is built with its default layout, i.e. inputs are
        # (seq_len, batch, dim_voc).
        self.rnn = nn.RNN(dim_voc, dim_h, num_layers)
        self.net_cate_new = nn.Linear(dim_h, dim_cate_new)
        self.net_color = nn.Linear(dim_h, dim_color)
        self.net_gender = nn.Linear(dim_h, dim_gender)
        self.net_sleeve = nn.Linear(dim_h, dim_sleeve)

    def forward(self, x):
        """Encode ``x`` and predict the four attributes.

        Args:
            x: input of shape (seq_len, batch, dim_voc).

        Returns:
            Tuple ``(hn2, y_cate_new, y_color, y_gender, y_sleeve)`` where
            ``hn2`` is the final hidden state of the last RNN layer, shape
            (batch, dim_h), and each ``y_*`` is the output of the matching
            linear head applied to ``hn2``.
        """
        # Zero initial hidden state. It is sized from x and allocated with
        # x's own tensor type/device, so the module is not tied to the
        # global batch size or to CUDA.
        h0 = Variable(x.data.new(num_layers, x.size(1), dim_h).zero_())
        # Forward pass through the RNN; only the final hidden states are used
        _, hn = self.rnn(x, h0)
        # Final hidden state of the last (top) layer
        hn2 = hn[-1]
        # Pass the hidden state through linear layers to get predictions
        y_cate_new = self.net_cate_new(hn2)
        y_color = self.net_color(hn2)
        y_gender = self.net_gender(hn2)
        y_sleeve = self.net_sleeve(hn2)
        return hn2, y_cate_new, y_color, y_gender, y_sleeve

# Create the network, move it to the GPU, load the pre-trained weights and
# switch to evaluation mode.
model = define_network()
model.cuda()
model.load_state_dict(torch.load('rnn_latest.pth'))
model.eval()

# One row of final hidden states per sample
test_hn2 = np.zeros((len(codeJ), dim_h))

# Encode every description in codeJ
for sample_id in range(len(codeJ)):
    c = codeJ[sample_id][0]   # word codes of this sample, indexed as c[i][0]
    l = len(c)
    # One-hot input of shape (words, batch, vocabulary)
    cuda_c_onehot = torch.zeros(l, bsz, dim_voc).cuda()

    for i in range(l):
        # The -1 turns the stored code into a 0-based index (the codes are
        # presumably 1-based, coming from MATLAB -- TODO confirm).
        word = int(c[i][0]) - 1
        # A code of 0 would become index -1, which Python indexing accepts
        # and which would silently mark the last vocabulary entry.
        if not 0 <= word < dim_voc:
            raise ValueError(
                "sample %d, word %d: code %r is outside 1..%d"
                % (sample_id, i, c[i][0], dim_voc))
        cuda_c_onehot[i][0][word] = 1

    # Wrap the one-hot encoding in a Variable and pass it through the network
    cuda_c_onehot_v = Variable(cuda_c_onehot)
    hn2, _, _, _, _ = model(cuda_c_onehot_v)

    # Keep the hidden state of the single batch entry
    test_hn2[sample_id] = hn2.data[0].cpu().numpy()

# Save the final hidden states to a MATLAB MAT file (variable 'hn2')
result = {"hn2": test_hn2}
savemat("./cache/test_lang_initial.mat", result)

# test_gan.lua
implement a neural network-based image synthesis process.

1. Load Configuration: 
    * Initialize the testConf table with various configuration (number of input, output channels, window sizes, and the number of conditions)
2. Load Data: Load data from MAT files and perform necessary operations:
    * Load segmentation results (lr) and language embeddings (text) from MAT files.
    * Load ih_mean matrix.
    * Load test_set_b_ array.
    * Load pre-trained models (G1, G2) from files (sr1.t7, ih1_skip.t7).
3. Preprocess Data:
    * Permute dimensions and ensure data is contiguous.
    * Normalize data and create noise (z).
4. Forward Pass Through Model:
    * Pass the input data (z, text, lr) through the pre-trained models (G1, G2).
    * Apply convolutions and other operations to generate the synthesized image.
5. Postprocess and Save Results:
    * Postprocess the output to obtain the final synthesized image.
    * Save the synthesized images to files.

files used in the Lua script:
'./sr1.t7'  
'./ih1_skip.t7'  
'./ih_mean.mat'  
'./cache/script2.mat'  
'./cache/test_lang_initial.mat'  
'./cache/script1.txt'  
'./cache/' .. line .. '.png' (where line is a filename read from 'script1.txt')  

In [None]:
require 'nngraph'
require 'cunn'
require 'cudnn'
require 'image'
local matio = require 'matio'

-- Fixed settings of the test-time pipeline.
local testConf = {}
testConf.n_map_all = 7
testConf.n_z = 80
testConf.nt_input = 100
testConf.n_condition = 4
testConf.disp_win_id = 50
testConf.win_size = 128
testConf.lr_win_size = 8
testConf.nc = 3
testConf.n_condition_2 = 3

-- The results are saved into ./output/ at the end of this script, so make
-- sure the directory exists before any of the expensive work is done.
require 'paths'
paths.mkdir('./output')

-- Load the inputs. The permutes put the sample dimension first and the
-- channel dimension second.
local lr = matio.load('./cache/script2.mat','lr')
lr = lr:permute(4,3,1,2):contiguous()
local m = lr:size(1)  -- number of samples
local text = matio.load('./cache/test_lang_initial.mat', 'hn2')
text = text:contiguous():view(m, testConf.nt_input, 1, 1)
local ih_mean_temp = matio.load('./ih_mean.mat', 'ih_mean')
local ih_mean = ih_mean_temp:permute(3,1,2):contiguous():view(1,testConf.nc,testConf.win_size,testConf.win_size)
local test_set_b_ = matio.load('./cache/script2.mat', 'b_')
-- contiguous() returns its result instead of modifying the tensor in place,
-- so the result has to be assigned.
test_set_b_ = test_set_b_:permute(4,3,1,2):contiguous()

-- Load the two pre-trained generators; each file stores its generator in field G.
local a = torch.load('./sr1.t7')
local G1 = a.G
a = nil
local b = torch.load('./ih1_skip.t7')
local G2 = b.G
b = nil

-- First generator: noise + text encoding + low-resolution maps.
lr = lr:cuda()
text = text:cuda()
local z = torch.Tensor(m, testConf.n_z, 1,1)
z = z:cuda()
z:normal(0,1)
local out1 = G1:forward{z, text, lr}:float()

-- Smooth every output map of G1 with a 5x5 Gaussian kernel.
local kernel = image.gaussian(5, 3):float():contiguous()
for i = 1,m do
    for j = 1, testConf.n_map_all do
        local map = out1[{{i},{j},{},{}}]:view(testConf.win_size, testConf.win_size)
        out1[{{i},{j},{},{}}] = image.convolve(map, kernel, 'same'):contiguous():view(1,1,testConf.win_size, testConf.win_size)
    end
end

-- Per-pixel index of the strongest map; index n_map_all is stored as 0.
-- Both results are declared local so that they do not leak as globals.
local _, out1pmax = torch.max(out1, 2)
out1pmax[out1pmax:eq(testConf.n_map_all)] = 0

-- cb[j][k] is the value written into condition channel j for label k
-- (scaled by 0.25 below); H is the 5x5 kernel used to blur each channel.
local cb = {torch.Tensor{3,2,1,1,2,3,2}, torch.Tensor{2,3,3,2,1,1,2}, torch.Tensor{1,1,2,3,3,2,2}}
for i = 1,testConf.n_condition_2 do cb[i] = cb[i] * 0.25 end
local H = torch.Tensor{0.0030,0.0133,0.0219,0.0133,0.0030,0.0133,0.0596,0.0983,0.0596,0.0133,0.0219,0.0983,0.1621,0.0983,0.0219,0.0133,0.0596,0.0983,0.0596,0.0133,0.0030,0.0133,0.0219,0.0133,0.0030}:view(5,5):float()

-- Build the n_condition_2-channel condition input of G2 from the label map.
local batch_condition = torch.Tensor(m, testConf.n_condition_2, testConf.win_size, testConf.win_size)
for i = 1,m do
    local t = out1pmax[{{i},{1},{},{}}]  -- label map of sample i
    for j = 1,testConf.n_condition_2 do
        local u = torch.Tensor(1,1,testConf.win_size, testConf.win_size)
        for k = 1,testConf.n_map_all do
            -- k % n_map_all runs over 1..n_map_all-1 and then 0, i.e. over
            -- every label value left in out1pmax, so all of u is filled.
            u[t:eq(k%testConf.n_map_all)] = cb[j][k]
        end
        local v = image.convolve(u:squeeze():float(), H:float(), 'same'):contiguous()
        batch_condition[{{i},{j},{},{}}] = v:view(1,1,testConf.win_size, testConf.win_size)
    end
end
batch_condition = batch_condition - 0.5

-- Second generator: fresh noise + text encoding + condition maps.
z:normal(0,1)
batch_condition = batch_condition:cuda()
local out2 = G2:forward{z, text, batch_condition}:float()

-- Load the cached input images, scaled to win_size x win_size, and remember
-- their names for saving.
local namesFile = assert(io.open('./cache/script1.txt'))
local test_set_ih = torch.zeros(m, testConf.nc, testConf.win_size, testConf.win_size):float()
local nameList = {}
local idx = 0
for line in namesFile:lines() do
    idx = idx + 1
    test_set_ih[{{idx},{},{},{}}] = image.scale(image.load('./cache/' .. line .. '.png', 3, 'float'), testConf.win_size, testConf.win_size)
    nameList[idx] = line
end
namesFile:close()

-- Combine the output of G2 with the input image, channel by channel.
local out2_final = out2:clone()
for i = 1,m do
    for j = 1, testConf.nc do
        local t = out2[{{i},{j},{},{}}] + ih_mean[{{},{j},{},{}}]  -- add the mean back
        local s = test_set_ih[{{i},{j},{},{}}]                      -- input image channel
        local ori_b_ = test_set_b_[{{i},{},{},{}}]                  -- label map from script2
        local now_b_ = out1pmax[{{i},{},{},{}}]                     -- label map from G1
        -- Where the script2 label is 1 or 2, keep the pixels of the input image.
        t[ori_b_:eq(1)] = s[ori_b_:eq(1)]
        t[ori_b_:eq(2)] = s[ori_b_:eq(2)]
        -- For labels 5 and 6 (when present in both maps), average the
        -- generated pixels with the median input value of that label.
        if ori_b_:eq(5):sum() > 0 and now_b_:eq(5):sum() > 0 then
            local sc5 = s[ori_b_:eq(5)]:median()
            t[now_b_:eq(5)] = (t[now_b_:eq(5)] + sc5[1]) / 2
        end
        if ori_b_:eq(6):sum() > 0 and now_b_:eq(6):sum() > 0 then
            local sc6 = s[ori_b_:eq(6)]:median()
            t[now_b_:eq(6)] = (t[now_b_:eq(6)] + sc6[1]) / 2
        end
        out2_final[{{i},{j},{},{}}] = t
    end
end

-- Save one image per sample under its original name.
for i = 1, m do
    image.save('./output/' .. nameList[i] .. '.png', out2_final[{{i},{},{},{}}]:view(testConf.nc, testConf.win_size, testConf.win_size))
end


# net_graph_sr1.lua
Gshape:

Starts from local input_data = nn.Identity()() to local g_extra = g1 - deconv(ngf*16, ngf*8, 4,4,2,2,1,1) - bn4(ngf*8) - relu(inplace)
This part processes the original segmentation map and the encoded text to generate an intermediate feature map.
Gimage:

Starts from local input_condition = nn.Identity()() to local g5 = g4 - deconv(ngf, nc, 4,4,2,2,1,1)
This part processes the intermediate feature map and the condition input to generate the final output image.

In [None]:
require 'nngraph'

--- Initialise the parameters of a single module.
-- Passed to `net:apply` below, which calls it once per module.
--  * modules whose torch type name contains 'Convolution': weights are drawn
--    with `normal(0.0, 0.02 / 16)` and `noBias()` is called on the module;
--  * modules whose type name contains 'BatchNormalization': weights are drawn
--    with `normal(1.0, 0.02)` and the bias is set to 0, each only when the
--    field exists;
--  * every other module is left untouched.
-- @param m the module to initialise
local function weights_init(m)
   local name = torch.type(m)
   if name:find('Convolution') then
      m.weight:normal(0.0, 0.02 / 16)
      m:noBias()
   elseif name:find('BatchNormalization') then
      if m.weight then m.weight:normal(1.0, 0.02) end
      if m.bias then m.bias:fill(0) end
   end
end

-- cudnn.SpatialSoftMax is used by the discriminator below, so load cudnn
-- here instead of relying on the calling script having done it.
require 'cudnn'

-- Load configuration
local config = dofile('./config_sr1.lua')

local nc = config.n_map_all          -- channels of the generator output / discriminator input
local ncondition = config.n_condition

-- NOTE(review): test_gan.lua names its noise size `n_z`; confirm that
-- config_sr1.lua really defines `nz`.
local nz = config.nz
local nt_input = config.nt_input
local nt = config.nt

local ndf = 64
local ngf = 64
local inplace = true

local bn4 = nn.SpatialBatchNormalization
local conv = nn.SpatialConvolution
local deconv = nn.SpatialFullConvolution
local relu = nn.ReLU
local lerelu = nn.LeakyReLU

-- ------------------------------------------------------------------
-- Generator (netG), part 1: first input + text encoding -> feature map
-- ------------------------------------------------------------------

-- First generator input. It is joined with the nt-channel text features and
-- fed to a layer expecting nz+nt channels, so it carries nz channels
-- (test_gan.lua passes the noise z here).
local input_data = nn.Identity()()
-- Second generator input: text encoding with nt_input channels.
local input_encode = nn.Identity()()

-- Project the text encoding from nt_input to nt channels
local h1 = input_encode - conv(nt_input, nt, 1, 1) - lerelu(0.2, inplace)

-- Concatenate both along the channel dimension
local input_data_encode = nn.JoinTable(2)({input_data, h1})

-- First two up-sampling stages
local g1 = input_data_encode - deconv(nz+nt, ngf*16, 4,4) - bn4(ngf*16) - relu(inplace)
local g_extra = g1 - deconv(ngf*16, ngf*8, 4,4,2,2,1,1) - bn4(ngf*8) - relu(inplace)

-- ------------------------------------------------------------------
-- Generator (netG), part 2: condition branch and remaining up-sampling
-- ------------------------------------------------------------------

-- Third generator input: condition with ncondition channels
-- (test_gan.lua passes the low-resolution maps lr here).
local input_condition = nn.Identity()()

-- Process condition input
local f1 = input_condition - conv(ncondition, ngf, 3,3,1,1,1,1) - bn4(ngf) - lerelu(0.2, inplace)
local f_extra = f1 - conv(ngf, ngf*2, 3,3,1,1,1,1) - bn4(ngf*2) - lerelu(0.2, inplace)

-- Join both branches: ngf*8 + ngf*2 = ngf*10 channels
local g2 = nn.JoinTable(2)({g_extra, f_extra}) - deconv(ngf*10, ngf*4, 4,4,2,2,1,1) - bn4(ngf*4) - relu(inplace)

-- Three 3x3 convolutions at constant resolution
local m1 = g2 - conv(ngf*4, ngf*8, 3,3,1,1,1,1) - bn4(ngf*8) - lerelu(0.2, inplace)
local m2 = m1 - conv(ngf*8, ngf*8, 3,3,1,1,1,1) - bn4(ngf*8) - lerelu(0.2, inplace)
local m3 = m2 - conv(ngf*8, ngf*4, 3,3,1,1,1,1) - bn4(ngf*4) - lerelu(0.2, inplace)

-- Remaining up-sampling stages; the last one outputs nc channels
local g3 = m3 - deconv(ngf*4, ngf*2, 4,4,2,2,1,1) - bn4(ngf*2) - relu(inplace)
local g4 = g3 - deconv(ngf*2, ngf*1, 4,4,2,2,1,1) - bn4(ngf*1) - relu(inplace)
local g5 = g4 - deconv(ngf, nc, 4,4,2,2,1,1)

-- Assemble the generator: inputs {data, text encoding, condition} -> g5
local netG = nn.gModule({input_data, input_encode, input_condition},{g5})
netG:apply(weights_init)

-- ------------------------------------------------------------------
-- Discriminator (netD)
-- ------------------------------------------------------------------

-- First discriminator input: an nc-channel map, passed through a softmax
local output_data = nn.Identity()()
local output_data_softmax = output_data - cudnn.SpatialSoftMax()

local d1 = output_data_softmax - conv(nc, ndf, 4, 4, 2, 2, 1, 1) - lerelu(0.2, inplace)
local d2 = d1 - conv(ndf, ndf*2, 4, 4, 2, 2, 1, 1) - bn4(ndf*2) - lerelu(0.2, inplace)
local d3 = d2 - conv(ndf*2, ndf*4, 4, 4, 2, 2, 1, 1) - bn4(ndf*4) - lerelu(0.2, inplace)

local mid1 = d3 - conv(ndf*4, ndf*8, 3, 3, 1, 1, 1, 1) - bn4(ndf*8) - lerelu(0.2, inplace)
local mid2 = mid1 - conv(ndf*8, ndf*8, 3, 3, 1, 1, 1, 1) - bn4(ndf*8) - lerelu(0.2, inplace)
local mid3 = mid2 - conv(ndf*8, ndf*4, 3, 3, 1, 1, 1, 1) - bn4(ndf*4) - lerelu(0.2, inplace)

local d4 = mid3 - conv(ndf*4, ndf*8, 4, 4, 2, 2, 1, 1) - bn4(ndf*8) - lerelu(0.2, inplace)

-- Third discriminator input: the condition, with its own small branch
local output_condition = nn.Identity()()
local c1 = output_condition - conv(ncondition, ndf, 3, 3, 1, 1, 1, 1) - lerelu(0.2, inplace)
local c2 = c1 - conv(ndf, ndf*2, 3, 3, 1, 1, 1, 1) - bn4(ndf*2) - lerelu(0.2, inplace)

-- Join data and condition branches: ndf*8 + ndf*2 = ndf*10 channels
local d_extra = nn.JoinTable(2)({d4, c2}) - conv(ndf*10, ndf*8, 4, 4, 2, 2, 1, 1) - bn4(ndf*8) - lerelu(0.2, inplace)

-- ------------------------------------------------------------------
-- Discriminator: text branch and final decision
-- ------------------------------------------------------------------

-- Second discriminator input: text encoding, projected to nt channels and
-- replicated before it is joined with d_extra along the channel dimension
local output_encode = nn.Identity()()
local b1 = output_encode - conv(nt_input, nt, 1, 1) - bn4(nt) - lerelu(0.2, inplace) - nn.Replicate(4, 3) - nn.Replicate(4, 4)
local d_extra_b1 = nn.JoinTable(2)({d_extra, b1}) - conv(ndf*8 + nt, ndf*8, 1, 1) - bn4(ndf*8) - lerelu(0.2, inplace)

local d5 = d_extra_b1 - conv(ndf*8, 1, 4, 4) - nn.Sigmoid()

-- Assemble the discriminator: inputs {data, text encoding, condition} -> d5
local netD = nn.gModule({output_data, output_encode, output_condition},{d5})
netD:apply(weights_init)

return netG, netD


# train_lua
the implementation of the first stage of FashionGAN, which involves generating human segmentation maps based on the input images and textual descriptions. The generated segmentation maps serve as the basis for the subsequent stage of texture rendering to produce the final synthesized images.

1. Initialization and Setup: The code initializes necessary libraries, loads configuration files, and sets up the required environment for training the GAN.

2. Data Preparation: The code loads data, including low-resolution images (lr), text encodings (text), and segmentation maps (b_). It preprocesses the data for training.

3. Model Setup: The GAN architecture for generating the segmentation map (Gshape) is defined and loaded. The Discriminator network (D) is also set up.

4. Training Loop: The code contains a training loop where the Generator (Gshape) and Discriminator (D) are trained iteratively. The training process involves optimizing the parameters of both networks using the Adam optimizer.

5. Loss Calculation: Within the training loop, the code computes the adversarial loss for the Discriminator (errD) and the Generator (errG). Additionally, it computes the segmentation loss (errSeg) to ensure that the generated segmentation map matches the desired output.

6. Visualization and Logging: The code periodically displays the training progress and visualizes the generated segmentation maps.

7. Model Saving: The trained models are saved periodically for later use or evaluation.

In [None]:
-- Libraries used by the training script. Lua comments start with `--`;
-- `#` is the length operator and would make these lines fail to parse.
require 'nngraph' -- graph-style network containers (nn.gModule)
require 'cunn'    -- CUDA (GPU) backend for nn
require 'cudnn'   -- cuDNN bindings
require 'hdf5'    -- HDF5 file access
require 'image'   -- image processing helpers
require 'optim'   -- optimization algorithms
require 'paths'   -- file system helpers
local vis = dofile('../codes_lua/vis.lua') -- project visualization helpers
local matio = require 'matio'              -- MATLAB .mat file reader

-- Training indices, flattened to a 1-D tensor.
-- NOTE(review): this is an implicit global (no `local`). It is kept global
-- here because scripts loaded later via dofile may read it -- confirm, then
-- consider making it local.
train_ind = matio.load('../data_release/benchmark/ind.mat','train_ind'):view(-1)

local dispSurrogate = dofile('../codes_lua/dispSurrogate.lua') -- project display helper
local disp = require 'display'                                 -- display package
local getNet = dofile('../codes_lua/getNet.lua')               -- project network helper
-- Make torch.Tensor(...) create FloatTensors from here on.
torch.setdefaulttensortype('torch.FloatTensor')

-- Experiment name; this script only handles the 'sr1' stage.
local theme = 'sr1'
assert(theme == 'sr1')

-- Hyper-parameters and the two networks are defined in sibling scripts.
local config = dofile('./config_sr1.lua')
-- net_graph_sr1.lua returns two values: the generator, then the discriminator.
local G, D = dofile('./net_graph_sr1.lua')

-- Read the whole '/b_' dataset into memory, then release the file handle.
local h5file = hdf5.open('../data_release/supervision_signals/G1.h5', 'r')
local b_ = h5file:read('/b_'):all()
h5file:close()
local n_file = b_:size(1) -- length of the first dimension of b_

-- Low-resolution inputs stored under variable 'd' in the .mat file.
local lr = matio.load('../data_release/test_phase_inputs/sr1_8.mat','d')
-- Reorder dimensions so the original 4th dimension becomes the first.
-- NOTE(review): presumably this yields sample x channel x H x W -- confirm.
lr = lr:permute(4,3,1,2)

-- Text encodings stored under variable 'hn2'; force a contiguous layout so
-- that later slices can be reshaped with :view().
local text = matio.load('../data_release/test_phase_inputs/encode_hn2_rnn_100_2_full.mat', 'hn2')
text = text:contiguous()

-- Optimizer settings, overriding whatever config_sr1.lua provided.
config.lr = 0.0002 -- learning rate
config.beta1 = 0.5 -- beta1 (first-moment decay) for the Adam optimizer

-- Loss functions.
local criterion = nn.BCECriterion()                  -- binary cross entropy for the GAN real/fake decision
local cri_seg = cudnn.SpatialCrossEntropyCriterion() -- cross entropy for the segmentation output

-- Separate optimizer state tables for the generator and the discriminator,
-- both starting from the same learning rate and beta1.
local optimStateG = {
    learningRate = config.lr,
    beta1 = config.beta1,
}
local optimStateD = {
    learningRate = config.lr,
    beta1 = config.beta1,
}

local nz = config.nz -- size of the noise vector

-- Preallocated mini-batch buffers. torch.Tensor(sizes...) only allocates;
-- the contents are filled in by the sampling code.
local input = torch.Tensor(config.batchSize, config.n_map_all, config.win_size, config.win_size)
local condition = torch.Tensor(config.batchSize, config.n_condition, config.lr_win_size, config.lr_win_size)

print(nz)
local noise = torch.Tensor(config.batchSize, nz, 1, 1)
local label = torch.Tensor(config.batchSize)
local encode = torch.Tensor(config.batchSize, config.nt_input, 1, 1)
local seg_target = torch.Tensor(config.batchSize, config.win_size, config.win_size)
local errD, errG -- declared only; no value is assigned here

-- Select GPU 1 and move the buffers onto it.
cutorch.setDevice(1)
input = input:cuda()
noise = noise:cuda()
label = label:cuda()
condition = condition:cuda()
encode = encode:cuda()
seg_target = seg_target:cuda()
local input_record -- declared only; no value is assigned here

-- Buffers for the mismatched ("wrong") samples, same shapes as input/condition.
local input_wrong = torch.Tensor(config.batchSize, config.n_map_all, config.win_size, config.win_size)
local condition_wrong = torch.Tensor(config.batchSize, config.n_condition, config.lr_win_size, config.lr_win_size)
input_wrong = input_wrong:cuda()
condition_wrong = condition_wrong:cuda()

-- Switch both networks to cuDNN implementations when the package loads.
-- The successful pcall(require, ...) has already loaded the module, so no
-- second require is needed inside the branch.
if pcall(require, 'cudnn') then
    cudnn.benchmark = true
    cudnn.convert(G, cudnn)
    cudnn.convert(D, cudnn)
end

-- Move networks and criteria to the GPU.
D:cuda()
G:cuda()
criterion:cuda()
cri_seg:cuda()

-- Flattened parameter and gradient tensors for each network. This must come
-- after the :cuda() calls so the flattened tensors are the GPU ones.
local parametersD, gradParametersD = D:getParameters()
local parametersG, gradParametersG = G:getParameters()

-- Extra GPU buffer with the same shape as `input`.
local normal_holder = torch.Tensor(config.batchSize, config.n_map_all, config.win_size, config.win_size):cuda()

--- Write a one-hot encoding of a label map into sample slot `i` of `dst`.
-- Channel j is set to 1 wherever the label equals j % config.n_map_all and
-- to 0 elsewhere, so label 0 ends up in the last channel.
-- @param dst     batch tensor addressed as dst[{{i},{j},{},{}}]
-- @param i       sample slot inside the batch
-- @param labels  label-map slice taken from b_
local function fill_one_hot(dst, i, labels)
    for j = 1,config.n_map_all do
        -- Range indexing returns a view that shares storage with dst, so
        -- zeroing and mask-filling the view updates dst in place.
        local channel = dst[{{i},{j},{},{}}]:zero()
        channel[labels:eq(j % config.n_map_all)] = 1
    end
end

--- Fill the preallocated batch buffers with a fresh random mini-batch.
-- Writes the matched sample into input / condition / encode / seg_target,
-- a mismatched sample into input_wrong / condition_wrong, and redraws noise.
-- Takes no arguments and returns nothing; all results are side effects on
-- the enclosing buffers.
local simple_sample = function()
    local n_train = train_ind:size(1)
    local batch = config.batchSize

    -- Positions of the matched samples: the first `batch` entries of a
    -- random permutation of 1..n_train.
    local ind = torch.randperm(n_train):narrow(1,1,batch)
    local ind_wrong = torch.Tensor(batch)
    for i = 1,batch do
        local pos = ind[i]
        -- Shift by an offset in 1..n_train-1 with wrap-around, which always
        -- lands on a position different from `pos`.
        local pos_wrong = (pos + math.random(n_train-1) - 1) % n_train + 1
        -- Translate positions into actual dataset indices.
        ind[i] = train_ind[pos]
        ind_wrong[i] = train_ind[pos_wrong]
    end
    noise:normal(0,1) -- redraw noise from N(0, 1)

    for i = 1,batch do
        -- Matched sample: one-hot label map, condition, text encoding and
        -- the raw label map as segmentation target.
        local labels = b_[{{ind[i]},{1},{},{}}]
        fill_one_hot(input, i, labels)
        condition[{{i},{},{},{}}] = lr[{{ind[i]},{},{},{}}]
        encode[{{i},{},{},{}}] = text[{{ind[i]},{}}]:view(1, config.nt_input, 1, 1)
        seg_target[{{i},{},{}}] = labels:view(1,config.win_size,config.win_size)

        -- Mismatched sample: one-hot label map and condition only.
        fill_one_hot(input_wrong, i, b_[{{ind_wrong[i]},{1},{},{}}])
        condition_wrong[{{i},{},{},{}}] = lr[{{ind_wrong[i]},{},{},{}}]
    end
    -- Relabel 0 as n_map_all in the target, matching the one-hot layout
    -- above where label 0 occupies the last channel.
    seg_target[seg_target:eq(0)] = config.n_map_all
end

-- Target values written into `label` for real and generated samples.
local real_label = 1
local fake_label = 0

-- Timers created for the training loop.
local epoch_tm = torch.Timer()
local tm = torch.Timer()
local data_tm = torch.Timer()

-- Loss values shared with the closures defined below; declared here without
-- a value.
local errD_real, errD_wrong, errD_fake -- discriminator loss terms
local errSeg                           -- segmentation loss
local fDx = function(x) # Function to compute Discriminator loss and gradients
   gradParametersD:zero() # Zero out Discriminator gradients

   simple_sample() # Sample data
   input_record = input:clone() # Clone input tensor for record
   label:fill(real_label) # Fill label tensor with real label

   local output = D:forward{input, encode, condition} # Forward pass through Discriminator
   errD_real = criterion:forward(output, label) # Compute real error
   local de_do = criterion: