# Image parsing for Autofocus

In [2]:
import numpy as np
import os
import pandas as pd
import shutil

## 1. Separate the two positions

In [62]:
# Generate two position files from the original one.
# Raw strings keep the Windows backslashes literal (no invalid "\H" / "\A" escapes).
file_path = r"D:\Hugo\Autofocus/labels_pos.txt"
pos_paths = [r"D:\Hugo\Autofocus/pos0_labels.txt", r"D:\Hugo\Autofocus/pos1_labels.txt"]

BLOCK_LEN = 9  # one "rep..." header line followed by eight label lines

# Open the outputs with "w" (not "a") so that re-running the cell does not
# append a second copy of every record to the position files.
with open(file_path, "r") as ifl, open(pos_paths[0], "w") as pos0, open(pos_paths[1], "w") as pos1:
    positions = [pos0, pos1]  # in the original file the blocks alternate: pos0, pos1, pos0, pos1, ...
    lines = ifl.readlines()

    reading_pos_ = 1   # flipped to 0 when the first header is met
    written = BLOCK_LEN  # nothing is copied before the first header

    for line in lines:
        if line.startswith("rep"):  # header of a replicate: start a new block
            # Switch destination once per header (not once per line), so the
            # alternation does not depend on the block length being odd.
            reading_pos_ = 1 - reading_pos_
            written = 0

        # Copy at most BLOCK_LEN lines per block; a truncated last block is
        # copied as far as it goes instead of raising IndexError.
        if written < BLOCK_LEN:
            positions[reading_pos_].write(line)
            written += 1

print("Done!")

Done!


In [63]:
# Separate the images between positions
# pos1 -> Train set, pos0 -> Test set
# Raw strings keep the Windows backslashes literal (no invalid "\H" / "\A" / "\i" escapes).
images_path = r"D:\Hugo\Autofocus\imgs_for_AF_ML_direct_training_june18th_2021"
test_set_path, train_set_path = r"D:\Hugo\Autofocus/Test_Set", r"D:\Hugo\Autofocus/Train_Set"

# Make sure both destination folders exist before moving anything into them.
for destination in (test_set_path, train_set_path):
    os.makedirs(destination, exist_ok=True)

for im in os.listdir(images_path):

    if "pos0" in im:
        shutil.move(f"{images_path}/{im}", f"{test_set_path}/{im}")

    elif "pos1" in im:
        shutil.move(f"{images_path}/{im}", f"{train_set_path}/{im}")

print("Done.")
train_size, test_size = len(os.listdir(train_set_path)), len(os.listdir(test_set_path))
print(f"Train set size = {train_size}")
print(f"Test set size = {test_size}")  # was mislabelled "Train set size"

Done.
Train set size = 2584
Train set size = 2584


## 2. Reformat the metadata file

In [64]:
# Append the replicate inside each row of the metadata table
# instead of having a header
for pos_metadata in pos_paths:
    # splitext removes only the extension; str.strip(".txt") would strip any
    # run of the characters ".", "t" and "x" from BOTH ends of the path.
    new_metadata_path = os.path.splitext(pos_metadata)[0] + "_table.csv"

    with open(pos_metadata, 'r') as metadata, open(new_metadata_path, 'w') as new_metadata:
        new_metadata.write("replicate\tidx\tz\n")

        repl = -1  # becomes 0 at the first header
        for line in metadata:

            if line.startswith("rep"):  # update the replicate index at each header
                repl += 1
                continue

            if not line.strip():  # tolerate blank lines (e.g. a trailing newline)
                continue

            # Each data line holds two whitespace-separated fields: idx and z.
            idx, z = line.split()
            new_metadata.write(f"{repl}\t{idx}\t{z}\n")

print("Tables created.")

Tables created


## 3. Rename the images to encode the position, replicate and z index

In [65]:
# Rename files to pos{pos}_repl{repl}_z{z}.tiff so they group by replicate (left commented out; presumably already run once - TODO confirm)
# metadata_train, metadata_test = "D:\Hugo\Autofocus/pos1_labels_table.csv", "D:\Hugo\Autofocus/pos0_labels_table.csv"

# for pos, metadata in zip([train_set_path, test_set_path], [metadata_train, metadata_test]):
#     metadata = pd.read_csv(metadata, sep="\t")
    
#     os.chdir(pos)
    
#     for file in os.listdir(pos):
#         pos, repl, z = int(file.split("_")[3].strip("pos")), int(file.strip(".tiff").split("_")[-1].strip("t")), int(file.split("_")[2].strip("optimz"))
        
#         os.rename(file, f"pos{pos}_repl{repl}_z{z}.tiff")

In [66]:
# record the image file names and the associated position, replicate and z index
# Raw strings keep the Windows backslashes literal (no invalid "\H" / "\A" escapes).
metadata_im_train, metadata_im_test = r"D:\Hugo\Autofocus/metadata_im_train_set.csv", r"D:\Hugo\Autofocus/metadata_im_test_set.csv"


def _drop_prefix(token, prefix):
    """Return `token` without its leading `prefix` (unchanged if the prefix is absent)."""
    return token[len(prefix):] if token.startswith(prefix) else token


# Image names are parsed as "pos<P>_repl<R>_z<Z>.<ext>".
for set_dir, metadata in zip([train_set_path, test_set_path], [metadata_im_train, metadata_im_test]):

    records = []
    for im in os.listdir(set_dir):
        # Remove the extension as a suffix; str.strip(".tiff") would strip a
        # character set from both ends of the name instead.
        stem, _ext = os.path.splitext(im)
        parts = stem.split("_")

        if len(parts) < 3:  # not an image following the naming scheme
            print(f"Skipping unexpected file: {set_dir}/{im}")
            continue

        records.append({
            "pos": _drop_prefix(parts[0], "pos"),
            "repl": _drop_prefix(parts[1], "repl"),
            "z": _drop_prefix(parts[2], "z"),
            "file_name": im,
        })

    # Explicit columns keep the header stable even when a folder is empty.
    df = pd.DataFrame(records, columns=["pos", "repl", "z", "file_name"])

    df.to_csv(metadata, sep="\t")

## 4. Join the label tables with the image metadata

In [67]:
# Attach to every (replicate, idx) label row the name of the matching image file.
# Raw strings keep the Windows backslashes literal (no invalid "\H" / "\A" escapes).
label_tables = [r"D:\Hugo\Autofocus/pos1_labels_table.csv", r"D:\Hugo\Autofocus/pos0_labels_table.csv"]
image_tables = [r"D:\Hugo\Autofocus/metadata_im_train_set.csv", r"D:\Hugo\Autofocus/metadata_im_test_set.csv"]

for pos_data_path, im_data_path in zip(label_tables, image_tables):
    data, im_data = pd.read_csv(pos_data_path, sep="\t"), pd.read_csv(im_data_path, sep="\t")

    # One file per (repl, z) key, keeping the first occurrence like the
    # original row-by-row lookup did. The image table's "z" column holds the
    # z index, so it is renamed to line up with the label table's "idx".
    lookup = (
        im_data[["repl", "z", "file_name"]]
        .drop_duplicates(subset=["repl", "z"], keep="first")
        .rename(columns={"repl": "replicate", "z": "idx"})
    )

    # A left merge keeps the label rows in their original order.
    file_names = data[["replicate", "idx"]].merge(lookup, on=["replicate", "idx"], how="left")["file_name"]

    # Fail loudly, naming the offending keys, instead of an opaque IndexError.
    missing = file_names.isna().to_numpy()
    if missing.any():
        unmatched = data.loc[missing, ["replicate", "idx"]].to_dict("records")
        raise LookupError(f"No image found in {im_data_path} for {unmatched}")

    data["file_name"] = file_names.to_numpy()
    save_path = f"{pos_data_path}_final.csv"  # NOTE: yields "..._table.csv_final.csv"; name kept as is
    data.to_csv(save_path, sep=",")

print("Done!")

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
3 0
3 1
3 2
3 3
3 4
3 5
3 6
3 7
4 0
4 1
4 2
4 3
4 4
4 5
4 6
4 7
5 0
5 1
5 2
5 3
5 4
5 5
5 6
5 7
6 0
6 1
6 2
6 3
6 4
6 5
6 6
6 7
7 0
7 1
7 2
7 3
7 4
7 5
7 6
7 7
8 0
8 1
8 2
8 3
8 4
8 5
8 6
8 7
9 0
9 1
9 2
9 3
9 4
9 5
9 6
9 7
10 0
10 1
10 2
10 3
10 4
10 5
10 6
10 7
11 0
11 1
11 2
11 3
11 4
11 5
11 6
11 7
12 0
12 1
12 2
12 3
12 4
12 5
12 6
12 7
13 0
13 1
13 2
13 3
13 4
13 5
13 6
13 7
14 0
14 1
14 2
14 3
14 4
14 5
14 6
14 7
15 0
15 1
15 2
15 3
15 4
15 5
15 6
15 7
16 0
16 1
16 2
16 3
16 4
16 5
16 6
16 7
17 0
17 1
17 2
17 3
17 4
17 5
17 6
17 7
18 0
18 1
18 2
18 3
18 4
18 5
18 6
18 7
19 0
19 1
19 2
19 3
19 4
19 5
19 6
19 7
20 0
20 1
20 2
20 3
20 4
20 5
20 6
20 7
21 0
21 1
21 2
21 3
21 4
21 5
21 6
21 7
22 0
22 1
22 2
22 3
22 4
22 5
22 6
22 7
23 0
23 1
23 2
23 3
23 4
23 5
23 6
23 7
24 0
24 1
24 2
24 3
24 4
24 5
24 6
24 7
25 0
25 1
25 2
25 3
25 4
25 5
25 6
25 7
26 0
26 1
26 2
26 3
26 4
26 5
26 6
26 7


## 5. Train-validation split

In [27]:
import numpy as np
import os
import pandas as pd
import shutil

# build train and validation dataframes (metadata) from directories already built
# Raw strings keep the Windows backslashes literal (no invalid "\H" / "\A" escapes).
os.chdir(r"D:\Hugo\Autofocus/Regression/")  # the relative csv paths below are resolved from here
train_dir, val_dir, test_dir = r"D:\Hugo\Autofocus/Regression/Train_Set", r"D:\Hugo\Autofocus/Regression/Validation_Set", r"D:\Hugo\Autofocus/Regression/Test_Set"
df_paths = ["pos0_data.csv", "pos1_data.csv", "pos2_data.csv", "pos3_data.csv", "pos4_data.csv", "pos5_data.csv"]  # data from each set
df_list = [pd.read_csv(path, sep=",") for path in df_paths]  # list of dataframes
train_df_save, val_df_save, test_df_save = "train_set_table.csv", "validation_set_table.csv", "test_set_table.csv"

train_set, val_set, test_set = [], [], []
unmatched_images = []  # images present on disk but absent from every position table
for directory, ls in zip([train_dir, val_dir, test_dir], [train_set, val_set, test_set]):

    for im in os.listdir(directory):

        found = False
        for df in df_list:
            matches = df[df["file_name"] == im]
            if not matches.empty:
                ls.append(matches.iloc[0][["file_name", "z"]])
                found = True
                break  # stop at the first table that knows the image: one row per image

        if not found:
            unmatched_images.append(f"{directory}/{im}")

if unmatched_images:
    # Previously these images were dropped without any notice.
    print(f"Warning: {len(unmatched_images)} image(s) without metadata were skipped:")
    for path in unmatched_images:
        print(f"  {path}")

# Explicit columns keep the tables well-formed even when a directory is empty.
table_columns = ["file_name", "z"]
train_set, validation_set, test_set = (
    pd.DataFrame(rows, columns=table_columns) for rows in (train_set, val_set, test_set)
)
print(train_set.shape, validation_set.shape, test_set.shape)

train_set.to_csv(train_df_save, sep=",")
validation_set.to_csv(val_df_save, sep=",")
test_set.to_csv(test_df_save, sep=",")

(152, 2) (77, 2) (600, 2)
