# Generate CASIA_1 Dataset Groundtruth Mask
<font size=3>There is no ground-truth mask in CASIA dataset. We subtract the real image from the manipulated image to generate the ground-truth mask. Median blur is used to eliminate noise.<br>
    
The original link to the dataset is no longer available. We obtain the dataset from the [link](https://www.kaggle.com/sophatvathana/casia-dataset).<br>
Based on this script, you can generate the groundtruth mask and test set text files for CASIA-1.<br><br>

Please note that we set some thresholds in the script, which may cause some images in the dataset to be ignored; you can process these images separately.<br><br>
We only provide the CASIA-1 script; you can modify it to preprocess the CASIA-2 dataset.</font>
    


In [1]:
import numpy as np
import cv2
import skimage.color as color
import skimage.io as io
import os

from glob import glob
# Dataset layout. Every directory string ends with '/' so that paths built
# by plain string concatenation (e.g. mask_dir + name) stay valid.
casia_dir = '../dataset/casia/CASIA1/'
sp_dir = casia_dir + 'Sp/'              # Tampered images path
mask_dir = '../dataset/casia/mask/'     # Mask path
probe_dir = '../dataset/casia/probe/'   # Probe path
au_dir = casia_dir + 'Au/'              # Authentic images path
ext = 'Sp*'                             # Glob pattern matching tampered image files
dataType = '.jpg'

# Create data storage path (exist_ok avoids the check-then-create race)
for out_dir in (mask_dir, probe_dir):
    os.makedirs(out_dir, exist_ok=True)

# All tampered images list; sorted so the processing order (and therefore the
# generated catalog) is reproducible instead of following directory order.
filenames = sorted(glob(os.path.join(sp_dir, ext)))
im_num = len(filenames)                 # Total number of tampered images found
im_id = 0                               # Running count of successfully saved samples


  from .collection import imread_collection_wrapper


In [2]:
# Generate bounding box label from groundtruth mask
def bounding_box(image,mask,row_data):
    """Return bounding-box labels for the tampered regions found in ``mask``.

    External contours are extracted from ``mask``; every contour whose area
    exceeds 1/400 of the frame (mask height / 20 * image width / 20) yields
    one label.

    Args:
        image: Image the mask belongs to; only ``image.shape[1]`` is read.
        mask: Single-channel binary mask passed to ``cv2.findContours``.
        row_data: Unused; kept so existing callers keep working.

    Returns:
        A list of ``'x1_y1_x2_y2'`` strings. If no contour is large enough,
        the list holds the box of the contour with the most points; if there
        are no contours at all, it holds ``'0_0_0_0'``.
    """
    def _box_label(contour):
        # Format a contour's upright bounding rectangle as 'x1_y1_x2_y2'.
        x, y, w, h = cv2.boundingRect(contour)
        return '{}_{}_{}_{}'.format(x, y, x + w, y + h)

    # [-2:] keeps the unpacking valid on OpenCV 3.x (which returns three
    # values) as well as on 2.x / 4.x (which return two).
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2:]
    print("contours num：", len(contours))
    # Longest contour first, so contours[0] is the fallback candidate below.
    contours = sorted(contours, key=len, reverse=True)
    min_area = mask.shape[0]/20*image.shape[1]/20

    box_list = []
    for i, contour in enumerate(contours):
        area = cv2.contourArea(contour)
        print(area, min_area)
        if area > min_area:
            print(i)
            box_list.append(_box_label(contour))

    if not box_list:
        if contours:
            # Nothing passed the area filter: fall back to the longest contour.
            box_list.append(_box_label(contours[0]))
        else:
            # No contour at all: emit an empty placeholder box.
            box_list.append('0_0_0_0')
    return box_list

In [3]:
# Generate CASIA mask
# For every tampered image: locate its authentic source image, diff the two in
# grayscale, clean the difference into a binary mask, keep only regions of a
# plausible size, then save the probe image and the mask under a name that
# encodes the bounding boxes. Images that cannot be processed are skipped.
test_name = []      # Names of all saved samples (without extension)
cls = 'tamper'      # Class label appended after every bounding box
for im in filenames:

    img_org = io.imread(im)
    img = img_org.copy()
    img_org = cv2.cvtColor(img_org, cv2.COLOR_RGB2BGR)  # cv2.imwrite expects BGR
    base_name = os.path.splitext(os.path.basename(im))[0]
    content = base_name.split("_")
    if len(content) < 7:
        # Fields 3..6 of the file name are needed below.
        print('skip {:s}: unexpected file name'.format(base_name))
        continue

    # Fields 4 and 5 name the two source images. Try them in that order and
    # use the first authentic image whose shape matches the tampered image.
    auth_img = None
    for au_tmp in content[4:6]:
        auth_name = 'Au_' + au_tmp[:-4] + '_' + au_tmp[-4:] + '*'  # Authentic image name
        candidates = glob(os.path.join(au_dir, auth_name))
        if not candidates:
            continue
        candidate = io.imread(candidates[0])
        if candidate.shape == img.shape:
            auth_img = candidate
            break
    if auth_img is None:
        # Without a same-shaped authentic image the pixel difference is undefined.
        print('skip {:s}: no matching authentic image'.format(base_name))
        continue

    # rgb2gray is the supported spelling; rgb2grey was removed from scikit-image.
    img = color.rgb2gray(img)
    auth_img = color.rgb2gray(auth_img)

    # Generate coarse mask
    mask = np.abs(img - auth_img) * 255
    # Eliminate noise
    _, mask = cv2.threshold(mask, 15, 255, cv2.THRESH_BINARY)  # After testing, the threshold is set to 15
    mask = mask.astype(np.uint8)
    mask = cv2.medianBlur(mask, 5)  # Remove noise
    _, mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)  # Binarization

    # Generate a new mask that contains only large regions
    # ([-2:] keeps the unpacking valid across OpenCV 3.x and 4.x).
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2:]
    contours = sorted(contours, key=len, reverse=True)
    min_area = mask.shape[0] / 20 * mask.shape[1] / 20   # 1/400 of the frame
    max_area = mask.shape[0] * mask.shape[1] * 0.7       # 70% of the frame
    c_max = [cnt for cnt in contours if min_area < cv2.contourArea(cnt) < max_area]
    if not c_max:
        # No region of a plausible size: this image is ignored.
        continue

    mask_save = np.zeros_like(mask, dtype=np.uint8)
    cv2.drawContours(mask_save, c_max, -1, (255), thickness=-1)

    # One 'x1_y1_x2_y2' label per kept contour, computed from that contour.
    box_list = []
    for cnt in c_max:
        x, y, w, h = cv2.boundingRect(cnt)
        box_list.append('{}_{}_{}_{}'.format(x, y, x + w, y + h))

    # Name layout: Tp_<src1>_<src2>_<field3><field6>_<box>_<cls>[_<box>_<cls>...]
    save_name = 'Tp' + '_' + content[4] + '_' + content[5] + '_' + content[3] + content[6] + '_'
    save_name += ''.join(box + '_' + cls + '_' for box in box_list)
    save_name = save_name.rstrip('_')

    cv2.imwrite(probe_dir + save_name + ".png", img_org)
    cv2.imwrite(mask_dir + save_name + ".png", mask_save)
    test_name.append(save_name)
    im_id += 1
    print('{:d}/{:d} images     {:s}'.format(im_id, im_num, save_name))

1/1 images     Tp_pla0005_pla0023_A0281_113_87_253_177_tamper


In [4]:
# Generate the final testing set catalog file
# One line per saved sample: "<sample name> <fields 4..-2 of the name> tamper",
# where the fields are the '_'-separated pieces of the sample name.
with open('../dataset/casia/test_all_single1.txt', 'w') as f:
    for pic in test_name:
        test_content = pic.split("_")
        # Equality, not substring membership: `x in cls` on a string would
        # also accept fragments such as 't' or 'amp'.
        if test_content[-1] == cls:
            content3 = ' '.join(test_content[4:-1])
            f.write('%s %s %s\n' % (pic, content3, 'tamper'))