# Data Splitter

The data is provided in two folders, *manipulated* and *original*, but it needs to be split into training, validation, and testing sets.

Apart from dividing the data into three datasets, we also need to make sure that the different types of manipulation are evenly distributed across all of them.

In [None]:
import os
import glob
import random
from PIL import Image
import matplotlib.pyplot as plt

### Directory Management

In [2]:
# Create the output folders for the three splits.
# os.makedirs also creates the parent "./data", and exist_ok=True keeps the
# cell re-runnable: plain os.mkdir raises FileExistsError once a folder exists.
for split_dir in ("./data/train", "./data/val", "./data/test"):
    os.makedirs(split_dir, exist_ok=True)

In [3]:
# Collect the PNG paths of both source folders.
manipulated_paths = glob.glob('./data0/manipulated/*.png')
original_paths = glob.glob('./data0/original/*.png')

# One independent, initially empty list per manipulation type.
f2f, eyes, mouth, nt, df, fs = ([] for _ in range(6))

# Placeholder; reassigned once the per-type lists have been populated.
manipulated_images_paths = []

In [4]:
if not manipulated_paths:
    # The glob matched nothing, so name the folder that was actually searched.
    print("Could not find any images in ./data0/manipulated!  " + "Make sure you put the data0 directory here: " + os.getcwd())
else:
    # The manipulation type is the file-name prefix before the first
    # underscore; each known prefix maps to the list collecting its paths.
    lists_by_type = {
        'F2F': f2f,
        'eyes': eyes,
        'mouth': mouth,
        'NT': nt,
        'DF': df,
        'FS': fs,
    }
    skipped = 0
    for path in manipulated_paths:
        name = os.path.split(path)[-1]
        manipulation_type = name.split('_')[0]

        target_list = lists_by_type.get(manipulation_type)
        if target_list is None:
            # Unknown prefix: the file is not added to any list.
            skipped += 1
        else:
            target_list.append(path)

    # Report files that were left out instead of dropping them silently.
    if skipped:
        print(f"Skipped {skipped} file(s) with an unrecognised manipulation type prefix.")

In [5]:
# Group the per-type lists so that later cells can loop over all types.
manipulated_images_paths = [
    f2f,
    eyes,
    mouth,
    nt,
    df,
    fs,
]

In [6]:
# Print how many image paths were collected for each manipulation type.
for paths_of_type in manipulated_images_paths:
    print(len(paths_of_type))

1335
1333
1333
1333
1333
1333


Dividing images into folders for training, validation, and testing.

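Approximately 75% of the images go to training, 12.5% to validation, and 12.5% to testing (per manipulation type: 1000 for training, 166 for validation, and the remaining images for testing).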

In [7]:
import shutil

# Training split: draw 1000 random images of every manipulation type and
# move them to ./data/train. `counter` gives each moved file a unique number.
counter = 0
for bucket in manipulated_images_paths:
    for _ in range(1000):
        # pop() removes the drawn path, so no image is picked twice.
        source = bucket.pop(random.randint(0, len(bucket) - 1))
        manipulation_type = os.path.basename(source).split('_')[0]
        shutil.move(source, f'./data/train/{manipulation_type}.{counter}.png')
        counter += 1

# Then 3000 random originals, continuing the same numbering.
for _ in range(3000):
    source = original_paths.pop(random.randint(0, len(original_paths) - 1))
    shutil.move(source, f'./data/train/original.{counter}.png')
    counter += 1

In [8]:
import shutil

# Validation split: draw 166 random images of every manipulation type from
# what is still in the lists and move them to ./data/val.
# Numbering restarts at 0 for this folder.
counter = 0
for bucket in manipulated_images_paths:
    for _ in range(166):
        # pop() removes the drawn path, so no image is picked twice.
        source = bucket.pop(random.randint(0, len(bucket) - 1))
        manipulation_type = os.path.basename(source).split('_')[0]
        shutil.move(source, f'./data/val/{manipulation_type}.{counter}.png')
        counter += 1

# Then 500 random originals, continuing the same numbering.
for _ in range(500):
    source = original_paths.pop(random.randint(0, len(original_paths) - 1))
    shutil.move(source, f'./data/val/original.{counter}.png')
    counter += 1

In [9]:
import shutil

# Test split: move every image still left in the per-type lists to
# ./data/test, in random order. Numbering restarts at 0 for this folder.
counter = 0
for bucket in manipulated_images_paths:
    # Each pass pops one path, so the loop ends when the list is empty.
    while bucket:
        source = bucket.pop(random.randint(0, len(bucket) - 1))
        manipulation_type = os.path.basename(source).split('_')[0]
        shutil.move(source, f'./data/test/{manipulation_type}.{counter}.png')
        counter += 1

# Then 500 random originals, continuing the same numbering.
for _ in range(500):
    source = original_paths.pop(random.randint(0, len(original_paths) - 1))
    shutil.move(source, f'./data/test/original.{counter}.png')
    counter += 1

In [10]:
# Folders of the three dataset splits. The trailing slash is part of each
# value, so a file name can be appended by plain string concatenation.
DIR_TRAIN, DIR_VAL, DIR_TEST = (
    './data/train/',
    './data/val/',
    './data/test/',
)

In [11]:
def get_train_transform():
    """Build the transform pipeline used for training images.

    The steps run in this order: horizontal flip with probability 0.5,
    random rotation (argument 15), random crop (argument 204), conversion
    to a tensor, and normalisation with mean (0, 0, 0) and std (1, 1, 1).

    NOTE(review): ``T`` is defined outside this cell; presumably it is
    ``torchvision.transforms`` -- confirm.
    """
    augmentations = [
        T.RandomHorizontalFlip(p=0.5),
        T.RandomRotation(15),
        T.RandomCrop(204),
    ]
    tensor_steps = [
        T.ToTensor(),
        T.Normalize((0, 0, 0), (1, 1, 1)),
    ]
    return T.Compose(augmentations + tensor_steps)
    
def get_val_transform():
    """Build the transform pipeline used for validation images.

    No augmentation is applied: the image is converted to a tensor and
    normalised with mean (0, 0, 0) and std (1, 1, 1).

    NOTE(review): ``T`` is defined outside this cell; presumably it is
    ``torchvision.transforms`` -- confirm.
    """
    steps = [
        T.ToTensor(),
        T.Normalize((0, 0, 0), (1, 1, 1)),
    ]
    return T.Compose(steps)

In [12]:
class DeepFakeDataset(Dataset):
    """Image dataset whose class label is encoded in each file name.

    The label of an image is found by looking up the part of its file name
    before the first ``"."`` in ``class_to_int``.

    Args:
        imgs: Sequence of image file names (without folder).
        class_to_int: Mapping from file-name prefix to a numeric label.
        mode: ``"train"`` or ``"val"`` make ``__getitem__`` return
            ``(image, label)``; ``"test"`` makes it return only the image.
        transforms: Callable applied to the resized PIL image. If ``None``,
            the resized PIL image is returned unchanged.
        img_dir: Folder that contains ``imgs``. If ``None``, it is chosen
            from ``mode``: ``DIR_TRAIN``, ``DIR_VAL`` or ``DIR_TEST``
            (``DIR_TRAIN`` for any other mode).
    """

    def __init__(self, imgs, class_to_int, mode="train", transforms=None,
                 img_dir=None):
        super().__init__()
        self.imgs = imgs
        self.class_to_int = class_to_int
        self.mode = mode
        self.transforms = transforms

        if img_dir is None:
            # Read each split from its own folder. Always reading from
            # DIR_TRAIN would load the wrong files for "val" and "test",
            # because the file numbering restarts in every split folder.
            img_dir = {
                "train": DIR_TRAIN,
                "val": DIR_VAL,
                "test": DIR_TEST,
            }.get(mode, DIR_TRAIN)
        self.img_dir = img_dir

    def _apply_transforms(self, img):
        """Return ``img`` after ``self.transforms``, or as-is if it is None."""
        if self.transforms is None:
            return img
        return self.transforms(img)

    def __getitem__(self, idx):
        """Load image ``idx``, resize it to 224x224 and apply the transforms.

        Returns:
            ``(image, label)`` in "train"/"val" mode, where ``label`` is a
            float32 tensor; only ``image`` in "test" mode; ``None`` for any
            other mode.

        Raises:
            KeyError: In "train"/"val" mode, if the file-name prefix is not
                a key of ``class_to_int``.
        """
        image_name = self.imgs[idx]

        # resize() returns a new image, so the file can be closed right away.
        with Image.open(os.path.join(self.img_dir, image_name)) as raw:
            img = raw.resize((224, 224))

        if self.mode == "train" or self.mode == "val":
            # The class name is the file-name part before the first dot.
            label = self.class_to_int[image_name.split(".")[0]]
            label = torch.tensor(label, dtype=torch.float32)
            return self._apply_transforms(img), label

        if self.mode == "test":
            return self._apply_transforms(img)

        return None

    def __len__(self):
        """Return the number of image file names in the dataset."""
        return len(self.imgs)

In [13]:
# os.listdir returns entries in arbitrary order and lists every entry in the
# folder. Sorting makes the file order reproducible, and keeping only .png
# names leaves out stray non-image entries.
train_imgs = sorted(name for name in os.listdir(DIR_TRAIN) if name.endswith('.png'))
val_imgs = sorted(name for name in os.listdir(DIR_VAL) if name.endswith('.png'))
test_imgs = sorted(name for name in os.listdir(DIR_TEST) if name.endswith('.png'))