In [2]:
# Imports

import numpy as np  # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import shutil # high-level operations on files
from tqdm import tqdm # Progress bar and status logging
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

import cv2 # computer vision algorithms

# Importing the Keras libraries and packages
from keras import utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout

In [6]:
# Configuration

DATASET_DIR = '/home/knakul853/Desktop/ml/real_and_fake_face'
TRAIN_DIR = '/home/knakul853/Desktop/ml/train_dataset'
TEST_DIR = '/home/knakul853/Desktop/ml/test_dataset'

RATE = 0.2 # splitting proportion for training and test datasets

# Parameters for Grid Search

N_EPOCHS = [20] #[20, 40, 100, 200]
OPTIMIZERS = ['adam'] #['adam', 'rmsprop', 'SGD']
DROPOUT_RATES =  [0.1, 0.2, 0.4]
LOSS_FUNCTIONS = ['binary_crossentropy']  #['sparse_categorical_crossentropy', 'kullback_leibler_divergence']

# os.mkdir(TRAIN_DIR)
# os.mkdir(TRAIN_DIR+'/fake')
# os.mkdir(TRAIN_DIR+'/real')

os.mkdir(TEST_DIR)
os.mkdir(TEST_DIR+'/fake')
os.mkdir(TEST_DIR+'/real')

In [7]:
import split_folders

In [10]:
files_real = os.listdir(f'{DATASET_DIR}/training_real')
files_fake = os.listdir(f'{DATASET_DIR}/training_fake')

In [13]:
# sample from each class to create a test set
np.random.seed(0)
files_real_test = np.random.choice(
    files_real,
    size=round(len(files_real) * RATE),
    replace=False,
    p=None)

files_real_train = list(set(files_real) - set(files_real_test)) #[file for file in files_real if file not in files_real_test] 

files_fake_test = np.random.choice(
    files_fake,
    size=round(len(files_fake) * RATE),
    replace=False,
    p=None)

files_fake_train = list(set(files_fake) - set(files_fake_test)) #[file for file in files_fake if file not in files_fake_test] 

for file in files_real_train:
    shutil.copyfile(DATASET_DIR+'/training_real/'+file, TRAIN_DIR+'/real/'+file) 

for file in files_fake_train:
    shutil.copyfile(DATASET_DIR+'/training_fake/'+file, TRAIN_DIR+'/fake/'+file) 

for file in files_real_test:
    shutil.copyfile(DATASET_DIR+'/training_real/'+file, TEST_DIR+'/real/'+file) 

for file in files_fake_test:
    shutil.copyfile(DATASET_DIR+'/training_fake/'+file, TEST_DIR+'/fake/'+file) 

In [14]:
train_samples = sum([len(files) for r, d, files in os.walk(TRAIN_DIR)])
test_samples = sum([len(files) for r, d, files in os.walk(TEST_DIR)])
print('Number of training images: {} \nNumber of test images: {}'.format(train_samples, test_samples))

Number of training images: 2358 
Number of test images: 590


In [31]:
# todo : normalize image
def get_images(path, img_shape=(64, 64)):
 
    '''
    Returns a np array of images and labels from path
    Images must be stored in path/class1, path/class2
    '''
    
    main_path = path
    k = 0
    print(main_path)
    y = []
    list = [name for name in os.listdir(main_path) if os.path.isdir(os.path.join(main_path, name))]
    print(list)
    image_collection = []
    for idx,folder in enumerate(list):
 
        label = idx
        
        sub_list = sorted(os.listdir(os.path.join(main_path,folder)))
 
        for i in tqdm(range(1, len(sub_list))):
            image_path = os.path.join(main_path, folder, sub_list[i])
            try:
                read_image = cv2.imread(image_path)
                assert not isinstance(read_image, type(None)), 'image not found'
                
#                scale_percent = 60 # percent of original size
#                 width = int(read_image.shape[1] * scale_percent / 100)
#                 height = int(read_image.shape[0] * scale_percent / 100)
#                 img_shape = (width, height)
                
                image_resized = cv2.resize(read_image, img_shape, interpolation=cv2.INTER_AREA)

                image = np.float32(image_resized)
                image = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F) #Change alpha, beta according to the preprocessing you desire

                image_collection.append(image)

                y.append(label)  # for now just skip the errors 
            except Exception as e:
                print(e)
 
    y = np.array(y)
    y = utils.to_categorical(y,num_classes=len(list))
 
    return image_collection, y[:,0]

In [32]:
# Preparing test and trainng datasets
X_train,y_train = get_images(TRAIN_DIR,img_shape=(64,64))
X_test,y_test = get_images(TEST_DIR,img_shape=(64,64))
X_train = np.array(X_train)
X_test = np.array(X_test)

#todo : handle this skiped data.

  0%|          | 0/323 [00:00<?, ?it/s]

/home/knakul853/Desktop/ml/train_dataset
['real', 'fake']


100%|██████████| 323/323 [00:06<00:00, 52.51it/s]
100%|██████████| 2033/2033 [01:00<00:00, 33.50it/s]
  9%|▉         | 7/80 [00:00<00:01, 63.34it/s]

/home/knakul853/Desktop/ml/test_dataset
['real', 'fake']


100%|██████████| 80/80 [00:01<00:00, 49.68it/s]
100%|██████████| 508/508 [00:11<00:00, 44.44it/s]


In [33]:
print('Training set', X_train.shape)
print('Test set', X_test.shape)

Training set (2356, 64, 64, 3)
Test set (588, 64, 64, 3)


In [35]:
#Shuffle training examples
y_train = np.array(y_train)
y_test = np.array(y_test)
X_train = np.array(X_train)
y_test = np.array(y_test)
X_train, y_train = shuffle(X_train, y_train)


In [36]:
#source towards data sceince
def build_classifier(optimizer, dropout, loss):
    classifier = Sequential() # Initialising the CNN    
    classifier.add(Conv2D(32, (3, 3), input_shape = (64, 64, 3), activation = 'relu')) 
    classifier.add(MaxPooling2D(pool_size = (2, 2))) 
    classifier.add(Dropout(dropout))
    classifier.add(Conv2D(32, (3, 3), activation = 'relu'))  
    classifier.add(MaxPooling2D(pool_size = (2, 2)))
    classifier.add(Dropout(dropout))
    classifier.add(Conv2D(32, (3, 3), activation = 'relu'))  
    classifier.add(MaxPooling2D(pool_size = (2, 2)))
    classifier.add(Dropout(dropout))
    classifier.add(Flatten())
    classifier.add(Dense(units = 128, activation = 'relu'))
    classifier.add(Dense(units = 1, activation = 'sigmoid')) #'tanh'))
    
    classifier.compile(optimizer = optimizer, loss = loss, metrics = ['accuracy'])
    
    return classifier

classifier = KerasClassifier(build_fn = build_classifier)

grid_parameters = {'epochs': N_EPOCHS,
                  'optimizer': OPTIMIZERS,
                  'dropout': DROPOUT_RATES,                  
                  'loss':LOSS_FUNCTIONS                        
                  }


grid_search = GridSearchCV(estimator = classifier,
                           param_grid = grid_parameters,
                           scoring = 'accuracy',
                           cv = 2)


grid_search = grid_search.fit(X_train, y_train)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

In [37]:
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_
print(best_parameters)
print(best_accuracy)

{'dropout': 0.1, 'epochs': 20, 'loss': 'binary_crossentropy', 'optimizer': 'adam'}
0.9711375212224108


In [44]:
predicted = grid_search.predict(X_test)

In [45]:
print('Confusion matrix for training set:')
print(confusion_matrix(y_train,grid_search.predict(X_train)))

Confusion matrix for training set:
[[2013   20]
 [   1  322]]


In [46]:
print(classification_report(y_train,grid_search.predict(X_train)))

              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99      2033
         1.0       0.94      1.00      0.97       323

    accuracy                           0.99      2356
   macro avg       0.97      0.99      0.98      2356
weighted avg       0.99      0.99      0.99      2356



In [47]:
print('Confusion matrix  for test set:')
print(confusion_matrix(y_test,predicted))

Confusion matrix  for test set:
[[501   7]
 [  1  79]]


In [48]:
print(classification_report(y_test,predicted))

              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99       508
         1.0       0.92      0.99      0.95        80

    accuracy                           0.99       588
   macro avg       0.96      0.99      0.97       588
weighted avg       0.99      0.99      0.99       588



In [None]:
e