### 1. Set up data

In [0]:
import copy
import cv2
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import random

In [0]:
# Mount Google Drive (Colab-only API) so the dataset CSVs and the model pickle,
# which are all addressed under /content/drive below, are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Tag that is interpolated into the report banner and the plot titles further down.
model_name = "all"
# CSV that is read into `full_df`; the code below relies on it having a 'train'
# split column and a 'label' column (the variable name says "fake", but the
# frame is used as the full dataset).
fake_df_path = "/content/drive/My Drive/FakeImageDetector/dataset_drive/all_train_test_val.csv"
# File the trained model is pickled to and later re-loaded from.
generated_model_path = "/content/drive/My Drive/FakeImageDetector/lgbm/lgbm_all_updated.pickle"

In [0]:
# Load the dataset CSV. Despite its name, `fake_df_path` is read into `full_df`,
# which is later split by its 'train' column and supplies the 'label' targets.
full_df = pd.read_csv(fake_df_path)

In [0]:
# NOTE(review): `real_df` is deleted a couple of cells below and no active
# (non-commented) code reads it in between, so this CSV load looks redundant —
# confirm and remove together with the matching `del real_df`.
real_df = pd.read_csv('/content/drive/My Drive/FakeImageDetector/dataset_drive/real_train_test_val.csv')

In [0]:
# Partition the full frame into the three subsets named in its 'train' column
# ('train', 'test', 'validation'), then drop the source frame.
full_train = full_df.loc[full_df['train'].eq('train')]
full_test = full_df.loc[full_df['train'].eq('test')]
full_val = full_df.loc[full_df['train'].eq('validation')]
del full_df

In [0]:
# Release the real-only frame; no active code reads it after it is loaded.
del real_df

In [0]:
# For every subset: features are all columns except the first one and the last
# three, targets are the 'label' column. Each frame is dropped right after its
# arrays have been extracted.
# NOTE(review): presumably the first column is an identifier and the last three
# are metadata (including 'label' and 'train') — confirm against the CSV schema.
x_train, y_train = full_train.iloc[:, 1:-3].values, full_train['label'].values
del full_train

x_test, y_test = full_test.iloc[:, 1:-3].values, full_test['label'].values
del full_test

x_val, y_val = full_val.iloc[:, 1:-3].values, full_val['label'].values
del full_val

In [0]:
def shuffle_two_arrays(arr1, arr2, seed=None):
    """Shuffle two arrays with one shared permutation of their first axis.

    Row ``i`` of ``arr1`` and row ``i`` of ``arr2`` end up at the same position
    in the outputs, so feature/label pairs stay aligned. The inputs are not
    modified; indexing with the permutation produces new arrays.

    Args:
        arr1: NumPy array indexed along its first axis.
        arr2: NumPy array with the same length along its first axis as ``arr1``.
        seed: Optional seed for a private ``RandomState``. When ``None``
            (default) NumPy's global RNG is used, exactly as before, so a
            preceding ``np.random.seed(...)`` still controls the result.

    Returns:
        Tuple ``(arr1_shuffled, arr2_shuffled)``.

    Raises:
        ValueError: If the two arrays differ in length. Without this check a
            longer ``arr2`` would be silently truncated to ``len(arr1)`` rows.
    """
    if len(arr1) != len(arr2):
        raise ValueError(
            'arrays must have the same length, got {} and {}'.format(len(arr1), len(arr2))
        )

    if seed is None:
        shuffler = np.random.permutation(len(arr1))
    else:
        # A dedicated generator keeps the global RNG state untouched.
        shuffler = np.random.RandomState(seed).permutation(len(arr1))

    return arr1[shuffler], arr2[shuffler]

In [0]:
# Seed NumPy's global RNG first so the three permutations drawn below are the
# same on every Restart & Run All; the value itself is arbitrary.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)

# Shuffle each subset, keeping every feature row paired with its label.
x_train, y_train = shuffle_two_arrays(x_train, y_train)
x_test, y_test = shuffle_two_arrays(x_test, y_test)
x_val, y_val = shuffle_two_arrays(x_val, y_val)

In [0]:
# Give every subset the shape (n_samples, 40*40*4*3). The reshape raises if a
# row does not hold exactly that many values.
# NOTE(review): 40*40*4*3 presumably stands for 40x40 pixels, 4 face parts and
# 3 colour channels — confirm against the feature-extraction code.
x_train_flat = np.reshape(x_train, (len(x_train), 40 * 40 * 4 * 3))
x_test_flat = np.reshape(x_test, (len(x_test), 40 * 40 * 4 * 3))
x_val_flat = np.reshape(x_val, (len(x_val), 40 * 40 * 4 * 3))

In [0]:
import lightgbm
from sklearn.model_selection import GridSearchCV

# Wrap the flattened matrices in LightGBM datasets. The validation set names the
# training set as its reference, as the LightGBM Dataset documentation asks for
# validation data.
train_dataset = lightgbm.Dataset(x_train_flat, label=y_train)
val_dataset = lightgbm.Dataset(x_val_flat, label=y_val, reference=train_dataset)

parameters = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # A list (not a set) keeps the metric order, and therefore the order in
    # which early stopping inspects the metrics, the same on every run.
    'metric': ['auc', 'binary_logloss'],
    'num_threads': 4,
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Train for at most 2000 rounds and stop once the validation metrics have not
# improved for 100 consecutive rounds. Early stopping is requested through the
# callback API because the `early_stopping_rounds=` keyword of lightgbm.train()
# was removed in LightGBM 4.0; the callback works on older releases as well.
model = lightgbm.train(parameters,
                       train_dataset,
                       valid_sets=[val_dataset],
                       num_boost_round=2000,
                       callbacks=[lightgbm.early_stopping(stopping_rounds=100)])

In [0]:
# cPickle is the Python 2 module name; on Python 3 the import fails and the
# standard pickle module is used instead. Only ImportError is caught so that
# KeyboardInterrupt / SystemExit raised during the import are not swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Persist the trained model to the path configured at the top of the notebook.
with open(generated_model_path, 'wb') as fout:
    pickle.dump(model, fout)

In [0]:
# Load the pickled model back for prediction.
# cPickle is the Python 2 module name; on Python 3 the import fails and the
# standard pickle module is used instead. Only ImportError is caught so that
# KeyboardInterrupt / SystemExit raised during the import are not swallowed.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# SECURITY: pickle.load can execute arbitrary code stored in the file. Only load
# a pickle that this notebook (or another trusted source) wrote.
with open(generated_model_path, 'rb') as fin:
    model = pickle.load(fin)

In [0]:
# The model returns one score per test row; a score of 0.5 or more is mapped to
# class 1, anything lower to class 0. The comparison is vectorised instead of
# looping over the array in Python, and yields an integer array even when the
# test set is empty.
y_test_pred = (model.predict(x_test_flat) >= 0.5).astype(int)

In [50]:
from sklearn.metrics import classification_report

# Banner naming the evaluated model, followed by the per-class precision /
# recall / F1 table computed from the thresholded test predictions.
print('--------------------- {} ---------------------\r\n'.format(model_name))
print(classification_report(y_test, y_test_pred))

--------------------- all ---------------------

              precision    recall  f1-score   support

           0       0.76      0.65      0.70      2210
           1       0.70      0.79      0.74      2236

    accuracy                           0.72      4446
   macro avg       0.73      0.72      0.72      4446
weighted avg       0.73      0.72      0.72      4446



In [51]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the thresholded test predictions; in scikit-learn's
# convention rows are the true labels and columns the predicted labels.
# `cm` is reused by the heatmap cell below.
cm = confusion_matrix(y_test, y_test_pred)
print(cm)

[[1437  773]
 [ 466 1770]]


In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
ax.set_title('Confusion matrix - {}'.format(model_name))

In [0]:
from sklearn.metrics import roc_curve, auc
plt.figure(figsize = (10, 8))
# Raw model scores are kept under their own name. Assigning them to
# `y_test_pred` would overwrite the 0/1 labels that the classification-report
# and confusion-matrix cells expect, breaking those cells when re-run.
y_test_score = model.predict(x_test_flat)
# `recall` holds the true-positive rate returned by roc_curve.
false_positive_rate, recall, thresholds = roc_curve(y_test, y_test_score)
roc_auc = auc(false_positive_rate, recall)
plt.title('ROC: {}'.format(model_name))
plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
plt.legend(loc='lower right')
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.ylabel('Recall')
plt.xlabel('Fall-out (1-Specificity)')
plt.show()
print('AUC score:', roc_auc)


In [0]:
# Importance value per input feature, as reported by the trained model.
features = model.feature_importance()
print(features.shape)
# Average every run of three consecutive importances into a single value.
# NOTE(review): presumably the three values are the colour channels of one
# pixel and the four equal chunks below are the four face parts — this depends
# on how the feature vector was laid out; confirm.
features_mean = features.reshape(-1, 3).mean(axis=1)
print(features_mean.shape)
features_split = np.array_split(features_mean, 4)
labels = ['eye_left', 'eye_right', 'nose', 'lips']
plt.rcParams['figure.figsize'] = (20.0, 16.0)
# One 40x40 greyscale importance map per chunk, laid out side by side.
for i, part_name in enumerate(labels):
    plt.subplot(1, 4, i + 1)
    img_color = features_split[i].reshape((40, 40))
    plt.imshow(img_color, cmap='Greys')
    plt.title(part_name)
    plt.axis('off')