# Data Preprocessing

Importing Libraries

In [29]:
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

Loading the dataset

In [30]:
dataset = pd.read_csv('features_vgg_19.csv', on_bad_lines='skip')
X = dataset.iloc[ 1 : , 1 : ].values
y = dataset.iloc[ 1 : , 0 ].values
dataset

Unnamed: 0,category,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_1490,feature_1491,feature_1492,feature_1493,feature_1494,feature_1495,feature_1496,feature_1497,feature_1498,feature_1499
0,NeuralTextures,-62.42796,-511.979250,-144.293640,16.581331,437.121250,375.084750,585.748960,-132.477250,-424.689880,...,-0.898269,-0.132740,0.033466,0.537707,-0.356337,-0.677848,-0.033449,0.096043,0.790820,-0.256042
1,NeuralTextures,-60.10206,-511.612400,-144.567640,13.793102,436.082820,379.226750,589.281800,-134.002880,-425.063260,...,0.610182,0.344779,-0.252257,-0.712681,0.521353,0.052618,-0.272742,0.202274,0.062227,0.207085
2,NeuralTextures,283.52426,-86.484535,-72.358025,-285.846250,205.587170,6.353247,-96.631210,124.412570,25.982288,...,0.141574,0.913710,-0.973437,-0.599650,0.486635,1.609857,0.029285,-0.226419,0.210444,-0.270626
3,NeuralTextures,-109.37252,-132.780460,41.090828,-102.016180,-312.326750,-235.344730,-600.219100,57.521378,-397.795440,...,0.086565,-0.193849,0.075021,0.140493,-0.041655,-0.102646,0.004213,0.137947,-0.080017,-0.125026
4,NeuralTextures,-114.96796,-133.710040,33.637596,-102.143940,-310.961850,-233.476240,-599.182200,54.451088,-397.724400,...,-0.066966,0.210792,-0.065908,-0.111011,0.071677,0.206578,-0.012744,-0.148091,0.236905,0.084431
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Deepfakes,-327.61316,314.563320,-270.090100,260.339100,60.712418,-103.131340,88.905610,57.192635,201.617610,...,0.498902,-1.078514,-0.429026,-0.386816,0.157793,0.269086,0.408329,-0.520630,-0.660197,-0.370723
1596,Deepfakes,-330.69678,323.606140,-273.277650,249.188050,58.336710,-103.883125,86.551610,58.541325,202.249970,...,-0.747822,1.099753,0.780004,1.380206,0.409219,-0.595222,-0.648377,0.776439,0.499211,0.368870
1597,Deepfakes,-333.59818,323.090400,-275.059480,245.257460,56.540356,-104.844250,83.488710,60.533146,202.259400,...,0.300279,-0.294824,-0.156819,-0.642119,0.001807,0.060356,0.164309,-0.404485,-0.008862,-0.116171
1598,Deepfakes,-334.77840,310.678700,-275.875270,258.262850,57.422730,-106.041430,84.126040,56.433655,199.466740,...,0.338990,0.900440,-0.536104,-0.069106,-0.583624,-0.579611,0.496215,-0.443521,0.013521,0.535858


In [31]:
print(X)
print(y)

[[-6.01020600e+01 -5.11612400e+02 -1.44567640e+02 ...  2.02273850e-01
   6.22268020e-02  2.07084940e-01]
 [ 2.83524260e+02 -8.64845350e+01 -7.23580250e+01 ... -2.26418850e-01
   2.10443820e-01 -2.70626070e-01]
 [-1.09372520e+02 -1.32780460e+02  4.10908280e+01 ...  1.37947480e-01
  -8.00173600e-02 -1.25025870e-01]
 ...
 [-3.33598180e+02  3.23090400e+02 -2.75059480e+02 ... -4.04485230e-01
  -8.86172100e-03 -1.16170526e-01]
 [-3.34778400e+02  3.10678700e+02 -2.75875270e+02 ... -4.43521320e-01
   1.35212540e-02  5.35857560e-01]
 [-3.40661220e+02  3.11118130e+02 -2.78083280e+02 ...  4.72801450e-01
   9.26933900e-02 -5.24574040e-01]]
['NeuralTextures' 'NeuralTextures' 'NeuralTextures' ... 'Deepfakes'
 'Deepfakes' 'Deepfakes']


Split the dataset into training set and test set.

In [32]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size=0.80, test_size=0.20, random_state=101)

Initialise the 5-Fold Cross Validation

In [33]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Support Vector Machine

Predict on Test Set.

In [34]:
poly_accuracy, rbf_accuracy = 0, 0

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    poly = svm.SVC(kernel='poly', degree=10, C=1).fit(X_train, y_train)
    rbf = svm.SVC(kernel='rbf', gamma=1, C=1).fit(X_train, y_train)
    
    poly_pred = poly.predict(X_test)
    rbf_pred = rbf.predict(X_test)
    
    poly_accuracy += accuracy_score(y_test, poly_pred)
    rbf_accuracy += accuracy_score(y_test, rbf_pred)


Outcomes.

In [35]:
print('Accuracy (Polynomial Kernel): ', "%.2f" % (poly_accuracy/5*100) + " %")
print('Accuracy (RBF Kernel): ', "%.2f" % (rbf_accuracy/5*100) + " %")

poly_f1 = f1_score(y_test, poly_pred, average='weighted')
print('F1 score (Polynomial Kernel): ', "%.2f" % (poly_f1*100))
rbf_f1 = f1_score(y_test, rbf_pred, average='weighted')
print('F1 score (RBF Kernel): ', "%.2f" % (rbf_f1*100))

print('Confusion Matrix (Polynomial Kernel): \n', confusion_matrix(y_test, poly_pred))
print('Confusion Matrix (RBF Kernel): \n',confusion_matrix(y_test, rbf_pred))

Accuracy (Polynomial Kernel):  58.79 %
Accuracy (RBF Kernel):  22.14 %
F1 score (Polynomial Kernel):  57.97
F1 score (RBF Kernel):  8.10
Confusion Matrix (Polynomial Kernel): 
 [[43  0 48  1]
 [ 1 33 34 11]
 [ 0  0 77  0]
 [ 5  5 29 32]]
Confusion Matrix (RBF Kernel): 
 [[ 0  0  0 92]
 [ 0  0  0 79]
 [ 0  0  0 77]
 [ 0  0  0 71]]


# Random Forest

Importing Libraries

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

Define the Classifier

In [37]:
classifier = RandomForestClassifier(n_estimators = 500, criterion = 'entropy', random_state = 42)

Predicting Test Set Results

In [38]:
rf_accuracy = 0

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    
    rf_accuracy += accuracy_score(y_test, y_pred)
    

Outcomes

In [39]:
rf_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy (Random Forest): ', "%.2f" % (rf_accuracy/5*100) + " %")
print('F1 (Random Forest): ', "%.2f" % (rf_f1*100))
print("Confustion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred))

Accuracy (Random Forest):  87.49 %
F1 (Random Forest):  90.60
Confustion Matrix (Random Forest):
 [[84  0  7  1]
 [ 0 66  0 13]
 [ 1  0 76  0]
 [ 2  4  2 63]]


# Decision Tree

Importing Libraries

In [40]:
from sklearn.tree import DecisionTreeClassifier

Define the Classifier

In [41]:
clf = DecisionTreeClassifier()

Predicting Test Set Results

In [42]:
dt_accuracy = 0

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    dt_accuracy += accuracy_score(y_test, y_pred)

Outcomes

In [43]:
dt_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy (Decision Tree): ', "%.2f" % (dt_accuracy/5*100) + " %")
print('F1 (Decision Tree): ', "%.2f" % (dt_f1*100))
print("Confustion Matrix (Decision Tree):\n", confusion_matrix(y_test, y_pred))

Accuracy (Decision Tree):  60.48 %
F1 (Decision Tree):  61.50
Confustion Matrix (Decision Tree):
 [[65  8  8 11]
 [13 44  5 17]
 [15  9 44  9]
 [ 4 19  5 43]]


# MultiClass Logistic Regression

Import Libraries

In [44]:
from sklearn.linear_model import LogisticRegression

Define Classifier.

In [45]:
classifier = LogisticRegression(multi_class='auto', solver='sag')


Predicting Test Set Results.

In [46]:
lr_accuracy = 0

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    classifier.fit(X_train, y_train)
    
    y_pred = classifier.predict(X_test)
    
    lr_accuracy += accuracy_score(y_test, y_pred)




Outcomes

In [47]:
lr_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy (Logistic Regression): ', "%.2f" % (lr_accuracy/5*100) + " %")
print('F1 (Logistic Regression): ', "%.2f" % (lr_f1*100))
print("Confusion Matrix (Logistic Regression):\n", confusion_matrix(y_test, y_pred))

Accuracy (Logistic Regression):  95.12 %
F1 (Logistic Regression):  95.62
Confusion Matrix (Logistic Regression):
 [[89  0  2  1]
 [ 0 75  0  4]
 [ 2  0 74  1]
 [ 0  4  0 67]]


# KNN

Importing Libraries

In [48]:
from sklearn.neighbors import KNeighborsClassifier

Define the Classifier

In [49]:
knn = KNeighborsClassifier(n_neighbors=5)

Predicting Test Set Results

In [50]:
knn_accuracy = 0

for train_index, test_index in kf.split(X):
    
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    knn.fit(X_train, y_train)
    
    y_pred = knn.predict(X_test)
    
    knn_accuracy += accuracy_score(y_test, y_pred)

Outcomes

In [51]:
knn_f1 = f1_score(y_test, y_pred, average='weighted')
print('Accuracy (KNN): ', "%.2f" % (knn_accuracy/5*100) + " %")
print('F1 (KNN): ', "%.2f" % (knn_f1*100))
print("Confustion Matrix (KNN):\n", confusion_matrix(y_test, y_pred))

Accuracy (KNN):  78.42 %
F1 (KNN):  76.23
Confustion Matrix (KNN):
 [[78  3  9  2]
 [ 1 54  2 22]
 [10  3 63  1]
 [ 3 19  1 48]]
