In [1]:
# install the open cv library
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.7.0.72-cp37-abi3-win_amd64.whl (38.2 MB)
     --------------------------------------- 38.2/38.2 MB 32.7 MB/s eta 0:00:00
Installing collected packages: opencv-python
Successfully installed opencv-python-4.7.0.72


In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.5-py3-none-win_amd64.whl (70.9 MB)
     --------------------------------------- 70.9/70.9 MB 23.4 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-1.7.5


In [3]:
# importing the packages
import pandas as pd
import os
from PIL import Image
import cv2
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [4]:
# there are approx 5 features present overall across 75000 images
features_dataframe = pd.read_csv("final_dataset_containing_5_extracted_features.csv")

In [5]:
features_dataframe.shape

(72986, 12)

In [6]:
# viewing the features_dataframe
features_dataframe.head()

Unnamed: 0,file_name,feature_x_1,feature_x_2,feature_x_3,feature_x_4,feature_x_5,feature_y_1,feature_y_2,feature_y_3,feature_y_4,feature_y_5,is_gan
0,00000.png,125,101.0,73.0,,,89,130.0,87.0,,,No
1,000004.png,127,75.0,,,,90,87.0,,,,Yes
2,00001.png,126,75.0,76.0,,,91,90.0,139.0,,,No
3,000014.png,130,77.0,,,,89,91.0,,,,Yes
4,000016.png,127,173.0,74.0,,,90,61.0,91.0,,,Yes


In [7]:
# performing shuffling on features_dataframe
features_dataframe = features_dataframe.sample(frac=1)
features_dataframe.head()

Unnamed: 0,file_name,feature_x_1,feature_x_2,feature_x_3,feature_x_4,feature_x_5,feature_y_1,feature_y_2,feature_y_3,feature_y_4,feature_y_5,is_gan
43801,23257.png,78,124.0,,,,85,89.0,,,,No
63517,44023.png,78,123.0,,,,91,91.0,,,,No
42542,21935.png,76,127.0,88.0,115.0,,89,90.0,145.0,144.0,,No
44142,23618.png,57,123.0,102.0,159.0,,91,88.0,145.0,86.0,,No
55795,35909.png,78,113.0,123.0,88.0,,88,145.0,89.0,22.0,,No


In [8]:
features_dataframe = features_dataframe.fillna(-1)

In [9]:
# normalizing the data
is_gan = features_dataframe["is_gan"] 
features_dataframe.drop(columns=["file_name", "is_gan"], inplace=True)

In [10]:
standard_scalar = StandardScaler()
features_dataframe = standard_scalar.fit_transform(features_dataframe)
features_dataframe = pd.DataFrame(features_dataframe)
features_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.196018,0.985635,-0.923653,-0.432465,-0.189726,-0.492777,0.127754,-0.899031,-0.420531,-0.183362
1,-1.196018,0.959388,-0.923653,-0.432465,-0.189726,-0.127320,0.185082,-0.899031,-0.420531,-0.183362
2,-1.277176,1.064378,0.843687,2.658660,-0.189726,-0.249139,0.156418,1.491823,2.830785,-0.183362
3,-2.048178,0.959388,1.121696,3.831156,-0.189726,-0.127320,0.099089,1.491823,1.530259,-0.183362
4,-1.196018,0.696914,1.538709,1.939174,-0.189726,-0.310048,1.732958,0.574783,0.095195,-0.183362
...,...,...,...,...,...,...,...,...,...,...
72981,0.792355,-0.169251,0.565679,-0.432465,-0.189726,-0.249139,1.532307,0.558408,-0.420531,-0.183362
72982,0.751776,-0.221746,-0.923653,-0.432465,-0.189726,-0.127320,0.185082,-0.899031,-0.420531,-0.183362
72983,0.751776,0.486934,-0.923653,-0.432465,-0.189726,-0.249139,1.704293,-0.899031,-0.420531,-0.183362
72984,0.711197,-0.221746,0.704683,2.632012,-0.189726,-0.249139,0.213747,1.409944,2.853208,-0.183362


In [11]:
features_dataframe["is_gan"] = is_gan
features_dataframe["is_gan"] = features_dataframe["is_gan"].map(dict(Yes=1, No=0))
features_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,is_gan
0,-1.196018,0.985635,-0.923653,-0.432465,-0.189726,-0.492777,0.127754,-0.899031,-0.420531,-0.183362,0
1,-1.196018,0.959388,-0.923653,-0.432465,-0.189726,-0.127320,0.185082,-0.899031,-0.420531,-0.183362,1
2,-1.277176,1.064378,0.843687,2.658660,-0.189726,-0.249139,0.156418,1.491823,2.830785,-0.183362,0
3,-2.048178,0.959388,1.121696,3.831156,-0.189726,-0.127320,0.099089,1.491823,1.530259,-0.183362,1
4,-1.196018,0.696914,1.538709,1.939174,-0.189726,-0.310048,1.732958,0.574783,0.095195,-0.183362,1
...,...,...,...,...,...,...,...,...,...,...,...
72981,0.792355,-0.169251,0.565679,-0.432465,-0.189726,-0.249139,1.532307,0.558408,-0.420531,-0.183362,0
72982,0.751776,-0.221746,-0.923653,-0.432465,-0.189726,-0.127320,0.185082,-0.899031,-0.420531,-0.183362,0
72983,0.751776,0.486934,-0.923653,-0.432465,-0.189726,-0.249139,1.704293,-0.899031,-0.420531,-0.183362,0
72984,0.711197,-0.221746,0.704683,2.632012,-0.189726,-0.249139,0.213747,1.409944,2.853208,-0.183362,0


In [12]:
features_dataframe.isna().sum()

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
is_gan    0
dtype: int64

In [13]:
x = features_dataframe
y = features_dataframe["is_gan"]
x.drop(columns=["is_gan"], inplace=True)

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

In [15]:
f"Shape of training data = {x_train.shape[0]} and testing data = {x_test.shape[0]}"

'Shape of training data = 51090 and testing data = 21896'

## Performing experiments using the following hyperparameters to find the best model

In [16]:
# Performing Kfold cross validation
decision_tree_pipeline = Pipeline([("classifier", DecisionTreeClassifier())])
param_grid = {'classifier__criterion' : ["gini", "entropy", "log_loss"],
             'classifier__splitter' : ["best", "random"],
             'classifier__max_depth' : [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
             'classifier__max_features': ["auto", "sqrt", "log2"]
             }

decision_tree_pipeline

In [17]:
gridsearch_cv = GridSearchCV(decision_tree_pipeline, param_grid=param_grid, cv=10, verbose=2, n_jobs=-1)

In [18]:
gridsearch_cv.fit(x_train, y_train)

Fitting 10 folds for each of 342 candidates, totalling 3420 fits


In [19]:
gridsearch_cv.best_params_

{'classifier__criterion': 'entropy',
 'classifier__max_depth': 2,
 'classifier__max_features': 'sqrt',
 'classifier__splitter': 'best'}

## Training the model on the above best parameters

In [20]:
decision_tree_model = DecisionTreeClassifier(criterion="entropy", max_depth=2, max_features="sqrt", splitter="best")
decision_tree_model.fit(x_train, y_train)

In [21]:
decision_tree_predictions = decision_tree_model.predict(x_test)

In [22]:
cm = metrics.confusion_matrix(y_test, decision_tree_predictions)
print(f"The confusion matrix for Decision Tree Classifier is given below: \n {cm}")

The confusion matrix for Decision Tree Classifier is given below: 
 [[14908     0]
 [ 6988     0]]


In [23]:
dt_precision = precision_score(y_test, decision_tree_predictions, average='weighted', labels=np.unique(decision_tree_predictions))
dt_precision

0.6808549506759225

In [24]:
dt_recall = recall_score(y_test, decision_tree_predictions, average='weighted', labels=np.unique(decision_tree_predictions))
dt_recall

1.0

In [25]:
dt_f1 = f1_score(y_test, decision_tree_predictions, average='weighted', labels=np.unique(decision_tree_predictions))
dt_f1

0.8101293337680686

In [26]:
# storing the deci model
filename = 'decision_tree_final.pkl'
pickle.dump(decision_tree_model, open(filename, 'wb'))