# **Step 1:** Document Classification Model

In [None]:
# Root folder of the dataset; the split folders are created inside it.
original_path = '/content/indian-document-forgery-detection/dataset'

# One destination folder per split, all directly under the dataset root.
train_path, val_path, test_path = (
    os.path.join(original_path, split_name)
    for split_name in ('train', 'val', 'test')
)

# Make sure every split folder exists (no error if it is already there).
for path in (train_path, val_path, test_path):
    os.makedirs(path, exist_ok=True)

In [None]:
## CLASSES AND PER-CLASS SAMPLE SIZES ##

## Names of the class sub-folders in the dataset
classes = ["ID", "Not_ID"]

## Number of samples copied per class into the training, validation,
## and testing splits respectively
train_s, validation_s, test_s = 210, 60, 30

In [None]:
## FUNCTION TO COPY RANDOM SAMPLES ##

def copy_random_samples(src_dir, dest_dir, samples):
    """Copy ``samples`` randomly chosen files from ``src_dir`` into ``dest_dir``.

    Only regular files are candidates: sub-directories inside ``src_dir``
    are skipped because ``shutil.copy`` cannot copy a directory.

    Args:
        src_dir: Directory to draw files from.
        dest_dir: Existing directory that receives the copies.
        samples: Number of files to copy.

    Returns:
        The list of file names that were copied.

    Raises:
        ValueError: If ``src_dir`` holds fewer than ``samples`` regular files
            (or ``samples`` is negative).
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the draw
    # reproducible when the random seed is fixed.
    candidates = sorted(
        name for name in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, name))
    )
    if samples > len(candidates):
        raise ValueError(
            f"Requested {samples} samples but {src_dir!r} only contains "
            f"{len(candidates)} files"
        )

    chosen = random.sample(candidates, samples)
    for name in chosen:
        shutil.copy(os.path.join(src_dir, name), os.path.join(dest_dir, name))
    return chosen

## COPY DISJOINT SAMPLES FOR TRAINING, VALIDATION, AND TESTING ##

## All samples of a class are drawn in a single call and then sliced, so no
## image can end up in more than one split. Drawing each split independently
## from the same folder would let validation/test images also appear in the
## training set and inflate the reported metrics.
split_sizes = [(train_path, train_s), (val_path, validation_s), (test_path, test_s)]
total_needed = train_s + validation_s + test_s

for class_name in classes:
    src_class_dir = os.path.join(original_path, class_name)

    ## Only regular files are candidates; sorting removes the dependence on
    ## the arbitrary order of os.listdir when the random seed is fixed.
    available = sorted(
        name for name in os.listdir(src_class_dir)
        if os.path.isfile(os.path.join(src_class_dir, name))
    )
    if len(available) < total_needed:
        raise ValueError(
            f"Class {class_name!r} has {len(available)} files but "
            f"{total_needed} are needed for disjoint train/val/test splits"
        )
    selected = random.sample(available, total_needed)

    ## Hand out consecutive, non-overlapping slices of the draw to each split
    offset = 0
    for split_dir, split_size in split_sizes:
        dest_class_dir = os.path.join(split_dir, class_name)
        os.makedirs(dest_class_dir, exist_ok=True)
        for name in selected[offset:offset + split_size]:
            shutil.copy(
                os.path.join(src_class_dir, name),
                os.path.join(dest_class_dir, name),
            )
        offset += split_size

In [None]:
# Absolute locations of the three split folders.
train_path = "/content/indian-document-forgery-detection/dataset/train"
val_path = "/content/indian-document-forgery-detection/dataset/val"
test_path = "/content/indian-document-forgery-detection/dataset/test"

In [None]:
## Function to count the data instances in a category folder
def count_images_in_category(category_dir):
    """Return the number of regular files directly inside ``category_dir``.

    Sub-directories are not counted, so a stray folder inside the category
    does not inflate the reported number of images.

    Args:
        category_dir: Path of the class folder to inspect.

    Returns:
        The count of regular files in the folder (0 for an empty folder).
    """
    return sum(
        1
        for name in os.listdir(category_dir)
        if os.path.isfile(os.path.join(category_dir, name))
    )

## Report, for every class, how many entries its training folder holds
for class_name in classes:
    train_category = os.path.join(train_path, class_name)
    count = count_images_in_category(train_category)
    print(f"Number of images in {class_name} training category: {count}")

In [None]:
## Report, for every class, how many entries its testing folder holds
for class_name in classes:
    test_category = os.path.join(test_path, class_name)
    count = count_images_in_category(test_category)
    print(f"Number of images in {class_name} category (Testing): {count}")

In [None]:
## Report, for every class, how many entries its validation folder holds
for class_name in classes:
    val_category = os.path.join(val_path, class_name)
    count = count_images_in_category(val_category)
    print(f"Number of images in {class_name} category (Validation): {count}")

In [None]:
# Two convolution/pooling stages followed by a dense head with a single
# sigmoid output.
model = Sequential(
    [
        # 32 filters of size 3x3; input_shape is the image size the model expects
        Conv2D(filters=32, kernel_size=(3, 3), activation="relu", input_shape=(244, 244, 3)),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(filters=32, kernel_size=(3, 3), activation="relu"),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        # 64 neurons
        Dense(units=64, activation='relu'),
        # 1 output neuron
        Dense(units=1, activation='sigmoid'),
    ]
)
model.summary()

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator


# Training images are rescaled to [0, 1] and randomly augmented
# (small rotations, zoom, horizontal and vertical flips).
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=5,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
)

# Validation and test images are only rescaled, never augmented.
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Batches of 244x244 images with binary labels read from the training folder.
train_generator = train_datagen.flow_from_directory(
    train_path,
    class_mode='binary',
    batch_size=32,
    target_size=(244, 244),
)

# Same settings for the validation folder.
val_generator = val_datagen.flow_from_directory(
    val_path,
    class_mode='binary',
    batch_size=32,
    target_size=(244, 244),
)

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Train the model and save the training history.
# len(generator) is the number of batches needed to cover every sample,
# including a final partial batch. Computing the step count with floor
# division (samples // batch_size) would skip that last batch each epoch and
# would yield 0 steps for a split smaller than one batch.
history = model.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10,  # You can adjust the number of epochs
    validation_data=val_generator,
    validation_steps=len(val_generator),
)

In [None]:
# Learning curve: training vs. validation accuracy for each epoch
plt.plot(history.history['accuracy'], label='Train')
plt.plot(history.history['val_accuracy'], label='Validation')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
plt.show()

In [None]:
# Iterator over the test folder; shuffling is disabled so batches follow
# the order of the files on disk.
test_generator = test_datagen.flow_from_directory(
    test_path,
    shuffle=False,
    class_mode='binary',
    batch_size=32,
    target_size=(244, 244),
)

# evaluate() returns the loss first, followed by the compiled metrics.
results = model.evaluate(test_generator)
print("Test Loss:", results[0])
print("Test Accuracy:", results[1])

In [None]:
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
# Fresh, unshuffled iterator so that the prediction order lines up with
# test_generator.classes.
test_generator = test_datagen.flow_from_directory(
    test_path,
    target_size=(244, 244),
    batch_size=32,
    class_mode='binary',
    shuffle=False)

# Sigmoid outputs above 0.5 are assigned to class index 1.
y_pred = model.predict(test_generator)
y_pred_classes = (y_pred > 0.5).astype(int)

y_true = test_generator.classes

# Take the tick labels from the generator's own name -> index mapping, ordered
# by index, so the axes always agree with the label encoding actually used.
class_labels = sorted(test_generator.class_indices,
                      key=test_generator.class_indices.get)

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_true, y_pred_classes), annot=True, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Show every test image together with its true and predicted class index.
filenames = test_generator.filenames
for filename, label in zip(filenames, y_true):
    # Load and preprocess the image the same way as the generators:
    # resize to 244x244, add a batch axis, rescale to [0, 1].
    img_path = os.path.join(test_path, filename)
    img = image.load_img(img_path, target_size=(244, 244))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array /= 255.0

    # prediction has shape (1, 1); index the scalar explicitly instead of
    # passing a 1-element array to int(), which NumPy has deprecated.
    # The > 0.5 rule matches the threshold used for the confusion matrix.
    prediction = model.predict(img_array)
    predicted_class = int(prediction[0, 0] > 0.5)

    true_class = int(label)

    plt.imshow(img)
    plt.title(f"True Class: {true_class}, Predicted Class: {predicted_class}")
    plt.axis('off')
    plt.show()