# Processing the image data and writing the histogram features to CSV files

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import cv2
import time
import pickle

In [2]:
def calc_hist(img):
    """Compute a max-normalised 256-bin histogram for the first three channels of ``img``.

    Each channel's histogram is rescaled so that its tallest bin equals 255.

    Parameters
    ----------
    img : image array accepted by ``cv2.calcHist`` with at least three channels.

    Returns
    -------
    numpy.ndarray
        Array with one row per bin and one column per channel (256 x 3).
    """
    def _scaled_channel_hist(channel):
        # One histogram per call: 256 bins over the value range [0, 256).
        counts = cv2.calcHist([img], [channel], None, [256], [0, 256])
        # Rescale in place so the largest bin becomes 255.
        counts *= 255.0 / counts.max()
        return counts

    per_channel = np.array([_scaled_channel_hist(channel) for channel in range(3)])
    # calcHist yields a (256, 1) column per channel, so the stack is (3, 256, 1);
    # transposing and taking index 0 drops the singleton axis -> (256, 3).
    return per_channel.T[0]

In [3]:
def get_hist(image_path):
    """Load an image and compute its YCrCb and LUV colour histograms.

    Parameters
    ----------
    image_path : str
        Path of the image file to read with ``cv2.imread``.

    Returns
    -------
    tuple
        ``(ycrcb_hist, luv_hist, combined)`` where the first two are the
        per-colour-space results of ``calc_hist`` and ``combined`` is the
        flattened YCrCb histogram followed by the flattened LUV histogram.

    Raises
    ------
    FileNotFoundError
        If ``image_path`` does not point to an existing file.
    ValueError
        If the file exists but OpenCV cannot decode it as an image.
    """
    # read the image
    img = cv2.imread(image_path)

    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque assertion error inside cvtColor.
    if img is None:
        if not os.path.isfile(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path!r}")
        raise ValueError(f"OpenCV could not decode image: {image_path!r}")

    # convert to the two colour spaces (imread returns BGR) and histogram each
    ycrcb_hist = calc_hist(cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb))
    luv_hist = calc_hist(cv2.cvtColor(img, cv2.COLOR_BGR2Luv))

    # single flat feature vector: YCrCb values first, then LUV values
    combined = np.append(ycrcb_hist.ravel(), luv_hist.ravel())

    return ycrcb_hist, luv_hist, combined

In [10]:
def _labelled_histograms(folder_path, label):
    """Return ``[hist, label]`` rows for every '.png' file in ``folder_path``."""
    rows = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order reproducible across runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.endswith('.png'):
            _, _, hist = get_hist(os.path.join(folder_path, file_name))
            rows.append([hist, label])
    return rows


def read_pictures(folder_path1, folder_path2):
    """Build a labelled histogram table from two folders of PNG images.

    Parameters
    ----------
    folder_path1 : str
        Folder whose '.png' files receive label ``1``.
    folder_path2 : str
        Folder whose '.png' files receive label ``-1``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'hist'`` (the combined feature vector from ``get_hist``) and
        ``'label'``. Rows from ``folder_path1`` come first, then rows from
        ``folder_path2``, each group in sorted filename order. Files without a
        '.png' suffix are ignored.
    """
    hist_list = (
        _labelled_histograms(folder_path1, 1)
        + _labelled_histograms(folder_path2, -1)
    )

    # store the histograms in a pandas dataframe
    return pd.DataFrame(hist_list, columns=['hist', 'label'])

In [11]:
# Small dataset: files in the first folder get label 1, files in the second get label -1.
small_df = read_pictures(
    folder_path1='Small_Data/Actual_Pictures',
    folder_path2='Small_Data/Replay_Attack',
)
print(small_df.head())

# Big dataset, built the same way.
big_df = read_pictures(
    folder_path1='Big_Data/Actual_Pictures',
    folder_path2='Big_Data/Replay_Attack',
)
print(big_df.head())

                                                hist  label
0  [0.29209623, 0.0, 0.0, 3.5051546, 0.0, 0.0, 5....      1
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...      1
2  [0.0, 0.0, 0.0, 8.525836, 0.0, 0.0, 10.851064,...      1
3  [0.0, 0.0, 0.0, 0.2802198, 0.0, 0.0, 0.2802198...      1
4  [1.2927756, 0.0, 0.0, 13.25095, 0.0, 0.0, 19.7...      1
                                                hist  label
0  [0.29209623, 0.0, 0.0, 3.5051546, 0.0, 0.0, 5....      1
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...      1
2  [0.0, 0.0, 0.0, 8.525836, 0.0, 0.0, 10.851064,...      1
3  [0.0, 0.0, 0.0, 0.2802198, 0.0, 0.0, 0.2802198...      1
4  [1.2927756, 0.0, 0.0, 13.25095, 0.0, 0.0, 19.7...      1


In [12]:
def df_to_csv(df, filename):
    """Expand a histogram DataFrame into one column per feature and save it as CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds one feature sequence per row; second column holds
        the label. All feature sequences are expected to have the same length.
    filename : str
        Destination CSV path. If it already exists nothing is written.

    Returns
    -------
    None

    Notes
    -----
    The output has columns ``feature1 .. featureN`` followed by ``label``,
    where N is taken from the first row's feature length. For an empty ``df``
    N falls back to 1536 (2 colour spaces x 3 channels x 256 bins, the length
    produced by ``get_hist``).
    """
    if os.path.exists(filename):
        print(f"{filename} already exists. Exiting without writing.")
        return

    feature_rows = df.iloc[:, 0].tolist()
    # Derive the width from the data instead of assuming 1536 features.
    n_features = len(feature_rows[0]) if feature_rows else 1536

    # create a new dataframe with one column per feature
    new_df = pd.DataFrame(
        feature_rows,
        columns=[f'feature{i}' for i in range(1, n_features + 1)],
    )

    # Assign labels positionally: new_df has a fresh 0..n-1 index, so assigning
    # the Series directly would align on df's index and could yield NaN labels.
    new_df['label'] = df.iloc[:, 1].to_numpy()

    # write the new dataframe to a csv file
    new_df.to_csv(filename, index=False)

    print(f"{filename} saved successfully.")


In [13]:
# Export both histogram tables; df_to_csv skips any file that already exists.
df_to_csv(df=small_df, filename='small_histograms.csv')
df_to_csv(df=big_df, filename='big_histograms.csv')

small_histograms.csv saved successfully.
big_histograms.csv saved successfully.


In [2]:
from sklearn.model_selection import train_test_split
def shuffle_and_split_csv(input_file, train_file, test_file, test_size=0.2, random_state=42):
    """Shuffle the rows of a CSV file and write train/test splits to two new CSV files.

    Parameters
    ----------
    input_file : str
        Path of the CSV file to read.
    train_file : str
        Path the training split is written to.
    test_file : str
        Path the testing split is written to.
    test_size : float, optional
        Passed to ``train_test_split`` as ``test_size`` (default 0.2).
    random_state : int, optional
        Seed used for both the shuffle and the split so repeated runs give the
        same files (default 42, the seed the split has always used).

    Returns
    -------
    None
    """
    # read the csv file
    df = pd.read_csv(input_file)

    # shuffle the dataframe; seeded, otherwise the fixed seed on the split
    # below would not make the output reproducible
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # split the dataframe into training and testing sets
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)

    # write the training and testing sets to csv files
    train.to_csv(train_file, index=False)
    test.to_csv(test_file, index=False)

    # report the files actually written rather than fixed placeholder names
    print(f"Data shuffled and split into {train_file} and {test_file}")


In [16]:
# 70/30 train/test split of both histogram CSV files.
shuffle_and_split_csv(
    input_file='small_histograms.csv',
    train_file='small_training.csv',
    test_file='small_testing.csv',
    test_size=0.3,
)
shuffle_and_split_csv(
    input_file='big_histograms.csv',
    train_file='big_training.csv',
    test_file='big_testing.csv',
    test_size=0.3,
)

Data shuffled and split into training.csv and testing.csv
Data shuffled and split into training.csv and testing.csv


In [3]:
# NOTE(review): small_bsif.csv is not written by any cell shown in this notebook;
# presumably it is produced elsewhere and must already exist on disk - confirm.
shuffle_and_split_csv(
    input_file='small_bsif.csv',
    train_file='small_bsif_training.csv',
    test_file='small_bsif_testing.csv',
    test_size=0.3,
)

Data shuffled and split into training.csv and testing.csv
