Dataset being used: https://www.kaggle.com/competitions/dogs-vs-cats/data

In [2]:
import numpy as np
import os
import random
import shutil
import cv2
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

#### Partitioning a small subset of the full dataset

In [3]:
def divide_data(source_dir, dest_dir, prefix, num_files, seed=None):
    """Move a random sample of files from ``source_dir`` into ``dest_dir``.

    Parameters
    ----------
    source_dir : str
        Directory to sample from.
    dest_dir : str
        Directory the sampled files are moved into; created if it is missing.
    prefix : str
        Only files whose name starts with this prefix are candidates
        (``''`` matches every file).
    num_files : int
        Number of files to move.
    seed : int, optional
        Seed for a private random generator so the same files are picked on
        every run. When omitted, the module-level ``random`` state is used.

    Raises
    ------
    ValueError
        If ``num_files`` is negative or larger than the number of matching
        files. Nothing is moved in that case.
    """
    # Sort so the sample depends only on the directory contents and the random
    # state, not on the arbitrary order os.listdir happens to return.
    candidates = sorted(
        name
        for name in os.listdir(source_dir)
        if name.startswith(prefix)
        and os.path.isfile(os.path.join(source_dir, name))
    )
    if not 0 <= num_files <= len(candidates):
        raise ValueError(
            f"Requested {num_files} files with prefix {prefix!r} from "
            f"{source_dir!r}, but only {len(candidates)} are available."
        )

    # shutil.move cannot create the destination folder itself.
    os.makedirs(dest_dir, exist_ok=True)

    rng = random if seed is None else random.Random(seed)
    for name in rng.sample(candidates, num_files):
        shutil.move(os.path.join(source_dir, name), os.path.join(dest_dir, name))

In [4]:
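# One-time setup, kept commented out because divide_data moves (not copies) the files:
# 3200 random images (1600 cats + 1600 dogs) from the train set and 800 random images from the test set.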

# divide_data('data/catsNdogs/train', 'data/useddata/train', 'cat', 1600)
# divide_data('data/catsNdogs/train', 'data/useddata/train', 'dog', 1600)

# divide_data('data/catsNdogs/test', 'data/useddata/test', '', 800)

### Data Preprocessing

In [5]:
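# Labels come from the filename prefix (the text before the first '.'): 'cat' -> 0, anything else -> 1.
# NOTE: this only yields real labels for files named like the train set (cat.* / dog.*);
# a folder whose files are named by ID gets label 1 for every image.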

def load_images(folder, image_size=(100, 100)):
    """Load every file in ``folder`` as a resized grayscale image plus a label.

    The label is taken from the part of the filename before the first dot:
    ``'cat'`` maps to 0 and every other prefix maps to 1.

    NOTE(review): because anything that is not ``'cat'`` becomes 1, files
    without a cat/dog prefix are all labelled 1 - confirm the folder really
    uses ``cat.*`` / ``dog.*`` names before treating the labels as ground truth.

    Parameters
    ----------
    folder : str
        Directory whose entries are all read as images.
    image_size : tuple of int, optional
        Target size passed to ``cv2.resize`` as ``dsize`` (width, height).
        Defaults to ``(100, 100)``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The grayscale images and their integer labels, both in the order
        returned by ``os.listdir(folder)``.

    Raises
    ------
    ValueError
        If an entry in ``folder`` cannot be read as an image.
    """
    images = []
    labels = []
    for filename in os.listdir(folder):
        path = os.path.join(folder, filename)
        img = cv2.imread(path)
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        # Raising (instead of skipping) keeps rows aligned with os.listdir order.
        if img is None:
            raise ValueError(f"Could not read image file: {path}")

        # Same order as before: resize the colour image, then convert to gray.
        img_resized = cv2.resize(img, image_size)
        images.append(cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY))

        labels.append(0 if filename.split('.')[0] == 'cat' else 1)
    return np.array(images), np.array(labels)

In [6]:
# Folders holding the sampled subset of images used by this notebook.
train_dir, test_dir = "data/useddata/train", "data/useddata/test"

In [7]:
# Read both folders into (images, labels) pairs; the train folder is loaded first.
# NOTE(review): load_images derives labels from filename prefixes, so y_test is
# only meaningful if the test files are named cat.* / dog.* - confirm.
(X_train, y_train), (X_test, y_test) = load_images(train_dir), load_images(test_dir)

In [8]:
# Flatten every image into a single feature row: (n_images, n_pixels).
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

In [9]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train_flat)
X_train_scaled = scaler.transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

### Model Training

SVM using Linear kernel

In [10]:
# Linear-kernel SVM trained on the standardized pixel features.
# fit() returns the estimator itself, so construction and training are chained.
svm_model_linear = SVC(kernel='linear', C=1.0, random_state=42).fit(X_train_scaled, y_train)
svm_model_linear

In [11]:
# Predict on the scaled test rows and report the fraction that matches y_test.
# NOTE(review): y_test comes from filename prefixes; confirm the test files are
# named cat.* / dog.* before reading this number as real accuracy.
y_pred_linear = svm_model_linear.predict(X_test_scaled)
accuracy_svm_linear = accuracy_score(y_true=y_test, y_pred=y_pred_linear)
print("SVM Accuracy (Linear Kernel):", accuracy_svm_linear)

SVM Accuracy (Linear Kernel): 0.44875


SVM using Radial Basis Function Kernel

In [12]:
# RBF-kernel SVM trained on the standardized pixel features.
# fit() returns the estimator itself, so construction and training are chained.
svm_model_rbf = SVC(kernel='rbf', C=1.0, gamma='scale').fit(X_train_scaled, y_train)
svm_model_rbf

In [13]:
# Predict on the scaled test rows and report the fraction that matches y_test.
# NOTE(review): y_test comes from filename prefixes; confirm the test files are
# named cat.* / dog.* before reading this number as real accuracy.
y_pred_rbf = svm_model_rbf.predict(X_test_scaled)
accuracy_svm_rbf = accuracy_score(y_true=y_test, y_pred=y_pred_rbf)
print("SVM Accuracy (RBF Kernel):", accuracy_svm_rbf)

SVM Accuracy (RBF Kernel): 0.55


SVM using Sigmoid Kernel

In [14]:
# Sigmoid-kernel SVM trained on the standardized pixel features.
# `degree` is only used by the 'poly' kernel and ignored by all others, so it
# is deliberately not set here.
svm_model_sigmoid = SVC(kernel='sigmoid', C=1.0)
svm_model_sigmoid.fit(X_train_scaled, y_train)

In [15]:
# Predict on the scaled test rows and report the fraction that matches y_test.
# NOTE(review): y_test comes from filename prefixes; confirm the test files are
# named cat.* / dog.* before reading this number as real accuracy.
y_pred_sigmoid = svm_model_sigmoid.predict(X_test_scaled)
accuracy_svm_sigmoid = accuracy_score(y_true=y_test, y_pred=y_pred_sigmoid)
print("SVM Accuracy (Sigmoid Kernel):", accuracy_svm_sigmoid)

SVM Accuracy (Sigmoid Kernel): 0.53875


Exporting predictions to a CSV file

In [16]:
import csv

output_csv_file = "predictions.csv"
y_pred_model = y_pred_rbf

# List the test folder once instead of once per prediction: repeated listing is
# quadratic work and could pair predictions with the wrong file if the folder
# changed mid-loop.
# NOTE: pairing by position assumes os.listdir(test_dir) returns the files in
# the same order as when the test images were loaded.
test_filenames = os.listdir(test_dir)
if len(test_filenames) != len(y_pred_model):
    raise ValueError(
        f"Found {len(test_filenames)} files in {test_dir!r} but "
        f"{len(y_pred_model)} predictions; the folder changed since the images were loaded."
    )

# The ID is the part of the filename before the first dot.
predictions_with_ids = [
    [filename.split('.')[0], int(predicted_label)]
    for filename, predicted_label in zip(test_filenames, y_pred_model)
]

with open(output_csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ID', 'Label'])
    writer.writerows(predictions_with_ids)

print("Predictions saved to:", output_csv_file)


Predictions saved to: predictions.csv
