**Dog Breed Identification**

Importing dependencies

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import LabelBinarizer

Load and Explore labels.csv

In [27]:
label_df=pd.read_csv('/content/labels.csv')

In [28]:
# first 5 rows
label_df.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [29]:
label_df.shape

(10222, 2)

In [30]:
label_df['breed'].value_counts()      #number of images available for each breed

Unnamed: 0_level_0,count
breed,Unnamed: 1_level_1
scottish_deerhound,126
maltese_dog,117
afghan_hound,116
entlebucher,115
bernese_mountain_dog,114
...,...
golden_retriever,67
komondor,67
brabancon_griffon,67
eskimo_dog,66


In [31]:
label_df['breed'].nunique()  #  counts the number of unique dog breeds

120

Encode Breed Labels to One-Hot

In [33]:
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
label_df['breed_vec'] = list(lb.fit_transform(label_df['breed']))
y = lb.transform(label_df['breed'])


Add File Paths to DataFrame

In [34]:
import os

# Check how many of the images in the DataFrame actually exist
label_df['file_path'] = label_df['id'].apply(lambda x: f"/content/train/{x}.jpg")
label_df['exists'] = label_df['file_path'].apply(os.path.exists)

# How many files are found?
print("Total:", len(label_df))
print("Found:", label_df['exists'].sum())
print("Missing:", (~label_df['exists']).sum())



Total: 10222
Found: 10222
Missing: 0


In [35]:
label_df.head()

Unnamed: 0,id,breed,breed_vec,file_path,exists
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",/content/train/000bec180eb18c7604dcecc8fe0dba0...,True
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",/content/train/001513dfcb2ffafc82cccf4d8bbaba9...,True
2,001cdf01b096e06d78e9e5112d419397,pekinese,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",/content/train/001cdf01b096e06d78e9e5112d41939...,True
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",/content/train/00214f311d5d2247d5dfe4fe24b2303...,True
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",/content/train/0021f9ceb3235effd7fcde7f7538ed6...,True


**Create Image Generators for Training/Validation**

ImageDataGenerator Configuration

In [36]:
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest'
)

Training Generator

In [37]:
train_gen = datagen.flow_from_dataframe(
    dataframe=label_df,
    x_col='file_path',
    y_col='breed',
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=32,
    subset='training',
    shuffle=True
)

Found 8178 validated image filenames belonging to 120 classes.


Validation Generator

In [38]:
val_gen = datagen.flow_from_dataframe(
    dataframe=label_df,
    x_col='file_path',
    y_col='breed',
    target_size=(224, 224),
    class_mode='categorical',
    batch_size=32,
    subset='validation',
    shuffle=False
)

Found 2044 validated image filenames belonging to 120 classes.


Load Pretrained MobileNetV2 (Transfer Learning)

In [39]:
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze base model


Add Custom Layers on Top

In [40]:
x = base_model.output
x = GlobalAveragePooling2D()(x)

x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
output = Dense(len(lb.classes_), activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=output)


Compile the Model

In [41]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [42]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
# 2. Set path to your uploaded file (update path as per your Drive)
file_path = '/content/drive/MyDrive/train.zip'

# 3. (Optional) Unzip if it's a zip file
import zipfile
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/train')  # Extract to Colab working directory


Train the Model

In [44]:
print("Train samples:", train_gen.samples)
print("Validation samples:", val_gen.samples)


Train samples: 8178
Validation samples: 2044


In [45]:
checkpoint = ModelCheckpoint('cnn_model.h5', save_best_only=True, monitor='val_accuracy', mode='max')
earlystop = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=10,
    callbacks=[checkpoint, earlystop]
)



  self._warn_if_super_not_called()


Epoch 1/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.2046 - loss: 3.6977



[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m641s[0m 2s/step - accuracy: 0.2052 - loss: 3.6939 - val_accuracy: 0.5881 - val_loss: 1.4398
Epoch 2/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5725 - loss: 1.5058



[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 2s/step - accuracy: 0.5725 - loss: 1.5058 - val_accuracy: 0.6301 - val_loss: 1.2626
Epoch 3/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.6308 - loss: 1.3011



[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m583s[0m 2s/step - accuracy: 0.6308 - loss: 1.3011 - val_accuracy: 0.6355 - val_loss: 1.2364
Epoch 4/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m668s[0m 2s/step - accuracy: 0.6489 - loss: 1.2159 - val_accuracy: 0.6345 - val_loss: 1.2295
Epoch 5/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.6645 - loss: 1.1338



[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 2s/step - accuracy: 0.6645 - loss: 1.1339 - val_accuracy: 0.6438 - val_loss: 1.2058
Epoch 6/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m604s[0m 2s/step - accuracy: 0.6676 - loss: 1.0933 - val_accuracy: 0.6399 - val_loss: 1.2459
Epoch 7/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m617s[0m 2s/step - accuracy: 0.6860 - loss: 1.0402 - val_accuracy: 0.6429 - val_loss: 1.2152
Epoch 8/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.6941 - loss: 1.0172



[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m628s[0m 2s/step - accuracy: 0.6941 - loss: 1.0173 - val_accuracy: 0.6580 - val_loss: 1.1847
Epoch 9/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m571s[0m 2s/step - accuracy: 0.7147 - loss: 0.9269 - val_accuracy: 0.6531 - val_loss: 1.1803
Epoch 10/10
[1m256/256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m598s[0m 2s/step - accuracy: 0.7169 - loss: 0.9155 - val_accuracy: 0.6531 - val_loss: 1.1605







































Load Test Images

In [46]:
file_path = '/content/drive/MyDrive/test.zip'

# 3. (Optional) Unzip if it's a zip file
import zipfile
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/test')  # Extract to Colab working directory


In [47]:
test_dir = "/content/test"
test_filenames = os.listdir(test_dir)

test_df = pd.DataFrame({
    'file_path': [os.path.join(test_dir, fname) for fname in test_filenames],
    'id': [fname.split('.')[0] for fname in test_filenames]
})

test_gen = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    dataframe=test_df,
    x_col='file_path',
    y_col=None,
    target_size=(224, 224),
    batch_size=32,
    class_mode=None,
    shuffle=False
)


Found 10357 validated image filenames.


Predict and Generate Submission File

In [48]:
preds = model.predict(test_gen, verbose=1)
submission = pd.DataFrame(preds, columns=lb.classes_)
submission.insert(0, 'id', test_df['id'])
submission.to_csv('submission.csv', index=False)


[1m324/324[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m503s[0m 2s/step


In [49]:
model.save('cnn_model.h5')





In [50]:
model.save('cnn_model.keras')   # new Keras format
