# Animal Shelter - Dog Classification

In [11]:
# Imports
import os
import sys

# This Windows installation keeps TensorFlow in a non-standard directory, so
# that directory has to be on the import path before `import tensorflow`.
# The guard stops re-runs of this cell from stacking duplicate entries on
# sys.path and skips the entry on machines where the directory does not exist.
TF_INSTALL_DIR = r'C:\tf_temp'
if os.path.isdir(TF_INSTALL_DIR) and TF_INSTALL_DIR not in sys.path:
    sys.path.insert(0, TF_INSTALL_DIR)

import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

print(f"TensorFlow version: {tf.__version__}")
print("All imports successful!")

TensorFlow version: 2.20.0
All imports successful!


## Data Preparation

In [12]:
# Test TensorFlow installation from C:\tf_temp
import os
import sys

# Put the local TensorFlow directory on the import path only when it exists
# and is not already listed, so re-running this cell does not keep adding
# duplicate sys.path entries.
TF_INSTALL_DIR = r'C:\tf_temp'
if os.path.isdir(TF_INSTALL_DIR) and TF_INSTALL_DIR not in sys.path:
    sys.path.insert(0, TF_INSTALL_DIR)

try:
    import tensorflow as tf
    print(f"TensorFlow version: {tf.__version__}")
    print("TensorFlow imported successfully!")

    # Test basic functionality: build a constant string tensor and print it
    hello = tf.constant('Hello, TensorFlow!')
    print(f"Test tensor: {hello}")

except ImportError as e:
    # Only import problems are reported here; any other error still surfaces
    print(f"TensorFlow import failed: {e}")
    print("Try the manual registry edit option instead.")

TensorFlow version: 2.20.0
TensorFlow imported successfully!
Test tensor: b'Hello, TensorFlow!'


In [13]:
# Load the label table from a path relative to the working directory.
# Later cells read its 'id' and 'breed' columns.
labels_path = 'data/labels.csv'
labels=pd.read_csv(labels_path)

In [14]:
# Build one image path per row of the label table: <image_dir><id>.jpg
image_dir = 'data/train/'
filenames = [image_dir + image_id + '.jpg' for image_id in labels['id']]

# Collect every path that is not an existing regular file and report the count
missing_files = list(filter(lambda path: not os.path.isfile(path), filenames))
print(f"Missing files: {len(missing_files)}")

Missing files: 0


In [15]:
# Set variables
# X is the list of image file paths; y is the 'breed' column of the label
# table. The paths were built from the 'id' column of that same table, so X
# and y share the table's row order.
X = filenames
y = labels['breed']

In [16]:
# One Hot Encode
# Dense (non-sparse) output: one row per image, one column per breed category
encoder = OneHotEncoder(sparse_output=False)

# Learn the breed categories, then encode the same single-column frame
encoder.fit(labels[['breed']])
y_encoded = encoder.transform(labels[['breed']])

# Categories of the single encoded column ('breed')
encoding_labels = encoder.categories_[0]

In [17]:
# Split data
# Split the one-hot encoded labels (y_encoded) rather than the raw breed
# strings in y, so the label arrays handed to the data pipeline are numeric
# and match the encoder's category columns. A fixed random_state makes the
# train/validation split identical on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Subset for building: at most the first 1000 paths of each split
X_train_subset = X_train[:1000]
X_val_subset = X_val[:1000]

## Preprocessing Images

In [18]:
#parameters for image and batch size
IMG_SIZE = 224   # side length used for both height and width in tf.image.resize
BATCH_SIZE = 32  # default batch_size of create_data_batches

In [19]:
#Load one image file and turn it into a fixed-size float tensor
def process_image(image_path):
  """Read the JPEG at ``image_path`` and return it as a resized float image.

  The file contents are decoded with 3 colour channels, converted to
  ``tf.float32`` and resized to ``IMG_SIZE`` x ``IMG_SIZE``.
  """
  raw_bytes = tf.io.read_file(image_path)
  rgb = tf.image.decode_jpeg(raw_bytes, channels=3)
  rgb_float = tf.image.convert_image_dtype(rgb, tf.float32)
  return tf.image.resize(rgb_float, size=[IMG_SIZE, IMG_SIZE])

In [20]:
#Pair a preprocessed image with its label
def get_image_label(image_path, label):
  """Return ``(image, label)``: the image is ``process_image(image_path)``,
  the label is passed through unchanged."""
  return process_image(image_path), label

In [27]:
# Create datasets from the file paths and labels
def create_data_batches(X, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
    """Turn image file paths (and labels) into a batched ``tf.data.Dataset``.

    Args:
        X: Sequence of image file paths.
        y: Labels aligned with ``X``. Required unless ``test_data`` is True.
        batch_size: Number of elements per batch.
        valid_data: If True, build an unshuffled dataset of (image, label) pairs.
        test_data: If True, build an unshuffled dataset of images only; ``y`` is
            ignored. Takes precedence over ``valid_data``.

    Returns:
        A batched dataset yielding images (test data) or (image, label)
        tuples (training and validation data). Only training data is shuffled.

    Raises:
        ValueError: If labels are needed (training or validation) but ``y`` is None.
    """
    # If the data is a test dataset, we probably don't have labels
    if test_data:
        print("Creating test data batches...")
        data = tf.data.Dataset.from_tensor_slices(tf.constant(X))  # only filepaths (no labels)
        data = data.map(process_image, num_parallel_calls=tf.data.AUTOTUNE)
        return data.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Fail early with a clear message instead of an opaque error from tf.constant(None)
    if y is None:
        raise ValueError("y (labels) must be provided unless test_data=True")

    if valid_data:
        print("Creating validation data batches...")
    else:
        print("Creating training data batches...")

    # Turn filepaths and labels into a dataset of (path, label) pairs
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X),   # filepaths
                                               tf.constant(y)))  # labels

    # Validation data keeps its order; training data is shuffled. Shuffling the
    # path strings before mapping the image processor is faster than shuffling
    # the decoded images.
    if not valid_data:
        data = data.shuffle(buffer_size=len(X))

    # Create (image, label) tuples (this also turns each path into a preprocessed
    # image). The parallel map keeps element order by default.
    data = data.map(get_image_label, num_parallel_calls=tf.data.AUTOTUNE)

    # Batch, then prefetch so preprocessing can overlap with consumption
    return data.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [22]:
# Build the batched training and validation datasets from the building subsets
print("Creating training and validation data batches...")

# Labels are cut to the same length as the matching path subset, so paths and
# labels always pair up one-to-one.

# Training batches (shuffled inside create_data_batches)
train_data = create_data_batches(X=X_train_subset, y=y_train[:len(X_train_subset)])

# Validation batches (order is kept)
val_data = create_data_batches(X=X_val_subset, y=y_val[:len(X_val_subset)], valid_data=True)

# len() of a batched dataset is its number of batches
print(f"Training batches: {len(train_data)}")
print(f"Validation batches: {len(val_data)}")

Creating training and validation data batches...
Creating training data batches...
Creating validation data batches...
Training batches: 32
Validation batches: 32
