<a href="https://colab.research.google.com/github/jillianhaig/Project3_DS4002/blob/main/SCRIPTS/3_Project3Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This notebook trains and evaluates our vehicle-type classifier. It first mounts
# Google Drive and unzips the image dataset from there into the Colab runtime,
# because the image set is too large to store in the GitHub repository.

In [None]:
from google.colab import drive
from google.colab import files
import zipfile
import os
import pandas as pd

# Mount Google Drive to access the dataset
drive.mount('/content/drive')

# Path to the zip file on Google Drive (need to change for your path)
zip_file_path = '/content/drive/My Drive/vehicleimages.zip'

# Directory where you want to extract the files
extract_to_path = '/content/vehicleimages'

# Unzip the dataset
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

print(f"Dataset unzipped to: {extract_to_path}")

# Maps each vehicle-type folder name to its integer class id.
# Defined before the directory scan so that only known classes are collected.
category_map = {
    'other': 0,
    'hatchback': 1,
    'suv': 2,
    'pickup': 3,
    'sedan': 4
}

image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

image_paths = []
labels = []
skipped_folders = []

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of `df` (and every split derived from it) differ between runs.
vehicle_types = sorted(os.listdir(extract_to_path))

# For each class subdirectory, collect the image paths and the folder name as label
for vehicle_type in vehicle_types:
    vehicle_folder = os.path.join(extract_to_path, vehicle_type)

    if not os.path.isdir(vehicle_folder):
        continue

    # A folder that is not a known class (for example archive metadata such as
    # '__MACOSX') would otherwise be mapped to a NaN label further down.
    if vehicle_type not in category_map:
        skipped_folders.append(vehicle_type)
        continue

    for img_file in sorted(os.listdir(vehicle_folder)):
        if img_file.lower().endswith(image_extensions):
            image_paths.append(os.path.join(vehicle_folder, img_file))
            labels.append(vehicle_type)

if skipped_folders:
    print(f"Skipped folders without an entry in category_map: {skipped_folders}")

# Fail here with a clear message rather than later inside the train/test split.
if not image_paths:
    raise FileNotFoundError(
        f"No images found in class folders {sorted(category_map)} under {extract_to_path}; "
        "check the layout of the zip file."
    )

df = pd.DataFrame({'image_path': image_paths, 'label': labels})

# Replace the folder names in 'label' with their integer class ids.
# Every label is a key of category_map at this point, so no NaN can appear.
df['label'] = df['label'].map(category_map).astype(int)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset unzipped to: /content/vehicleimages
                                       image_path      label
0   /content/vehicleimages/hatchback/PHOTO_97.jpg  hatchback
1  /content/vehicleimages/hatchback/PHOTO_464.jpg  hatchback
2  /content/vehicleimages/hatchback/PHOTO_393.jpg  hatchback
3  /content/vehicleimages/hatchback/PHOTO_480.jpg  hatchback
4  /content/vehicleimages/hatchback/PHOTO_574.jpg  hatchback


# CNN Model Construction and Compiling

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

# `df` is the pandas DataFrame built in the data-loading cell: column 'image_path'
# holds the path of each image and column 'label' holds its class label.

# Convolutional backbone: ResNet50 with ImageNet weights and without its own classifier top
resnet50_base = ResNet50(
    input_shape=(224, 224, 3),
    include_top=False,
    weights='imagenet',
)

# Keep the backbone weights fixed while the new head is trained
resnet50_base.trainable = False

# Custom classification head stacked on top of the backbone
classification_head = [
    layers.GlobalAveragePooling2D(),        # collapse each feature map to a single value
    layers.Dense(512, activation='relu'),   # fully connected layer
    layers.Dropout(0.5),                    # regularisation against overfitting
    layers.Dense(5, activation='softmax'),  # one output unit per vehicle class (5 classes)
]

model = models.Sequential([resnet50_base, *classification_head])

# Compile with the default Adam optimizer and a one-hot (categorical) loss
adam_optimizer = tf.keras.optimizers.Adam()
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam_optimizer,
    metrics=['accuracy'],
)

# Display model architecture
model.summary()


# Initial Model Training

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Input resolution expected by the ResNet50 backbone built above
image_size = (224, 224)

# The ImageNet weights of ResNet50 expect inputs prepared with the network's own
# preprocess_input function (see the Keras Applications documentation), not pixels
# rescaled to [0, 1]. Feeding rescaled pixels to the frozen backbone leaves the
# head with uninformative features.
resnet_preprocess = tf.keras.applications.resnet50.preprocess_input

# Training images are augmented; validation images are only preprocessed.
train_datagen = ImageDataGenerator(
    preprocessing_function=resnet_preprocess,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

validation_datagen = ImageDataGenerator(preprocessing_function=resnet_preprocess)

# df['image_path'] - paths to the images
# df['label']      - class label of each image

# flow_from_dataframe with class_mode='categorical' only accepts string (or
# list/tuple) labels, so hand the generators a copy whose labels are strings.
# `df` itself is left untouched.
generator_df = df.assign(label=df['label'].astype(str))

# Train-validation split (90-10), stratified by class; seeded so that the split
# is the same on every run.
train_paths, val_paths, train_labels, val_labels = train_test_split(
    df['image_path'],
    df['label'],
    test_size=0.1,
    stratify=df['label'],
    random_state=42
)

# Batch generators for training and validation.
# The paths in 'image_path' are absolute, so `directory` does not alter them.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=generator_df.loc[train_paths.index],
    directory='/content',
    x_col='image_path',
    y_col='label',
    target_size=image_size,
    batch_size=32,
    class_mode='categorical'
)

validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=generator_df.loc[val_paths.index],
    directory='/content',
    x_col='image_path',
    y_col='label',
    target_size=image_size,
    batch_size=32,
    class_mode='categorical',
    shuffle=False  # validation order is irrelevant for the metrics; keep it deterministic
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the classification head (the backbone is frozen at this stage)
history = model.fit(
    train_generator,
    epochs=50,  # upper bound; EarlyStopping normally ends training sooner
    validation_data=validation_generator,
    verbose=1,
    callbacks=[early_stopping]
)

Found 3920 validated image filenames belonging to 5 classes.
Found 436 validated image filenames belonging to 5 classes.
Epoch 1/50


  self._warn_if_super_not_called()


[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 429ms/step - accuracy: 0.3400 - loss: 1.5115 - val_accuracy: 0.3326 - val_loss: 1.5118
Epoch 2/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 433ms/step - accuracy: 0.3403 - loss: 1.5112 - val_accuracy: 0.3716 - val_loss: 1.4847
Epoch 3/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 420ms/step - accuracy: 0.3394 - loss: 1.4994 - val_accuracy: 0.3716 - val_loss: 1.4699
Epoch 4/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 421ms/step - accuracy: 0.3495 - loss: 1.4870 - val_accuracy: 0.3830 - val_loss: 1.4624
Epoch 5/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 425ms/step - accuracy: 0.3617 - loss: 1.4802 - val_accuracy: 0.3234 - val_loss: 1.4774
Epoch 6/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 423ms/step - accuracy: 0.3658 - loss: 1.4686 - val_accuracy: 0.3739 - val_loss: 1.4493
Epoch 7/50
[1m123/12

# Fine Tuning the Model

In [None]:
# Unfreeze the top layers of ResNet50 for fine-tuning.
# The backbone was frozen as a whole (resnet50_base.trainable = False). Flipping
# only individual sub-layers while the container itself stays frozen is not the
# documented pattern and is not guaranteed to expose their weights to the
# optimizer, so re-enable the container first and then freeze everything except
# the last 10 layers.
resnet50_base.trainable = True
for layer in resnet50_base.layers[:-10]:
    layer.trainable = False

# Keep BatchNormalization layers frozen: the Keras fine-tuning guide recommends
# leaving them in inference mode, because updating their statistics on a small
# dataset disrupts what the pre-trained network has learned.
for layer in resnet50_base.layers[-10:]:
    if isinstance(layer, layers.BatchNormalization):
        layer.trainable = False

# Recompile so the changed trainable flags take effect; use a small learning
# rate so the pre-trained weights are only adjusted gently.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Fine-tune the model
history_finetune = model.fit(
    train_generator,
    epochs=50,
    validation_data=validation_generator,
    verbose=1,
    callbacks=[early_stopping]  # Add EarlyStopping
)

Epoch 1/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 495ms/step - accuracy: 0.3342 - loss: 3.1483 - val_accuracy: 0.3188 - val_loss: 1.3977
Epoch 2/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 430ms/step - accuracy: 0.4159 - loss: 1.3872 - val_accuracy: 0.4702 - val_loss: 1.4313
Epoch 3/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 425ms/step - accuracy: 0.4463 - loss: 1.3427 - val_accuracy: 0.4151 - val_loss: 1.4516
Epoch 4/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 433ms/step - accuracy: 0.4670 - loss: 1.3071 - val_accuracy: 0.5138 - val_loss: 1.2348
Epoch 5/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 431ms/step - accuracy: 0.4707 - loss: 1.2720 - val_accuracy: 0.5688 - val_loss: 1.1278
Epoch 6/50
[1m123/123[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 432ms/step - accuracy: 0.4849 - loss: 1.2633 - val_accuracy: 0.5711 - val_loss: 1.1075
Epoch 7/50

# Evaluation and Conclusion

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

# Relies on names defined in earlier cells: `df`, `train_datagen`,
# `validation_datagen`, `early_stopping`, `image_size`, and the imports
# `tf`, `layers`, `models`, `ResNet50` and `train_test_split`.

# K-fold cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []
f1_scores = []

# flow_from_dataframe with class_mode='categorical' needs string labels.
# reset_index makes the positional fold indices valid row positions.
cv_df = df.assign(label=df['label'].astype(str)).reset_index(drop=True)

# Fixed class list so every generator uses the same label -> index mapping,
# even if some split happened to miss a class.
class_names = sorted(cv_df['label'].unique())


def _flow(datagen, frame, shuffle):
    """Wrap `frame` in a batch generator; with shuffle=False batches follow the row order of `frame`."""
    return datagen.flow_from_dataframe(
        dataframe=frame,
        directory='/content',
        x_col='image_path',
        y_col='label',
        target_size=image_size,
        batch_size=32,
        class_mode='categorical',
        classes=class_names,
        shuffle=shuffle
    )


def _train_fold_model(train_gen, val_gen):
    """Build a fresh ImageNet-initialised classifier and run both training phases on it.

    A new model is required for every fold: re-fitting one shared model would
    evaluate each fold with weights that were already trained on that fold's
    test images.
    """
    base = ResNet50(weights='imagenet', include_top=False, input_shape=image_size + (3,))
    base.trainable = False

    fold_model = models.Sequential([
        base,
        layers.GlobalAveragePooling2D(),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(len(class_names), activation='softmax')
    ])

    # Phase 1: train only the classification head
    fold_model.compile(optimizer=tf.keras.optimizers.Adam(),
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    fold_model.fit(train_gen, epochs=50, validation_data=val_gen, callbacks=[early_stopping])

    # Phase 2: fine-tune the last 10 backbone layers, keeping BatchNormalization frozen
    base.trainable = True
    for layer in base.layers[:-10]:
        layer.trainable = False
    for layer in base.layers[-10:]:
        if isinstance(layer, layers.BatchNormalization):
            layer.trainable = False

    fold_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])
    fold_model.fit(train_gen, epochs=50, validation_data=val_gen, callbacks=[early_stopping])

    return fold_model


# NOTE: each fold trains a full model, so this cell is expensive to run.
for train_idx, test_idx in kfold.split(cv_df['image_path'], cv_df['label']):
    # kfold yields positional indices, hence iloc
    fold_train_df = cv_df.iloc[train_idx]
    fold_test_df = cv_df.iloc[test_idx]

    # Early stopping needs a validation set; take it from the training part so
    # the test fold plays no role in selecting the weights.
    fit_df, val_df = train_test_split(
        fold_train_df,
        test_size=0.1,
        stratify=fold_train_df['label'],
        random_state=42
    )

    # Prepare generators for training, validation and testing
    train_generator = _flow(train_datagen, fit_df, shuffle=True)
    val_generator = _flow(validation_datagen, val_df, shuffle=False)
    # shuffle=False: predictions must come back in the same order as the labels
    test_generator = _flow(validation_datagen, fold_test_df, shuffle=False)

    # Train a fresh model on the current fold
    fold_model = _train_fold_model(train_generator, val_generator)

    # Evaluate the model on the held-out fold
    y_pred = fold_model.predict(test_generator, verbose=1)  # class probabilities, one row per image
    y_true = np.asarray(test_generator.classes)             # class indices in generator order

    # Calculate accuracy and F1 score from the most probable class
    y_pred_classes = y_pred.argmax(axis=1)
    acc = accuracy_score(y_true, y_pred_classes)
    f1 = f1_score(y_true, y_pred_classes, average='weighted')

    accuracies.append(acc)
    f1_scores.append(f1)

NameError: name 'df' is not defined

# Results and Visualization

In [None]:
# Report the per-fold scores first, then their averages across all folds
result_lines = (
    ("Individual Accuracy: ", accuracies),
    ("Individual F1-Score: ", f1_scores),
    ("Mean Accuracy: ", np.mean(accuracies)),
    ("Mean F1-Score: ", np.mean(f1_scores)),
)

for caption, value in result_lines:
    print(caption, value)

In [None]:
# ROC Graph

import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score

# Gather the true labels and predicted probabilities left behind by the
# cross-validation cell (`y_true`, `y_pred`) into flat arrays
y_true_all = np.concatenate([y_true], axis=0)
y_probs_all = np.concatenate([y_pred], axis=0)

# One-vs-rest encoding of the true labels: one indicator column per class
lb = LabelBinarizer()
y_true_bin = lb.fit_transform(y_true_all)
n_classes = y_true_bin.shape[1]

fig, ax = plt.subplots(figsize=(10, 8))

# One ROC curve per class, scored against that class's predicted probability
for i in range(n_classes):
    fpr, tpr, _ = roc_curve(y_true_bin[:, i], y_probs_all[:, i])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f'Class {i} (AUC = {roc_auc:.2f})')

# Diagonal reference line: performance of a random classifier
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', lw=2)

# Labels, title and legend
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve for Multi-Class Classification')
ax.legend(loc='lower right')
plt.show()