# Baseline Model

I will use the test set to measure the accuracy of these baseline models. Note, however, that hyperparameter tuning and model selection for the more complex models will rely on validation sets or cross-validation instead, so that the test set stays untouched and the final model's reported performance is not biased.

In [13]:
# Import the packages needed for scikit-learn, Keras, and TensorFlow image classification
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from PIL import Image
from sklearn.preprocessing import StandardScaler

## Majority Class Classifier

Our minimal lower bound is a majority-class classifier. Because scratch is the most common class, we always predict that an image shows a scratch.

In [12]:
# Majority-class baseline: ignore the image content and predict one fixed
# class for every test example, then score against the test labels.
MAJORITY_LABEL = 2  # presumably the "scratch" class id described in the markdown above -- TODO confirm
test_labels_path = '../data/processed/test/labels.csv'

true_labels = pd.read_csv(test_labels_path)['label'].values
pred_labels = np.full(len(true_labels), MAJORITY_LABEL)

# Compute both metrics first, then report them in the same order as before.
f1_weighted = f1_score(true_labels, pred_labels, average='weighted')
accuracy = accuracy_score(true_labels, pred_labels)

print(f'F1: {f1_weighted:.3f}')
print(f'Acc: {accuracy:.3f}')

F1: 0.161
Acc: 0.326


## Logistic Regression

In [14]:
def load_split(split, data_root='../data/processed'):
    """Load one processed data split as NumPy arrays.

    Reads ``<data_root>/<split>/labels.csv`` and, for every ``image_id`` in
    it, opens ``<data_root>/<split>/images/<image_id>.jpg``.

    Parameters
    ----------
    split : str
        Name of the split directory, e.g. ``'train'`` or ``'test'``.
    data_root : str
        Directory that contains the split directories.

    Returns
    -------
    images : np.ndarray
        Pixel data scaled to [0, 1] as float32, stacked along a new first
        axis so that ``images[i]`` belongs to row ``i`` of ``labels.csv``.
    labels : np.ndarray
        Values of the ``label`` column, in the same row order.

    Raises
    ------
    ValueError
        From ``np.stack`` if the split has no images or the images do not
        all share one shape.
    """
    labels_df = pd.read_csv(f'{data_root}/{split}/labels.csv')
    images = []
    for image_id in labels_df['image_id']:
        # The context manager closes the file handle once pixels are copied.
        with Image.open(f'{data_root}/{split}/images/{image_id}.jpg') as image:
            images.append(np.asarray(image, dtype=np.float32) / 255.0)
    # Stack once at the end: calling np.append inside the loop re-allocates the
    # whole array every iteration and, without an axis, flattens all images
    # into a single 1-D vector.
    return np.stack(images), labels_df['label'].values


# NOTE(review): assumes both train/labels.csv and test/labels.csv provide
# 'image_id' and 'label' columns and that test images live under
# test/images/, mirroring the train layout -- confirm against the data.
X_train, y_train = load_split('train')
X_test, y_test = load_split('test')

# Keep the original variable name available; it now refers to the stacked
# (n_images, ...) training array in labels.csv row order.
img_data = X_train

assert len(X_train) == len(y_train), "train images and labels are misaligned"
assert len(X_test) == len(y_test), "test images and labels are misaligned"

In [None]:


# Logistic regression works on one feature vector per sample, so collapse
# every image into a single row while keeping the sample axis intact.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)
assert X_train_flat.shape[1] == X_test_flat.shape[1], "train/test images differ in size"

# max_iter is raised to 1000 to give the solver more iterations to converge.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_flat, y_train)
y_pred = clf.predict(X_test_flat)

accuracy = accuracy_score(y_test, y_pred)
print(f"Baseline Accuracy (Logistic Regression): {accuracy}")

# Report weighted F1 as well, so this baseline can be compared with the
# majority-class baseline on the same two metrics.
logreg_f1_weighted = f1_score(y_test, y_pred, average='weighted')
print(f"Baseline F1 (Logistic Regression): {logreg_f1_weighted:.3f}")

## Shallow CNN

In [None]:
# Sequential, Conv2D, MaxPooling2D, Flatten and Dense come from the import
# cell at the top of the notebook, so they are not re-imported here.

# Conv2D needs (samples, height, width, channels) input. A stack of grayscale
# images has no channel axis, so add one when it is missing.
X_train_cnn = X_train[..., np.newaxis] if X_train.ndim == 3 else X_train
X_test_cnn = X_test[..., np.newaxis] if X_test.ndim == 3 else X_test
image_height, image_width, num_channels = X_train_cnn.shape[1:]

# Pick the loss that matches how the labels are encoded: 1-D integer class
# ids need the sparse loss, while one-hot rows need the dense one.
if np.ndim(y_train) == 1:
    # NOTE(review): assumes integer class ids in 0..K-1 -- confirm against labels.csv
    num_classes = int(max(np.max(y_train), np.max(y_test))) + 1
    loss_name = 'sparse_categorical_crossentropy'
else:
    num_classes = y_train.shape[1]
    loss_name = 'categorical_crossentropy'

# Two conv/pool stages followed by a small dense classifier head.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(image_height, image_width, num_channels)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss=loss_name, metrics=['accuracy'])
# The test split doubles as validation data here because this is only a
# baseline measurement, not model selection.
model.fit(X_train_cnn, y_train, epochs=5, batch_size=32, validation_data=(X_test_cnn, y_test))
