In [1]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from PIL import Image
import os
import pickle

In [2]:
from google.colab import drive
drive.mount('/content/drive')
!ls '/content/drive/MyDrive/Colab Notebooks/cs482FINALPROJECT'
import shutil

source_folder = '/content/drive/MyDrive/Colab Notebooks/cs482FINALPROJECT'
destination_folder = '/content/data'


Mounted at /content/drive
archive  csvs  LogisticReg.ipynb


In [3]:
# Load CSV Files
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cs482FINALPROJECT/csvs/cardatasettrain.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cs482FINALPROJECT/csvs/cardatasettest.csv')

# Clean DataFrames
train_df_clean = train_df.drop(columns=['Unnamed: 0'])

# Remove quotes from image filenames in the test DataFrame
if 'Unnamed: 0' in test_df.columns:
    test_df_clean = test_df.drop(columns=['Unnamed: 0'])
else:
    test_df_clean = test_df  # Assign original DataFrame if column not found

# Fix formatting issue in the test DataFrame
if 'image' in test_df_clean.columns:
    test_df_clean['image'] = test_df_clean['image'].str.strip("'")


In [5]:
import os
import numpy as np
from PIL import Image
from concurrent.futures import ThreadPoolExecutor

# Define image loading function for a single image
def load_single_image(img_name, folder_path, img_size=(64, 64)):
    img_path = os.path.join(folder_path, img_name)
    try:
        img = Image.open(img_path).resize(img_size).convert('RGB')  # Ensure RGB
        return np.array(img).flatten()  # Flatten the image
    except Exception as e:
        print(f"Error loading image {img_name}: {e}")
        return None

# Define parallelized image loading function
def load_images_parallel(dataframe, folder_path, img_size=(64, 64)):
    images = []
    with ThreadPoolExecutor() as executor:
        # Parallelize loading with ThreadPoolExecutor
        results = list(executor.map(lambda img_name: load_single_image(img_name, folder_path, img_size), dataframe['image']))
        # Filter out any None results due to loading errors
        images = [img for img in results if img is not None]
    return np.array(images, dtype=np.float32)

# Load training and testing images in parallel
train_images = load_images_parallel(train_df_clean, "/content/drive/MyDrive/Colab Notebooks/cs482FINALPROJECT/archive/cars_train/cars_train")
test_images = load_images_parallel(test_df_clean, "/content/drive/MyDrive/Colab Notebooks/cs482FINALPROJECT/archive/cars_test/cars_test")

# Verify loaded image shapes
print(f"Train Images Shape: {train_images.shape}")
print(f"Test Images Shape: {test_images.shape}")

Train Images Shape: (8144, 12288)
Test Images Shape: (8041, 12288)


In [6]:
# Extract bounding box features
train_boxes = train_df_clean[['x1', 'y1', 'x2', 'y2']].values
test_boxes = test_df_clean[['x1', 'y1', 'x2', 'y2']].values

# Combine image features with bounding boxes
X_train_full = np.hstack((train_boxes, train_images))
X_test = np.hstack((test_boxes, test_images))

# Encode labels
label_encoder = LabelEncoder()

if 'Class' in train_df_clean.columns:
    y_train_encoded = label_encoder.fit_transform(train_df_clean['Class'].values)
elif 'class' in train_df_clean.columns:
    print("Warning: Using column 'class' instead of 'Class'")
    y_train_encoded = label_encoder.fit_transform(train_df_clean['class'].values)
else:
    raise KeyError("Neither 'Class' nor 'class' column found in the DataFrame")

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_encoded, test_size=0.2, random_state=42
)



In [7]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

# Sample a small subset of images (e.g., 500 images randomly from the full dataset)
sample_size = 500  # Adjust this number as needed for a smaller subset
indices = np.random.choice(X_train.shape[0], size=sample_size, replace=False)
X_train_sampled = X_train[indices]
y_train_sampled = y_train[indices]

# Reduce dimensions using PCA
pca = PCA(n_components=500)  # Reduce to 500 components (you can adjust this)
X_train_pca = pca.fit_transform(X_train_sampled)

# Initialize the Logistic Regression classifier with the 'saga' solver and parallelization
logreg_model = LogisticRegression(
    max_iter=1000,              # Increase iterations if necessary for convergence
    multi_class='multinomial',  # Multi-class classification
    solver='saga',              # Use 'saga' for faster convergence with large datasets
    n_jobs=-1,                  # Use all available CPU cores for parallel processing
    random_state=42
)

# Train the model
logreg_model.fit(X_train_pca, y_train_sampled)




In [8]:
import numpy as np
from sklearn.metrics import accuracy_score

# Sample a smaller subset from the validation set (e.g., 500 samples for quick testing)
sample_size = 500  # Adjust this number based on your needs
val_indices = np.random.choice(X_val.shape[0], size=sample_size, replace=False)
X_val_sampled = X_val[val_indices]
y_val_sampled = y_val[val_indices]

# Apply PCA transformation to the validation subset before prediction
X_val_sampled_pca = pca.transform(X_val_sampled)  # Apply the same PCA transformation used during training

# Predict on the sampled validation set using Logistic Regression
y_val_pred = logreg_model.predict(X_val_sampled_pca)  # Use the transformed data for prediction

# Calculate validation accuracy on the subset
val_accuracy = accuracy_score(y_val_sampled, y_val_pred)
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

# Inspect predictions and true labels for the subset
print("Predictions (first 10):", y_val_pred[:10])
print("True Labels (first 10):", y_val_sampled[:10])

# Calculate Train Data Accuracy
# Apply PCA transformation to the training set before prediction
X_train_pca = pca.transform(X_train)  # Apply PCA transformation to the entire training set
y_train_pred = logreg_model.predict(X_train_pca)

# Calculate accuracy on the full training set
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Train Accuracy: {train_accuracy * 100:.2f}%')


Validation Accuracy: 2.00%
Predictions (first 10): [152  77 118  32 107 101 177 126  90   5]
True Labels (first 10): [110  62  18 170 194 123  24   9 112  37]
Train Accuracy: 9.09%


In [9]:
import pickle

# Save the trained Logistic Regression model
model_filename = "logreg_car_model.pkl"
with open(model_filename, 'wb') as file:
    pickle.dump(logreg_model, file)

print(f"Model saved as {model_filename}")


Model saved as logreg_car_model.pkl
