<a href="https://colab.research.google.com/github/RyanChen12035/w281_final_galaxy_zoo/blob/main/classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# copy the filters from ex7
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import math
from mpl_toolkits.axes_grid1 import make_axes_locatable
from PIL import Image
import urllib

In [2]:
import pandas as pd
csv_file_path = '/content/drive/MyDrive/galaxy_zoo/train_data.csv'
column_names = ['path', 'index', 'label']
df_train = pd.read_csv(csv_file_path, names=column_names)

In [6]:
df_train.tail(5)

Unnamed: 0,path,index,label
23053,/content/drive/MyDrive/galaxy_zoo/Train_images...,340940,Class4
23054,/content/drive/MyDrive/galaxy_zoo/Train_images...,359896,Class4
23055,/content/drive/MyDrive/galaxy_zoo/Train_images...,980317,Class4
23056,/content/drive/MyDrive/galaxy_zoo/Train_images...,172625,Class4
23057,/content/drive/MyDrive/galaxy_zoo/Train_images...,130055,Class4


In [14]:
for i in df_train['path'][:5]:
  print(i)

path
/content/drive/MyDrive/galaxy_zoo/Train_images/Train_images/Cigar-shaped smooth/217327.jpg
/content/drive/MyDrive/galaxy_zoo/Train_images/Train_images/Cigar-shaped smooth/932331.jpg
/content/drive/MyDrive/galaxy_zoo/Train_images/Train_images/Cigar-shaped smooth/276804.jpg
/content/drive/MyDrive/galaxy_zoo/Train_images/Train_images/Cigar-shaped smooth/582339.jpg


In [24]:
# preprocess and pipeline

def grayscale_preprocessing(img):
  if np.max(img)>1:
    img = img.astype(np.float32)/255.0
  im_gray = np.mean(img, axis=2)
  return im_gray

def bilateral_preprocessing(img, ksize, sigmX, sigmY):
  bilateral_filtered_image = cv2.bilateralFilter(img, ksize, sigmX, sigmY)
  return bilateral_filtered_image

def high_pass_preprocessing(img, ksize, sig):
  low_pass = cv2.GaussianBlur(img, ksize, 0)
  high_pass = img - low_pass
  return high_pass

def sobel_preprocessing(img, ksize):
  sobelx = cv2.Sobel(img, cv2.CV_32F, 1, 0, ksize=ksize) # Find x and y gradients
  sobely = cv2.Sobel(img, cv2.CV_32F, 0, 1, ksize=ksize)
  magnitude = np.sqrt(sobelx**2.0 + sobely**2.0)
  magnitude = magnitude / np.max(magnitude) # normalize
  return magnitude

def gaussian_blur_preprocessing(img, ksize, sigma):
  blurred_im = cv2.GaussianBlur(img, ksize, sigmaX=sigma[0], sigmaY=sigma[1])
  return blurred_im

def canny_filter(img, high_threshold, low_threshold):
  # ensure the image is normalized to the range [0, 1]
  img = img / np.max(img)
  # scale the image to the range [0, 255] and convert to 8-bit
  img_8u = np.uint8(img * 255)
  # threshold the image and get the interesting points
  # The high threshold is used to identify the strong edges. Pixels with intensity gradients above this threshold are marked as strong edge pixels.
  # The low threshold is used to identify the non-edges. Pixels with intensity gradients below this threshold are suppressed.
  # By setting the high threshold too low, many pixels might be considered as strong edges, leading to a noisy edge image. Similarly, if the low threshold is set too high, you might miss genuine weak edges that should contribute to the final edge image.

  im_threshold = cv2.Canny(image=img_8u, threshold1=low_threshold, threshold2=high_threshold) # Canny Edge
  im_threshold = im_threshold / np.max(im_threshold) # normalize
  return im_threshold

def crop_center(img, cropx, cropy):
    y, x = img.shape
    startx = x // 2 - (cropx // 2)
    starty = y // 2 - (cropy // 2)
    return img[starty:starty+cropy, startx:startx+cropx]


def histogram_clipping(img, threshold_value):
  clipped_image = img.copy()
  # before
  # draw_histogram_gray(clipped_image)
  clipped_image[img < threshold_value/255.0] = 0
  # Normalize the clipped_image to 0-255 for proper histogram visualization
  normalized_clipped_image = cv2.normalize(clipped_image, None, alpha=0, beta=255,
                                             norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8UC1)
  # draw_histogram_gray(clipped_image)
  return normalized_clipped_image

def hog(img_gray, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(3, 3)):
  block_norm = 'L1'  # Block normalization method
  # orientations = 9   # Number of orientation bins
  # pixels_per_cell = (8, 8)  # Size of the cell
  # cells_per_block = (3, 3)  # Size of the block

  # Compute the HOG features
  hog_features, hog_image = feature.hog(img_gray,
                                        orientations=orientations,
                                        pixels_per_cell=pixels_per_cell,
                                        cells_per_block=cells_per_block,
                                        block_norm=block_norm,
                                        visualize=True)
  return hog_features, hog_image

def preprocess_1(df):
  image_dir = df['path'][1:] # ignore index
  preprocessed_list = []

  for i, image_path in enumerate(image_dir):
    im = np.array(Image.open(image_path))

    im_gray = grayscale_preprocessing(im)
    img_histogram_clipping = histogram_clipping(im_gray, 30)
    im_blurred = gaussian_blur_preprocessing(img_histogram_clipping, (9, 9), (10, 10))
    im_bilateral = bilateral_preprocessing(im_blurred, 5, 30, 30)
    im_sobel = sobel_preprocessing(im_bilateral, 31)
    im_canny = canny_filter(im_blurred, 70, 0)
    im_cropped = crop_center(im_canny, 252, 252)
    im_flattened = im_cropped.flatten()
    preprocessed_list.append(im_flattened)

  df['features'] = preprocessed_list
  return df['features_1']

def preprocess_2(df):
  image_dir = df['path'][1:]
  preprocessed_list = []

  for i, image_path in enumerate(image_dir):
    im = np.array(Image.open(image_path))

    im_gray = grayscale_preprocessing(im)
    img_histogram_clipping = histogram_clipping(im_gray, 30)
    im_blurred = gaussian_blur_preprocessing(img_histogram_clipping, (9, 9), (10, 10))
    im_bilateral = bilateral_preprocessing(im_blurred, 5, 30, 30)
    im_cropped = crop_center(im_bilateral, 240, 240)
    hog_feature, hog_image = hog(im_cropped) #(63504,)
    # im_flattened = hog_feature.flatten()
    preprocessed_list.append(hog_feature)

  df['features'] = preprocessed_list
  return df['features_2']

In [22]:
for image_path in df_train['path'][:2]:
  print(image_path)

path
/content/drive/MyDrive/galaxy_zoo/Train_images/Train_images/Cigar-shaped smooth/217327.jpg


In [25]:
df_preprocessed_1 = preprocess_1(df_train)
df_preprocessed_2 = preprocess_2(df_train)
df_train_preprocessed = pd.concat([df_train, df_preprocessed_1, df_preprocessed_2], ignore_index=True)

  im_threshold = im_threshold / np.max(im_threshold) # normalize


KeyboardInterrupt: 

In [None]:
df_train_preprocessed.to_csv('./df_train_processed', index=False)
df_preprocessed_1.to_csv('./df_preprocessed_1', index=False)
df_preprocessed_2.to_csv('./df_preprocessed_2', index=False)

In [None]:
pd_feature_matrix_pipeline_1_2 = pd.concate([df_preprocessed_1, df_preprocessed_2], axis=1)
feature_matrix_pipeline_1_2 = pd_feature_matrix_pipeline_1_2.values #covert to numpy array

feature_matrix_pipeline_1_2.shape

In [None]:
#pick up top 300 features by PCA

from sklearn.decomposition import PCA
top_components = 300
pca = PCA(n_components=500)
pca.fit(feature_matrix_pipeline_1_2)

plt.figure(figsize=(8, 4))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of PCA components')
plt.ylabel('Cumulative explained variance')
plt.title('Explained Variance vs Number of PCA Components')
plt.grid(True)
plt.show()

# Extract the top 300 components
top_components_matrix = pca.components_[:top_components]

In [None]:
df_features = pd.DataFrame(top_components_matrix, columns=[f'pca_{i}' for i in range(top_components_matrix.shape[1])])
df_train_processed_pca = pd.concat([df_train, df_features], ignore_index=True)
df_train_processed_pca.to_csv('./df_train_processed_pca')

Classifier