# Complete image preprocessing and feature extraction 

This notebook explores the preprocessing and feature-extraction techniques to be applied to the leaf images of the Flavia dataset.

Importing necessary libraries

In [2]:
import os
import cv2
import numpy as np
from matplotlib import pyplot as plt
import mahotas as mt
import pandas as pd
%matplotlib inline
%cd ./data

C:\Users\habangal\Desktop\JHU\Computational statistics\Project\data


In [3]:
def processFile(test_img_path):
    """Extract shape, colour and texture features from one leaf image.

    The image is read with OpenCV, converted to grayscale, blurred,
    Otsu-thresholded (inverted) and morphologically closed; contours are then
    extracted from the closed mask and one of them is used for the shape
    features.

    Parameters
    ----------
    test_img_path : str
        Path of the image file to process.

    Returns
    -------
    list
        49 values, in this order: image height and width, number of contours,
        number of points in the selected contour, the 24 contour moments
        (spatial ``m*``, central ``mu*``, normalised ``nu*``), area,
        perimeter, aspect ratio, rectangularity, circularity, equivalent
        diameter, fitted-ellipse centre x/y, the two ellipse axis lengths and
        its angle, per-channel mean (R, G, B), per-channel standard deviation
        (R, G, B), and four Haralick features (indices 1, 2, 4 and 8 of the
        direction-averaged feature vector).

    Raises
    ------
    ValueError
        If the image cannot be read or no contour is found.
    ZeroDivisionError
        If the selected contour has zero area (rectangularity and circularity
        divide by the area).

    Other exceptions raised by OpenCV / mahotas propagate unchanged.
    """
    main_img = cv2.imread(test_img_path)
    if main_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError('could not read image: ' + str(test_img_path))

    img = cv2.cvtColor(main_img, cv2.COLOR_BGR2RGB)
    gs = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    result = [gs.shape[0], gs.shape[1]]  # height, width

    # Segmentation: blur -> inverted Otsu threshold -> morphological closing.
    blur = cv2.GaussianBlur(gs, (25, 25), 0)
    _, im_bw_otsu = cv2.threshold(
        blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    kernel = np.ones((50, 50), np.uint8)
    closing = cv2.morphologyEx(im_bw_otsu, cv2.MORPH_CLOSE, kernel)

    # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
    # (contours, hierarchy) on 4.x; the last two items are the same in both.
    contours, _ = cv2.findContours(
        closing, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]
    if len(contours) == 0:
        raise ValueError('no contours found in image: ' + str(test_img_path))
    result.append(len(contours))  # number of contours

    # NOTE(review): the second contour is used whenever more than one exists;
    # presumably this is meant to pick the leaf outline -- confirm, or select
    # the largest contour by area instead.
    cnt = contours[1] if len(contours) > 1 else contours[0]
    result.append(len(cnt))  # number of points in the contour ("edges")

    # The contour/box/ellipse drawings of the original were debugging aids
    # whose results were never used.  They are intentionally dropped:
    # cv2.drawContours draws in place, and drawing on `gs` polluted the
    # grayscale image that the Haralick texture features are computed from.

    M = cv2.moments(cnt)
    moment_keys = (
        "m00", "m10", "m01", "m20", "m11", "m02", "m30", "m21", "m12", "m03",
        "mu20", "mu11", "mu02", "mu30", "mu21", "mu12", "mu03",
        "nu20", "nu11", "nu02", "nu30", "nu21", "nu12", "nu03",
    )
    result.extend(M[key] for key in moment_keys)

    area = cv2.contourArea(cnt)
    result.append(area)  # area

    perimeter = cv2.arcLength(cnt, True)
    result.append(perimeter)  # perimeter

    _, _, w, h = cv2.boundingRect(cnt)
    result.append(float(w) / h)  # aspect_ratio
    result.append(w * h / area)  # rectangularity
    result.append((perimeter ** 2) / area)  # circularity
    result.append(np.sqrt(4 * area / np.pi))  # equi_diameter

    # Fitted ellipse: centre, the two axis lengths and the rotation angle.
    (x, y), (MA, ma), angle = cv2.fitEllipse(cnt)
    result.extend([x, y, MA, ma, angle])

    # Colour statistics per RGB channel.  Values equal to 255 are zeroed in
    # each channel independently before the statistics are taken (presumably
    # to suppress the white background -- confirm).
    channels = []
    for idx in range(3):  # 0 = red, 1 = green, 2 = blue (img is RGB)
        channel = img[:, :, idx].copy()
        channel[channel == 255] = 0
        channels.append(channel)
    result.extend(np.mean(channel) for channel in channels)
    # These feed the '*Var' columns downstream but are standard deviations.
    result.extend(np.std(channel) for channel in channels)

    # Haralick texture features on the untouched grayscale image, averaged
    # over the directions returned by mahotas.
    textures = mt.features.haralick(gs)
    ht_mean = textures.mean(axis=0)
    result.extend([ht_mean[1], ht_mean[2], ht_mean[4], ht_mean[8]])

    return result

In [None]:
# Run feature extraction over every entry of the current directory and write
# the collected feature rows to ../data.csv.
files = os.listdir('.')
cols = ['height', 'width', 'NumContours', 'NumEdges',
        'm00', 'm10', 'm01', 'm20', 'm11', 'm02', 'm30', 'm21', 'm12', 'm03', 'mu20', 'mu11', 'mu02', 'mu30', 'mu21', 'mu12', 'mu03', 'nu20', 'nu11', 'nu02', 'nu30', 'nu21', 'nu12', 'nu03',
        'area', 'perimeter', 'aspectRatio', 'rectangularity', 'circularity', 'equiDiameter', 'x', 'y', 'MA', 'ma', 'angle',
        'redMean', 'greenMean', 'blueMean', 'redVar', 'greenVar', 'blueVar', 'contrast', 'correlation', 'inverseDifferenceMoments', 'entropy']

lst = []
processed = 0
for f in files:
    try:
        lst.append(processFile(f))
    except Exception as e:
        # Best-effort batch: report the failing file together with the reason
        # and continue.  Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop the run.
        print('error processing ' + f + ': ' + repr(e))
    else:
        processed = processed + 1
        print('Processed ' + str(processed) + ' of ' + str(len(files)))

# NOTE(review): failed files are skipped, so row numbers in the CSV do not
# map back to positions in `files`.
df1 = pd.DataFrame(lst, columns=cols)
df1.to_csv('../data.csv')
print('Done!')

Processed 1 of 524
Processed 2 of 524
Processed 3 of 524
Processed 4 of 524
Processed 5 of 524
Processed 6 of 524
Processed 7 of 524
Processed 8 of 524
Processed 9 of 524
Processed 10 of 524
Processed 11 of 524
Processed 12 of 524
Processed 13 of 524
Processed 14 of 524
Processed 15 of 524
Processed 16 of 524
Processed 17 of 524
Processed 18 of 524
Processed 19 of 524
Processed 20 of 524
Processed 21 of 524
Processed 22 of 524
Processed 23 of 524
Processed 24 of 524
Processed 25 of 524
Processed 26 of 524
Processed 27 of 524
Processed 28 of 524
Processed 29 of 524
Processed 30 of 524
Processed 31 of 524
Processed 32 of 524
Processed 33 of 524
Processed 34 of 524
Processed 35 of 524
Processed 36 of 524
Processed 37 of 524
Processed 38 of 524
Processed 39 of 524
Processed 40 of 524
Processed 41 of 524
Processed 42 of 524
Processed 43 of 524
Processed 44 of 524
Processed 45 of 524
Processed 46 of 524
Processed 47 of 524
Processed 48 of 524
Processed 49 of 524
Processed 50 of 524
Processed