In [1]:
import pandas as pd 
import numpy as np 
import os 
import random
import pickle 
import sklearn

In [2]:
# Load the label table and enumerate every file in the training image folder.
training_dataset = pd.read_csv('train.csv')
training_images_path = '/Programs/Image_Processing/Train'

# List the directory once and derive the count from that same listing, so the
# printed total can never disagree with the paths held in `training_list`.
training_list = [training_images_path + '/' + image for image in os.listdir(training_images_path)]
num_training_img = len(training_list)
print(f'We have a total of {num_training_img} training images.')

We have a total of 19906 training images.


In [3]:
# Number of rows per class label; the classes are not evenly represented.
class_column = training_dataset['Class']
class_column.value_counts()

MIDDLE    10804
YOUNG      6706
OLD        2396
Name: Class, dtype: int64

In [4]:
# Partition the image IDs by class label.
# Each *_mask is a boolean Series over the rows of `training_dataset`;
# each *_mask_list is the 'ID' column restricted to the rows of that class.
class_labels = training_dataset['Class']
image_ids = training_dataset['ID']

middle_mask = class_labels == "MIDDLE"
young_mask = class_labels == "YOUNG"
old_mask = class_labels == "OLD"

middle_mask_list = image_ids[middle_mask]
young_mask_list = image_ids[young_mask]
old_mask_list = image_ids[old_mask]

In [5]:
# Turn each class's image IDs into paths under the training image folder.
def _class_image_paths(image_ids):
    """Return '<training_images_path>/<id>' for every id in `image_ids`."""
    return ['/'.join((training_images_path, image_id)) for image_id in image_ids]

middle_training_list = _class_image_paths(middle_mask_list)
young_training_list = _class_image_paths(young_mask_list)
old_training_list = _class_image_paths(old_mask_list)

In [6]:
# Balance the classes by undersampling: draw the same number of images from
# middle, young and old, without replacement.
#
# The sample size is the size of the smallest class (2396 for the class counts
# shown above) instead of a hard-coded literal, so random.sample cannot raise
# ValueError if the dataset changes.
sample_size = min(len(middle_training_list), len(young_training_list), len(old_training_list))

# A dedicated, seeded generator makes the balanced subset (and therefore every
# downstream score) reproducible across kernel restarts without touching the
# global `random` state. 32 matches the random_state used for the split and CV.
sampler = random.Random(32)
middle = sampler.sample(middle_training_list, sample_size)
young = sampler.sample(young_training_list, sample_size)
old = sampler.sample(old_training_list, sample_size)

In [7]:
# Rebuild the working frame from the balanced samples.
# Numeric class codes: young -> 0, middle -> 1, old -> 2.
labelled_groups = ((0, young), (1, middle), (2, old))

# One (class code, image path) tuple per sampled image, grouped in the order above.
data = [
    (class_code, image_url)
    for class_code, group in labelled_groups
    for image_url in group
]

df = pd.DataFrame(data, columns=['Class', 'Image_path'])

In [8]:
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import local_binary_pattern
from skimage.filters import prewitt_h, prewitt_v

# Local binary pattern settings: 8 sampling points on a circle of radius 1.
METHOD = 'uniform'
radius = 1
n_points = 8 * radius

# One row per successfully processed image. Each row concatenates four
# flattened 100x100 maps (pixels, LBP, horizontal edges, vertical edges),
# i.e. 40,000 values.
all_features = []

# Index labels of the `df` rows whose image could not be processed.
failed_rows = []

count = 0

for count, (row_label, image_url) in enumerate(df['Image_path'].items(), start=1):
    try:
        print(count, end = ' ')
        # Read the image from disk
        image = imread(image_url)
        print(image.shape)

        # Collapse multi-channel images to 2D by averaging over the channel axis
        if len(image.shape) == 3:
            image = np.mean(image, axis=2)

        # Bring every image to a common 100x100 size
        image = resize(image, (100,100))

        # Raw grayscale pixel values, flattened into a single 1D vector
        rgb_feature = image.flatten()

        # LBP feature extraction
        gray_image = image
        lbp_image = local_binary_pattern(gray_image, n_points, radius, METHOD)
        lbp_features = np.reshape(lbp_image, (100*100))

        # Horizontal edges (Prewitt filter)
        horizontal_edges = prewitt_h(gray_image)
        horizontal_edges = np.reshape(horizontal_edges, (100*100))

        # Vertical edges (Prewitt filter)
        vertical_edges = prewitt_v(gray_image)
        vertical_edges = np.reshape(vertical_edges, (100*100))

        # Combine all the features into one row
        all_features.append(np.concatenate((rgb_feature, lbp_features, horizontal_edges, vertical_edges)))

    except Exception as e:
        # Best effort: report and skip the image, but remember which row it was
        # so its label can be removed as well.
        failed_rows.append(row_label)
        print(f'Error processing image {image_url}: {str(e)}')

# Keep labels aligned with features. A skipped image contributes no row to
# `all_features`, so its row must also leave `df`; otherwise the label vector
# built from df['Class'] is longer than the feature matrix and the
# train/test split fails on inconsistent sample counts.
if failed_rows:
    df = df.drop(index=failed_rows).reset_index(drop=True)
    print(f'Dropped {len(failed_rows)} image(s) that could not be processed; {len(df)} remain.')

1 (219, 138, 3)
2 (136, 102, 3)
3 (121, 75, 3)
4 (173, 183, 3)
5 (173, 121, 3)
6 (182, 139, 3)
7 (166, 96, 3)
8 (94, 56, 3)
9 (69, 52, 3)
10 (31, 26, 3)
11 (138, 118, 3)
12 (321, 166, 3)
13 (123, 101, 3)
14 (22, 17, 3)
15 (97, 77, 3)
16 (58, 44, 3)
17 (133, 104, 3)
18 (54, 37, 3)
19 (38, 27, 3)
20 (17, 17, 3)
21 (51, 43, 3)
22 (108, 93, 3)
23 (85, 66, 3)
24 (110, 84, 3)
25 (159, 113, 3)
26 (89, 55, 3)
27 (109, 79, 3)
28 (35, 32, 3)
29 (413, 299, 3)
30 (44, 28, 3)
31 (142, 87, 3)
32 (43, 30, 3)
33 (56, 40, 3)
34 (99, 88, 3)
35 (222, 193, 3)
36 (340, 345, 3)
37 (201, 141, 3)
38 (97, 114, 3)
39 (99, 146, 3)
40 (44, 46, 3)
41 (159, 150, 3)
42 (198, 134, 3)
43 (275, 217, 3)
44 (73, 68, 3)
45 (95, 63, 3)
46 (108, 90, 3)
47 (171, 100, 3)
48 (169, 114, 3)
49 (280, 238, 3)
50 (52, 43, 3)
51 (259, 155, 3)
52 (170, 149, 3)
53 (339, 317, 3)
54 (92, 74, 3)
55 (55, 34, 3)
56 (227, 181, 3)
57 (36, 26, 3)
58 (66, 42, 3)
59 (285, 212, 3)
60 (144, 158, 3)
61 (223, 181, 3)
62 (330, 246, 3)
63 (31, 22, 3)

In [9]:
# Feature matrix: one row per entry of `all_features`.
X = np.asarray(all_features)
# Label vector: an independent copy of the numeric class codes held in `df`.
Y = np.array(df.loc[:, 'Class'].to_numpy())

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 25% of the samples for testing; the fixed seed keeps the split stable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=32,
)

# Standardize the features. The scaler learns its statistics from the training
# portion only and is then applied to both portions.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)


In [11]:
# Define Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

# Every estimator with a stochastic component gets the same fixed seed as the
# train/test split (32), so a fresh "Restart & Run All" reproduces the scores.
# For the SVCs the seed governs the data shuffling done for probability=True.
dt = DecisionTreeClassifier(random_state=32)

knn = KNeighborsClassifier(n_neighbors=10, n_jobs=-1)

gnb = GaussianNB()

rfc = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=32)

linear_svm = svm.SVC(kernel='linear', decision_function_shape='ovo', probability=True, random_state=32)

rbf_svm = svm.SVC(kernel='rbf', decision_function_shape='ovo', probability=True, random_state=32)

poly_svm = svm.SVC(kernel='poly', decision_function_shape='ovo', probability=True, random_state=32)

sig_svm = svm.SVC(kernel='sigmoid', decision_function_shape='ovo', probability=True, random_state=32)

# Fit each model on the standardized training data, in definition order.
for model in (dt, knn, gnb, rfc, linear_svm, rbf_svm, poly_svm, sig_svm):
    model.fit(X_train, Y_train)

In [12]:
# 5-fold cross validation on the training portion, scored by accuracy.
cv = KFold(n_splits=5, shuffle=True, random_state=32)

def _cv_accuracy(estimator):
    """Return the per-fold accuracy of `estimator` on the training data using `cv`."""
    return cross_val_score(estimator, X_train, Y_train, cv=cv, scoring='accuracy', n_jobs=-1)

dt_cv = _cv_accuracy(dt)
knn_cv = _cv_accuracy(knn)
gnb_cv = _cv_accuracy(gnb)
rfc_cv = _cv_accuracy(rfc)
linear_cv = _cv_accuracy(linear_svm)
rbf_cv = _cv_accuracy(rbf_svm)
poly_cv = _cv_accuracy(poly_svm)
sig_cv = _cv_accuracy(sig_svm)

# Report the mean fold accuracy of each model, rounded to two decimals.
cv_results = (
    ('DT', dt_cv),
    ('KNN', knn_cv),
    ('GNB', gnb_cv),
    ('RFC', rfc_cv),
    ('Linear SVM', linear_cv),
    ('RBF SVM', rbf_cv),
    ('Poly SVM', poly_cv),
    ('Sigmoid SVM', sig_cv),
)
for model_label, fold_scores in cv_results:
    print(f'{model_label} Average Accuracy: {round(fold_scores.mean(),2)}')

DT Average Accuracy: 0.45
KNN Average Accuracy: 0.4
GNB Average Accuracy: 0.48
RFC Average Accuracy: 0.59
Linear SVM Average Accuracy: 0.5
RBF SVM Average Accuracy: 0.59
Poly SVM Average Accuracy: 0.48
Sigmoid SVM Average Accuracy: 0.54


In [13]:
# Predict the held-out test set with every fitted model (same order as the names).
(
    dt_pred,
    knn_pred,
    gnb_pred,
    rfc_pred,
    linear_pred,
    rbf_pred,
    poly_pred,
    sig_pred,
) = [
    fitted.predict(X_test)
    for fitted in (dt, knn, gnb, rfc, linear_svm, rbf_svm, poly_svm, sig_svm)
]

In [14]:
# Classification Report
def create_classification_report(model, y_test, y_pred):
    """Print a titled classification report.

    Args:
        model: Label printed on its own line above the report.
        y_test: True class labels.
        y_pred: Predicted class labels, compared against `y_test`.

    Prints the label, the classification_report text, then two blank lines.
    """
    report_text = classification_report(y_test, y_pred)
    # Joining with newlines emits each piece on its own line; the final '\n'
    # item plus print's own line ending produce the trailing blank lines.
    print(model, report_text, '\n', sep='\n')

# Print one report per model, pairing each title with that model's test predictions.
titled_predictions = (
    ('Dt', dt_pred),
    ('Knn', knn_pred),
    ('Gnb', gnb_pred),
    ('Rfc', rfc_pred),
    ('Linear', linear_pred),
    ('Rbf', rbf_pred),
    ('Poly', poly_pred),
    ('Sigmoid', sig_pred),
)
for report_title, predicted in titled_predictions:
    create_classification_report(report_title, Y_test, predicted)

Dt
              precision    recall  f1-score   support

           0       0.47      0.50      0.49       599
           1       0.42      0.41      0.41       600
           2       0.51      0.50      0.51       598

    accuracy                           0.47      1797
   macro avg       0.47      0.47      0.47      1797
weighted avg       0.47      0.47      0.47      1797



Knn
              precision    recall  f1-score   support

           0       0.73      0.09      0.16       599
           1       0.39      0.74      0.51       600
           2       0.45      0.43      0.44       598

    accuracy                           0.42      1797
   macro avg       0.52      0.42      0.37      1797
weighted avg       0.52      0.42      0.37      1797



Gnb
              precision    recall  f1-score   support

           0       0.51      0.54      0.52       599
           1       0.43      0.23      0.30       600
           2       0.47      0.67      0.55       598

    a

In [16]:
# Persist the RBF SVM together with the scaler fitted on the training data,
# each in its own pickle file.
artifacts = (
    ('Image_Processing_Model.pkl', rbf_svm),
    ('image_scaler.pkl', sc),
)
for pickle_path, fitted_object in artifacts:
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_object, f)