In [1]:
# import libraries
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from skimage import io, color
import cv2
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from keras.preprocessing.image import ImageDataGenerator
import time

# Define some functions for the data processing

In [2]:
def merge_channels(row):
    '''merge RGB color channels,
    convert to float, and divide by 255'''
    return cv2.merge((row['r_ppl'], row['g_ppl'], row['b_ppl'],row['r_xpl'], row['g_xpl'], row['b_xpl'])).astype('float')

def series_to_np(series, n_channels=6, height=150, width=150, labels=False):
    ''' Take a pandas Series consisting of np.arrays of pixel values
    and convert it to a np.array with the dimensions corresponding to
    (batch, channel, height, width).
    '''
    # get the length of the series input
    n_samples = len(series)

    # if the input series contains labeled data and not input data
    if labels==True:
        data = np.empty((n_samples, height, width))
        for i, img in enumerate(series):
            data[i, ...] = img
    else:
        # create an empty array to save time by pre-allocating the memory and
        # load each img data directly into it
        data = np.empty((n_samples, n_channels, height, width))
        for i, img in enumerate(series):
            # arrays are in the series in the shape of (height, width, channels)
            # need to transpose shape to be (channels, height, width)
            data[i, ...] = np.transpose(img, (2,0,1))
    return data
    
def center_pix(row):
    '''center pixels of each color channel by
    subtracting the mean value of each channel'''
    
    # get per-channel means and standard deviations
    means = row.mean(axis=(0,1), dtype='float')
    return row - means

def normalize_pix(row):
    '''normalize pixel ranges from 0-255 to the
    range 0-1'''
    return row / 255.0

# Load Data

In [3]:
# load in dataset
df = pd.read_pickle('../Data/training.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16312 entries, 0 to 16311
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   r_ppl       16312 non-null  object
 1   g_ppl       16312 non-null  object
 2   b_ppl       16312 non-null  object
 3   r_xpl       16312 non-null  object
 4   g_xpl       16312 non-null  object
 5   b_xpl       16312 non-null  object
 6   labels      16312 non-null  object
 7   rotation    16312 non-null  object
 8   topleft     16312 non-null  object
 9   label_freq  16312 non-null  object
dtypes: object(10)
memory usage: 1.2+ MB


# Split into training set and test set

In [4]:
# create a copy of the original df to manipulate
# shuffle dataframe to randomize order of images
X = df.sample(frac=1, random_state=2).reset_index(drop=True)

# create a column that combines pixel data
X['merged'] = df.apply(merge_channels, axis=1)

# check to make sure the shape of the new column entries is correct
# should be (height, width, 6) for the  color channels
print(f"merged observation shape: {X['merged'][0].shape}")

# create a train, validation, and test set split 80, 10, 10, respectively
train, validate, test = np.split(X, [int(.8*len(df)), int(.9*len(df))])

merged observation shape: (150, 150, 6)


successfully merged 6 color channels. Now look at the train, val, test split to make sure each dataframe seems okay

In [5]:
# 80% of data
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13049 entries, 0 to 13048
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   r_ppl       13049 non-null  object
 1   g_ppl       13049 non-null  object
 2   b_ppl       13049 non-null  object
 3   r_xpl       13049 non-null  object
 4   g_xpl       13049 non-null  object
 5   b_xpl       13049 non-null  object
 6   labels      13049 non-null  object
 7   rotation    13049 non-null  object
 8   topleft     13049 non-null  object
 9   label_freq  13049 non-null  object
 10  merged      13049 non-null  object
dtypes: object(11)
memory usage: 1.1+ MB


In [6]:
# 10% of data
validate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1631 entries, 13049 to 14679
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   r_ppl       1631 non-null   object
 1   g_ppl       1631 non-null   object
 2   b_ppl       1631 non-null   object
 3   r_xpl       1631 non-null   object
 4   g_xpl       1631 non-null   object
 5   b_xpl       1631 non-null   object
 6   labels      1631 non-null   object
 7   rotation    1631 non-null   object
 8   topleft     1631 non-null   object
 9   label_freq  1631 non-null   object
 10  merged      1631 non-null   object
dtypes: object(11)
memory usage: 140.3+ KB


In [7]:
# 10% of data
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1632 entries, 14680 to 16311
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   r_ppl       1632 non-null   object
 1   g_ppl       1632 non-null   object
 2   b_ppl       1632 non-null   object
 3   r_xpl       1632 non-null   object
 4   g_xpl       1632 non-null   object
 5   b_xpl       1632 non-null   object
 6   labels      1632 non-null   object
 7   rotation    1632 non-null   object
 8   topleft     1632 non-null   object
 9   label_freq  1632 non-null   object
 10  merged      1632 non-null   object
dtypes: object(11)
memory usage: 140.4+ KB


# Compare img processing speeds with different methods

## method 1:

In [33]:
# this method will simply divide all values in train['merged'] by 255.0
start_time = time.time()
train['merged']/255.0
print("--- %s seconds ---" % (time.time() - start_time))

--- 143.70156407356262 seconds ---


## method 2:

In [36]:
# this method uses the function, normalize_pix, and
# pandas builtin apply() method to do transformation
start_time = time.time()
train['merged'].apply(normalize_pix)
print("--- %s seconds ---" % (time.time() - start_time))

--- 118.38492798805237 seconds ---


## method 3:

In [8]:
# This method will use a np.array() instead of pandas
# dataframe to do matrix division and normalize the data.

# number of images in the dataframe
start_time = time.time()
data = series_to_np(train['merged'])
print("--- %s seconds ---" % (time.time() - start_time))

--- 68.54794192314148 seconds ---


In [10]:
# using a matrix instead of a dataframe to preform the
start_time = time.time()
data / 255.0
print("--- %s seconds ---" % (time.time() - start_time))

--- 120.52148199081421 seconds ---


* method 3 and method 2 are very similar, but I think having the data in a matrix form will make things easier long-term to work with

# Normalize data

In [5]:
start_time = time.time()
# prepare data for model fitting
train_x = series_to_np(train['merged']) 
train_x = train_x / 255.0
train_y = series_to_np(train['labels'], labels=True)
train_y = train_y.astype(int)

val_x = series_to_np(validate['merged']) 
val_x = val_x / 255.0
val_y = series_to_np(validate['labels'], labels=True)
val_y = val_y.astype(int)

test_x = series_to_np(test['merged'])
test_x = test_x / 255.0
test_y = series_to_np(test['labels'], labels=True)
test_y = test_y.astype(int)
print("--- %s seconds ---" % (time.time() - start_time))

--- 199.09456706047058 seconds ---


# testing array manipulation methods

In [78]:
#batch, channels, height, width
p = np.transpose(val_x, (0,2,3,1)).reshape((-1,6))

In [87]:
p[22499]

array([0.42745098, 0.43137255, 0.44705882, 0.16862745, 0.16470588,
       0.07843137])

In [86]:
val_x[0,0:6,149,149]

array([0.42745098, 0.43137255, 0.44705882, 0.16862745, 0.16470588,
       0.07843137])

In [83]:
val_y.reshape((-1,1))[5000]

array([3])

# Gradient Boosting

In [6]:
new_train_x = np.transpose(train_x, (0,2,3,1)).reshape((-1,6))
new_train_y = train_y.ravel()

In [7]:
# get score
new_val_x = np.transpose(val_x, (0,2,3,1)).reshape((-1,6))
new_val_y = val_y.ravel()

In [8]:
new_test_x = np.transpose(test_x, (0,2,3,1)).reshape((-1,6))
new_test_y = test_y.ravel()

In [10]:
# initiate gradient boosting classifier model
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, random_state=0)
# new_test_y is a 1D array, not a column vector
clf.fit(new_test_x, new_test_y)

KeyboardInterrupt: 

# CatBoost

In [16]:
# categorical features
cat_features = list(range(0, 7))
train_n = len(train_x)
val_n = len(val_x)

clf = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
    loss_function='CrossEntropy'
)

clf.fit(train_x.reshape((train_n, -1)), train_y.reshape((train_n, -1)),
    cat_features=cat_features,
    eval_set=(val_x.reshape((val_n, -1)), val_y.reshape((val_n, -1))),
    verbose=False)

print('CatBoost model is fitted: ' + str(clf.is_fitted()))
print('CatBoost model paremeters:')
print(clf.get_params())

CatBoostError: 'data' is numpy array of floating point numerical type, it means no categorical features, but 'cat_features' parameter specifies nonzero number of categorical features