In [1]:
import pandas as pd
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from skimage import io, color
import cv2
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator

# Load Data

In [2]:
# load in dataset
df = pd.read_pickle('../Data/training.pkl')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21576 entries, 0 to 21575
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   r_ppl     21576 non-null  object
 1   g_ppl     21576 non-null  object
 2   b_ppl     21576 non-null  object
 3   r_xpl     21576 non-null  object
 4   g_xpl     21576 non-null  object
 5   b_xpl     21576 non-null  object
 6   labels    21576 non-null  object
 7   rotation  21576 non-null  object
 8   topleft   21576 non-null  object
dtypes: object(9)
memory usage: 1.5+ MB


# Split into training set and test set

In [3]:
def merge_channels(row):
    '''merge RGB color channels,
    convert to float, and divide by 255'''
    return cv2.merge((row['r_ppl'], row['g_ppl'], row['b_ppl'])).astype('float')

# create a copy of the original df to manipulate
# shuffle dataframe to randomize order of images
X = df.sample(frac=1, random_state=2).reset_index(drop=True)

# create a column that combines pixel data
X['img'] = df.apply(merge_channels, axis=1)

# create a train, validation, and test set split 80, 10, 10, respectively
train, validate, test = np.split(X, [int(.8*len(df)), int(.9*len(df))])

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17260 entries, 0 to 17259
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   r_ppl     17260 non-null  object
 1   g_ppl     17260 non-null  object
 2   b_ppl     17260 non-null  object
 3   r_xpl     17260 non-null  object
 4   g_xpl     17260 non-null  object
 5   b_xpl     17260 non-null  object
 6   labels    17260 non-null  object
 7   rotation  17260 non-null  object
 8   topleft   17260 non-null  object
 9   img       17260 non-null  object
dtypes: object(10)
memory usage: 1.3+ MB


In [5]:
validate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2158 entries, 17260 to 19417
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   r_ppl     2158 non-null   object
 1   g_ppl     2158 non-null   object
 2   b_ppl     2158 non-null   object
 3   r_xpl     2158 non-null   object
 4   g_xpl     2158 non-null   object
 5   b_xpl     2158 non-null   object
 6   labels    2158 non-null   object
 7   rotation  2158 non-null   object
 8   topleft   2158 non-null   object
 9   img       2158 non-null   object
dtypes: object(10)
memory usage: 168.7+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2158 entries, 19418 to 21575
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   r_ppl     2158 non-null   object
 1   g_ppl     2158 non-null   object
 2   b_ppl     2158 non-null   object
 3   r_xpl     2158 non-null   object
 4   g_xpl     2158 non-null   object
 5   b_xpl     2158 non-null   object
 6   labels    2158 non-null   object
 7   rotation  2158 non-null   object
 8   topleft   2158 non-null   object
 9   img       2158 non-null   object
dtypes: object(10)
memory usage: 168.7+ KB


# Pre-process Data

In [7]:
def center_pix(row):
    '''center pixels of each color channel by
    subtracting the mean value of each channel'''
    
    # get per-channel means and standard deviations
    means = row.mean(axis=(0,1), dtype='float')
    return row - means

def normalize_pix(row):
    '''normalize pixel ranges from 0-255 to the
    range 0-1'''
    return row / 255.0

In [8]:
# normalize train, val, and test sets by dividing by 255.0
train['img'] = train['img'].apply(normalize_pix)
validate['img'] = validate['img'].apply(normalize_pix)
test['img'] = test['img'].apply(normalize_pix)

# still should center data

In [9]:
# assign train, val, test sets
X_train = train['img']
y_train = train['labels']

X_val = validate['img']
y_val = validate['labels']

X_test = test['img']
y_test = test['labels']

In [38]:
# X_train is 17260 rows of 200x200 values -- reshaped

19418    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
19419    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
19420    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
19421    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
19422    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
                               ...                        
21571    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
21572    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
21573    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
21574    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
21575    [[[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0,...
Name: img, Length: 2158, dtype: object

In [21]:
X_test.to_numpy().reshape(len(X_test),1).shape


(2158, 1)

# Create model

In [25]:
# categorical features
cat_features = list(range(0, 7))

clf = CatBoostClassifier(
    iterations=5,
    learning_rate=0.1,
    loss_function='CrossEntropy'
)

clf.fit(X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    verbose=False)

print('CatBoost model is fitted: ' + str(clf.is_fitted()))
print('CatBoost model paremeters:')
print(clf.get_params())

ValueError: too many values to unpack (expected 2)