In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import xgboost as xgb
from sklearn.metrics import accuracy_score
from joblib import dump

In [2]:
# Read the solutions table.
# Only the first 12000 records are used due to memory restrictions, so stop
# parsing after that many rows instead of loading the whole file and then
# discarding the remainder.
solutions = pd.read_csv("data/solutions.csv", nrows=12000)
solutions.shape

(12000, 3)

In [3]:
# image opener and flattener
def getImage(imgID):
    """Load one image from disk and return its pixel data as a flat 1-D array.

    Parameters
    ----------
    imgID
        Identifier of the image; it is converted with ``str()`` and the file
        read is ``data/images_modified/<imgID>.jpg``.

    Returns
    -------
    numpy.ndarray
        The decoded pixel array, flattened to one dimension.
    """
    # The context manager releases the underlying file handle deterministically
    # once the pixels have been copied into the numpy array.
    with Image.open("data/images_modified/" + str(imgID) + ".jpg") as img:
        return np.array(img).flatten()

# train / test = 80% / 20%
def _loadImages(imgIDs):
    """Stack the flattened images for ``imgIDs`` into one float32 matrix divided by 255.

    float32 halves the memory footprint compared with the float64 array that
    ``uint8_array / 255`` would produce, and the division is done in place so
    no second full-size temporary is allocated.
    NOTE(review): XGBoost is documented to store features as 32-bit floats, so
    no precision should be lost for training -- confirm for other consumers.
    """
    pixels = np.array([getImage(imgID) for imgID in imgIDs], dtype=np.float32)
    pixels /= 255
    return pixels


# First 80% of the rows (in file order) are the training split, the rest is
# the test split. Integer arithmetic avoids floating-point rounding.
n_train = len(solutions) * 4 // 5
train_rows = solutions.iloc[:n_train]
test_rows = solutions.iloc[n_train:]

X_train = _loadImages(train_rows["GalaxyID"])
X_test = _loadImages(test_rows["GalaxyID"])
# Copy so the label frames do not keep the full `solutions` frame alive.
y_train = train_rows.copy()
y_test = test_rows.copy()

# free memory
del solutions, train_rows, test_rows

print('X_train:', X_train.shape)
print('X_test:', X_test.shape)
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

X_train: (9600, 49152)
X_test: (2400, 49152)
y_train: (9600, 3)
y_test: (2400, 3)


In [4]:
# Keep only the "Smooth" column as the target; the list selector keeps each
# result a one-column DataFrame rather than a Series.
y_train = y_train[["Smooth"]]
y_test = y_test[["Smooth"]]

# Report the shape of every split, one per line.
print(
    *(
        f"{label} {data.shape}"
        for label, data in (
            ('X_train:', X_train),
            ('X_test:', X_test),
            ('y_train:', y_train),
            ('y_test:', y_test),
        )
    ),
    sep="\n",
)

X_train: (9600, 49152)
X_test: (2400, 49152)
y_train: (9600, 1)
y_test: (2400, 1)


In [None]:
# create the gradient boosting model using xgboost
model = xgb.XGBClassifier(
    # `silent` is deprecated in XGBoost; `verbosity=0` is the documented
    # replacement for suppressing log messages.
    verbosity=0,
    n_jobs=-1,
    max_depth=5,
    learning_rate=0.05,
    n_estimators=1000,
    objective='binary:logistic',
    min_child_weight=3,
    subsample=0.7,
    colsample_bytree=0.9,
    reg_alpha=1,
)

# train the model; the one-column label frame is flattened to a 1-D array
model.fit(X_train, y_train.values.ravel())

# predict on the held-out test split
y_pred = model.predict(X_test)

In [None]:
# XGBClassifier.predict already returns class labels, so no rounding is
# needed and y_pred is left untouched (the cell can be re-run safely).
# y_test is flattened to 1-D to match how the labels were passed to fit().
accuracy_score(y_test.values.ravel(), y_pred)

In [None]:
# Persist the fitted classifier with joblib to a file named 'xgboost'
# in the current working directory.
dump(value=model, filename='xgboost')