In [1]:
import os
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import xgboost as xgb
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import Binarizer, MultiLabelBinarizer
from sklearn.metrics import classification_report
from PIL import Image

# Seed Python's and NumPy's global RNGs.
# random_seed is reused further down for the train/validation split and for XGBoost.
random_seed = 0
random.seed(random_seed)
np.random.seed(random_seed)

### Import dataset

In [2]:
# Dataset layout: image folders and label CSVs all live under DATASET_PATH.
DATASET_PATH = 'dataset-haze-removed/'
TRAIN_PATH = DATASET_PATH + 'train_file'
TEST_PATH = DATASET_PATH + 'test_file'

TRAIN_CSV_PATH = DATASET_PATH + 'train_label.csv'
TEST_CSV_PATH = DATASET_PATH + 'test_label.csv'

df_train = pd.read_csv(TRAIN_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)

# Image names are forced to str so they can be concatenated onto a folder path.
df_train['image_name'] = df_train['image_name'].astype(str)
df_test['image_name'] = df_test['image_name'].astype(str)

# 'tags' arrives as one space-separated string per image; turn it into a list of tags.
df_train['tags'] = df_train['tags'].apply(lambda x: x.split(' '))
df_test['tags'] = df_test['tags'].apply(lambda x: x.split(' '))

# Filenames as an (N, 1) column. ndarray.reshape returns a new array, so the
# result has to be assigned (the original call discarded it and was a no-op).
X_train_files = np.array(df_train['image_name'].tolist()).reshape(-1, 1)

# Ground-truth tag lists, stored as object arrays with one list per image.
y_train = np.array(df_train['tags'].tolist(), dtype=object) # train image tags (ground truth)
y_test = np.array(df_test['tags'].tolist(), dtype=object) # test image tags (ground truth)

In [3]:
# Sanity check: one entry (a list of tags) per row of the training label CSV.
y_train.shape

(32383,)

### XGBoost 

In [4]:
def calc_acc(y_test, y_pred, labels, threshold=0.2):
    """Score thresholded per-class predictions against ground-truth tag lists.

    Despite the name, the value returned is a weighted F-beta score with
    beta=2, not an accuracy.

    Parameters
    ----------
    y_test : iterable of iterables
        Ground-truth tags for each sample.
    y_pred : 2-D array-like
        Per-class scores; column j is matched with ``labels[j]``.
    labels : sequence
        Class names, which also fix the column order of the encoded matrices.
    threshold : float, default 0.2
        Cut-off handed to ``Binarizer``; scores above it count as predicted.

    Returns
    -------
    float
        ``fbeta_score(..., beta=2, average='weighted')`` over all classes.
    """
    class_names = np.array(labels)

    # Threshold the scores, then convert the 0/1 matrix to a boolean mask so
    # that each row can pick out its predicted class names by boolean indexing.
    predicted_mask = Binarizer(threshold=threshold).fit_transform(y_pred) > 0
    predicted_tags = [class_names[row_mask] for row_mask in predicted_mask]

    # Encode truth and predictions with the same, explicitly ordered classes.
    encoder = MultiLabelBinarizer(classes=class_names)
    truth_matrix = encoder.fit_transform(y_test)
    predicted_matrix = encoder.transform(predicted_tags)

    return fbeta_score(truth_matrix, predicted_matrix, beta=2, average='weighted')

# Function to extract the image features
def extract_features(df, data_path):
    """Append per-channel colour statistics for every image listed in ``df``.

    For each image the first three channels (treated as R, G, B) are
    flattened and summarised by mean, std, max, min, kurtosis and skewness.
    The cross-channel averages ``rgb_mean_{mean,std,max,min}`` and the
    per-channel ``*_range`` (max - min) are added as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an ``image_name`` column. The frame is copied, not modified.
    data_path : str
        Prefix concatenated directly in front of each image name, so it must
        already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the feature columns appended in a fixed order.

    Notes
    -----
    ``rgb_mean_max`` used to be computed as ``(r_max + r_max + b_max) / 3``,
    which ignored the green channel. It now averages all three channels.
    Feature pickles written before this fix still hold the old values and
    need to be regenerated to pick up the change.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` has been loaded.
    from scipy import stats

    channels = ('r', 'g', 'b')
    stat_funcs = {
        'mean': np.mean,
        'std': np.std,
        'max': np.max,
        'min': np.min,
        'kurtosis': stats.kurtosis,
        'skewness': stats.skew,
    }

    im_features = df.copy()
    image_names = im_features.image_name.values
    N = len(image_names)

    # values[stat][channel] is a float array with one slot per image.
    values = {stat: {ch: np.zeros(N) for ch in channels} for stat in stat_funcs}

    for i, image_name in enumerate(tqdm(image_names, miniters=1000)):
        # The context manager releases the file handle; np.array copies the
        # pixel data out before the image is closed.
        with Image.open(data_path + image_name) as im:
            pixels = np.array(im)[:, :, :3]  # drop any channel beyond the third

        for ch_idx, ch in enumerate(channels):
            flat = pixels[:, :, ch_idx].ravel()
            for stat, func in stat_funcs.items():
                values[stat][ch][i] = func(flat)

    # Column order is kept identical to the original implementation so that
    # feature matrices built from this frame keep the same layout.
    for stat in ('mean', 'std', 'max', 'min'):
        for ch in channels:
            im_features['{}_{}'.format(ch, stat)] = values[stat][ch]
        im_features['rgb_mean_{}'.format(stat)] = (
            values[stat]['r'] + values[stat]['g'] + values[stat]['b']) / 3.0

    for ch in channels:
        im_features['{}_range'.format(ch)] = values['max'][ch] - values['min'][ch]

    for stat in ('kurtosis', 'skewness'):
        for ch in channels:
            im_features['{}_{}'.format(ch, stat)] = values[stat][ch]

    return im_features

In [5]:
# NOTE: one-off feature extraction, left disabled. It writes the two pickle
# files that the following cell loads; uncomment and run once to (re)generate them.
# # Extract features
# print('Extracting train features')
# train_features = extract_features(df_train, TRAIN_PATH + '/')
# print('Extracting test features')
# test_features = extract_features(df_test, TEST_PATH + '/')

# # Pickle the data
# import pickle

# train_feat = open('train_features_xgb.pickle', 'wb')
# pickle.dump(train_features, train_feat, protocol=4)
# train_feat.close()

# test_feat = open('test_features_xgb.pickle', 'wb')
# pickle.dump(test_features, test_feat, protocol=4)
# test_feat.close()

In [6]:
# Load the pickled features:
import pickle

# NOTE(security): pickle.load can execute arbitrary code. Only load feature
# files that were produced locally by the extraction cell.
# The `with` blocks close each file handle; previously both were left open.
with open('train_features_xgb.pickle', 'rb') as train_pickle:
    train_features = pickle.load(train_pickle)

with open('test_features_xgb.pickle', 'rb') as test_pickle:
    test_features = pickle.load(test_pickle)

In [7]:
# Prepare dataset for xgboost
# Feature matrices: every column except the identifier and the label column.
X_train_xgb = np.array(train_features.drop(['image_name', 'tags'], axis=1))
X_test_xgb = np.array(test_features.drop(['image_name', 'tags'], axis=1))

### Create a validation set by dividing train set...
# Rebuild the full label array from df_train instead of reading y_train.
# This cell rebinds y_train to the 80% split, so re-running it used to pass an
# already-shortened y_train together with the full-length X and fail on the
# length mismatch. Splitting from a fresh array makes the cell idempotent.
y_train_all = np.array(df_train['tags'].tolist(), dtype=object)
X_train_xgb, X_val_xgb, y_train, y_val = train_test_split(
    X_train_xgb, y_train_all, test_size=0.2, random_state=random_seed)

# Multi-hot encode the training tags; the fitted class order defines the
# column order used for every per-class model and prediction matrix.
mlb = MultiLabelBinarizer()
y_train_xgb = mlb.fit_transform(y_train)
labels = mlb.classes_
print('classes:', labels)

n_classes = len(labels) # 17 classes in total
print('n_classes =', n_classes)


classes: ['agriculture' 'artisinal_mine' 'bare_ground' 'blooming' 'blow_down'
 'clear' 'cloudy' 'conventional_mine' 'cultivation' 'habitation' 'haze'
 'partly_cloudy' 'primary' 'road' 'selective_logging' 'slash_burn' 'water']
n_classes = 17


In [8]:
def run(MAX_DEPTH, N_ESTIMATORS):
    """Grid-search ``max_depth`` x ``n_estimators`` with one binary XGBoost model per class.

    For every combination, a separate ``XGBClassifier`` is fitted for each of
    the ``n_classes`` label columns of ``y_train_xgb``. The positive-class
    probabilities on the validation and test features are then scored with
    ``calc_acc``.

    Reads the notebook-level names ``X_train_xgb``, ``X_val_xgb``,
    ``X_test_xgb``, ``y_train_xgb``, ``y_val``, ``y_test``, ``labels``,
    ``n_classes`` and ``random_seed``.

    Parameters
    ----------
    MAX_DEPTH : iterable of int
        Tree depths to try (outer loop).
    N_ESTIMATORS : iterable of int
        Numbers of boosting rounds to try (inner loop).

    Returns
    -------
    list of dict
        One dict per combination with keys ``max_depth``, ``n_estimators``,
        ``val_f2_score`` and ``test_f2_score``.
    """
    # Settings shared by every model; only depth and tree count vary.
    fixed_params = dict(
        learning_rate=0.1, objective='binary:logistic', n_jobs=-1,
        gamma=0, min_child_weight=1, max_delta_step=0,
        subsample=1, colsample_bytree=1, colsample_bylevel=1,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        base_score=0.5, seed=random_seed,
        use_label_encoder=False, eval_metric='error',
    )
    n_val = X_val_xgb.shape[0]
    n_test = X_test_xgb.shape[0]

    results = []
    for max_depth in MAX_DEPTH:
        print('max_depth =', max_depth)
        for n_estimators in N_ESTIMATORS:
            print('n_estimators =', n_estimators)

            # One probability column per class.
            y_val_pred = np.zeros((n_val, n_classes))
            y_test_pred = np.zeros((n_test, n_classes))

            for class_idx in tqdm(range(n_classes), miniters=1, leave=False):
                model = xgb.XGBClassifier(
                    max_depth=max_depth, n_estimators=n_estimators, **fixed_params)
                model.fit(X_train_xgb, y_train_xgb[:, class_idx])

                # Column 1 of predict_proba is kept as the score for this class.
                y_val_pred[:, class_idx] = model.predict_proba(X_val_xgb)[:, 1]    # validation set
                y_test_pred[:, class_idx] = model.predict_proba(X_test_xgb)[:, 1]  # test set

            val_score = calc_acc(y_val, y_val_pred, labels)
            test_score = calc_acc(y_test, y_test_pred, labels)
            print("val_f2_score = {} test_f2_score = {}".format(round(val_score, 3), round(test_score, 3)))

            results.append({
                'max_depth': max_depth,
                'n_estimators': n_estimators,
                'val_f2_score': val_score,
                'test_f2_score': test_score,
            })

    return results

In [9]:
# Hyperparameter grid
# run() evaluates every (max_depth, n_estimators) combination of these two lists.
MAX_DEPTH = [2, 5, 10]
N_ESTIMATORS = [100, 200, 500]

# FOR CODE TEST ONLY
# Smaller grid for a quick smoke test; uncomment to override the lists above.
# MAX_DEPTH = [5, 10]
# N_ESTIMATORS = [100]

In [10]:
# Run the grid search; `results` holds one dict of scores per hyperparameter combination.
results = run(MAX_DEPTH, N_ESTIMATORS)

  0%|          | 0/17 [00:00<?, ?it/s]

max_depth = 2
n_estimators = 100




val_f2_score = 0.851 test_f2_score = 0.852
n_estimators = 200




val_f2_score = 0.859 test_f2_score = 0.86
n_estimators = 500




val_f2_score = 0.866 test_f2_score = 0.867
max_depth = 5
n_estimators = 100




val_f2_score = 0.868 test_f2_score = 0.868
n_estimators = 200




val_f2_score = 0.869 test_f2_score = 0.87
n_estimators = 500




val_f2_score = 0.866 test_f2_score = 0.869
max_depth = 10
n_estimators = 100




val_f2_score = 0.864 test_f2_score = 0.866
n_estimators = 200




val_f2_score = 0.86 test_f2_score = 0.86
n_estimators = 500


                                               

val_f2_score = 0.852 test_f2_score = 0.852




In [14]:
# Tabulate the search results and display them with the best validation F2 first.
# sort_values returns a sorted copy for display; df_results itself keeps the run order.
df_results = pd.DataFrame(results)
df_results.sort_values('val_f2_score', ascending=False)

Unnamed: 0,max_depth,n_estimators,val_f2_score,test_f2_score
4,5,200,0.868604,0.870258
3,5,100,0.867693,0.867988
5,5,500,0.86564,0.86887
2,2,500,0.865577,0.866762
6,10,100,0.864125,0.865684
7,10,200,0.860126,0.860457
1,2,200,0.859342,0.859791
8,10,500,0.851707,0.852182
0,2,100,0.850952,0.852284


In [16]:
# Persist the results table (in run order, not sorted) without the index column.
df_results.to_csv('hyperparameter_search_XGBoost.csv', index=False)