In [1]:
import os
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import xgboost as xgb
import scipy
from sklearn.metrics import fbeta_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from PIL import Image

In [2]:
# Set the seed 
random_seed = 1
random.seed(random_seed)
np.random.seed(random_seed)

In [3]:
ROOT_PATH = 'dataset-haze-removed'
TRAIN_PATH = ROOT_PATH + '/' + 'train_file' + '/'
TEST_PATH = ROOT_PATH + '/' + 'test_file' + '/'

TRAIN_CSV_PATH = TRAIN_PATH + 'train_label.csv'
TEST_CSV_PATH = TEST_PATH + 'test_label.csv'
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_test = pd.read_csv(TEST_CSV_PATH)

In [4]:
# Function to extract the image features
def extract_features(df, data_path):
    
    im_features = df.copy()

    N = len(im_features.image_name.values)

    r_mean = np.zeros(N)
    g_mean = np.zeros(N)
    b_mean = np.zeros(N)

    r_std = np.zeros(N)
    g_std = np.zeros(N)
    b_std = np.zeros(N)

    r_max = np.zeros(N)
    g_max = np.zeros(N)
    b_max = np.zeros(N)

    r_min = np.zeros(N)
    g_min = np.zeros(N)
    b_min = np.zeros(N)

    r_kurtosis = np.zeros(N)
    g_kurtosis = np.zeros(N)
    b_kurtosis = np.zeros(N)
    
    r_skewness = np.zeros(N)
    g_skewness = np.zeros(N)
    b_skewness = np.zeros(N)

    for i, image_name in enumerate(tqdm(im_features.image_name.values, miniters=1000)): 
        im = Image.open(data_path + image_name)
        im = np.array(im)[:,:,:3]

        r = im[:,:,0].ravel()
        g = im[:,:,1].ravel()
        b = im[:,:,2].ravel()
        
        r_mean[i] = np.mean(r)
        g_mean[i] = np.mean(g)
        b_mean[i] = np.mean(b)

        r_std[i] = np.std(r)
        g_std[i] = np.std(g)
        b_std[i] = np.std(b)

        r_max[i] = np.max(r)
        g_max[i] = np.max(g)
        b_max[i] = np.max(b)

        r_min[i] = np.min(r)
        g_min[i] = np.min(g)
        b_min[i] = np.min(b)

        r_kurtosis[i] = scipy.stats.kurtosis(r)
        g_kurtosis[i] = scipy.stats.kurtosis(g)
        b_kurtosis[i] = scipy.stats.kurtosis(b)
        
        r_skewness[i] = scipy.stats.skew(r)
        g_skewness[i] = scipy.stats.skew(g)
        b_skewness[i] = scipy.stats.skew(b)


    im_features['r_mean'] = r_mean
    im_features['g_mean'] = g_mean
    im_features['b_mean'] = b_mean

    im_features['rgb_mean_mean'] = (r_mean + g_mean + b_mean)/3.0

    im_features['r_std'] = r_std
    im_features['g_std'] = g_std
    im_features['b_std'] = b_std

    im_features['rgb_mean_std'] = (r_std + g_std + b_std)/3.0

    im_features['r_max'] = r_max
    im_features['g_max'] = g_max
    im_features['b_max'] = b_max

    im_features['rgb_mean_max'] = (r_max + r_max + b_max)/3.0

    im_features['r_min'] = r_min
    im_features['g_min'] = g_min
    im_features['b_min'] = b_min

    im_features['rgb_mean_min'] = (r_min + g_min + b_min)/3.0

    im_features['r_range'] = r_max - r_min
    im_features['g_range'] = g_max - g_min
    im_features['b_range'] = b_max - b_min

    im_features['r_kurtosis'] = r_kurtosis
    im_features['g_kurtosis'] = g_kurtosis
    im_features['b_kurtosis'] = b_kurtosis
    
    im_features['r_skewness'] = r_skewness
    im_features['g_skewness'] = g_skewness
    im_features['b_skewness'] = b_skewness
    
    return im_features

In [5]:
# Extract features
print('Extracting train features')
train_features = extract_features(df_train, TRAIN_PATH)
print('Extracting test features')
test_features = extract_features(df_test, TEST_PATH)

  0%|          | 0/32383 [00:00<?, ?it/s]

Extracting train features


100%|██████████| 32383/32383 [04:11<00:00, 128.66it/s]
  0%|          | 0/8096 [00:00<?, ?it/s]

Extracting test features


100%|██████████| 8096/8096 [01:02<00:00, 128.97it/s]


In [6]:
# Pickle the data
import pickle

train_feat = open('train_features_xgb.pickle', 'wb')
pickle.dump(train_features, train_feat, protocol=4)
train_feat.close()

test_feat = open('test_features_xgb.pickle', 'wb')
pickle.dump(test_features, test_feat, protocol=4)
test_feat.close()

In [7]:
# Load the pickled features:
import pickle

pickle_XGtrain = open('train_features_xgb.pickle', 'rb')
train_features = pickle.load(pickle_XGtrain)

pickle_yGtrain = open('test_features_xgb.pickle', 'rb')
test_features = pickle.load(pickle_yGtrain)

In [8]:
train_features

Unnamed: 0,image_name,tags,r_mean,g_mean,b_mean,rgb_mean_mean,r_std,g_std,b_std,rgb_mean_std,...,rgb_mean_min,r_range,g_range,b_range,r_kurtosis,g_kurtosis,b_kurtosis,r_skewness,g_skewness,b_skewness
0,train_3577.jpg,haze primary,33.536392,65.431900,62.715378,53.894557,13.784964,11.300903,10.461465,11.849111,...,17.000000,232.0,191.0,184.0,13.050226,9.217388,7.321115,2.087852,1.547085,1.122272
1,train_10327.jpg,clear primary,11.404053,43.711578,17.701981,24.272537,6.197222,5.888620,4.929450,5.671764,...,8.333333,52.0,50.0,43.0,1.327724,0.314270,0.449883,0.921156,0.416528,0.324070
2,train_1243.jpg,clear primary water,24.042618,38.042938,13.600906,25.228821,15.389291,12.501424,7.648305,11.846340,...,4.000000,165.0,130.0,100.0,4.774659,2.882907,5.042152,1.489449,1.148245,1.223912
3,train_17066.jpg,clear primary,18.135345,44.320572,40.276062,34.243993,9.198519,8.602356,7.036161,8.279012,...,11.000000,81.0,86.0,71.0,1.052266,0.494463,0.530144,0.608870,0.297543,0.334936
4,train_15959.jpg,clear primary,16.630310,32.600525,16.318314,21.849716,9.145231,9.100885,7.284127,8.510081,...,2.666667,68.0,67.0,53.0,-0.095879,-0.217142,-0.185950,0.407414,0.245804,0.199647
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32378,train_7813.jpg,agriculture clear primary,15.477585,30.640793,13.050232,19.722870,13.642089,9.437640,7.496165,10.191964,...,3.333333,187.0,144.0,113.0,31.838679,16.472478,12.185731,4.563234,2.953253,2.238615
32379,train_32511.jpg,clear primary,24.251404,33.039978,15.311920,24.201101,9.635826,9.055283,7.181655,8.624255,...,1.000000,88.0,93.0,57.0,0.868034,0.713837,0.382141,0.539241,0.451534,0.464247
32380,train_5192.jpg,partly_cloudy primary,17.912979,38.986435,34.105347,30.334920,16.490701,16.399915,14.978230,15.956282,...,7.333333,138.0,145.0,134.0,9.176509,9.294900,9.163632,2.710161,2.721485,2.686671
32381,train_12172.jpg,agriculture clear cultivation habitation prima...,19.452423,43.950317,22.351227,28.584656,16.530786,11.430518,9.470354,12.477220,...,6.000000,255.0,237.0,242.0,33.187836,60.916271,96.569428,4.326013,5.017092,6.112610


In [9]:
print(train_features.shape)
print(test_features.shape)

(32383, 27)
(8096, 27)


In [11]:
# Prepare train data
X_train = np.array(train_features.drop(['image_name', 'tags'], axis=1))
y_train = []

flatten = lambda l: [item for sublist in l for item in sublist]
labels = np.array(list(set(flatten([l.split(' ') for l in train_features['tags'].values]))))

label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}

In [12]:
label_map

{'cloudy': 0,
 'agriculture': 1,
 'cultivation': 2,
 'slash_burn': 3,
 'partly_cloudy': 4,
 'blow_down': 5,
 'selective_logging': 6,
 'bare_ground': 7,
 'water': 8,
 'haze': 9,
 'artisinal_mine': 10,
 'conventional_mine': 11,
 'road': 12,
 'habitation': 13,
 'blooming': 14,
 'clear': 15,
 'primary': 16}

In [13]:
df_train.tags.values[0]

'haze primary'

In [14]:
# Create a bag of words for train
for tags in tqdm(df_train.tags.values, miniters=1000):
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    y_train.append(targets)
    
y_train = np.array(y_train, np.uint8) # float to int

100%|██████████| 32383/32383 [00:00<00:00, 327250.04it/s]


In [15]:
n_classes = y_train.shape[1] # 17 classes in total

#Create X_test dataset
X_test = np.array(test_features.drop(['image_name', 'tags'], axis=1))


In [16]:
# Train and predict with one-vs-all strategy
train_pred = np.zeros((X_train.shape[0], n_classes)) # (num_of_train_images, n_classes=17)
y_pred = np.zeros((X_test.shape[0], n_classes)) # (num_of_test_images, n_classes=17)

In [18]:
for class_i in tqdm(range(n_classes), miniters=1): 
    print("class_i =", class_i, inv_label_map[class_i])
    model = xgb.XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=250, \
                              objective='binary:logistic', n_jobs=-1, \
                              gamma=0, min_child_weight=1, max_delta_step=0, \
                              subsample=1, colsample_bytree=1, colsample_bylevel=1, \
                              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, \
                              base_score=0.5, seed=random_seed, \
                              use_label_encoder=False, eval_metric='error', \
                             )
    
    model.fit(X_train, y_train[:, class_i])
    train_pred[:, class_i] = model.predict_proba(X_train)[:,1] # prediction on train set
    y_pred[:, class_i] = model.predict_proba(X_test)[:,1] # prediction on test set
    

  "memory consumption")


class_i = 0 cloudy


  6%|▌         | 1/17 [00:09<02:35,  9.74s/it]

class_i = 1 agriculture


 12%|█▏        | 2/17 [00:19<02:25,  9.68s/it]

class_i = 2 cultivation


 18%|█▊        | 3/17 [00:29<02:15,  9.66s/it]

class_i = 3 slash_burn


 24%|██▎       | 4/17 [00:37<01:58,  9.10s/it]

class_i = 4 partly_cloudy


 29%|██▉       | 5/17 [00:46<01:51,  9.28s/it]

class_i = 5 blow_down


 35%|███▌      | 6/17 [00:54<01:35,  8.73s/it]

class_i = 6 selective_logging


 41%|████      | 7/17 [01:03<01:26,  8.66s/it]

class_i = 7 bare_ground


 47%|████▋     | 8/17 [01:12<01:19,  8.84s/it]

class_i = 8 water


 53%|█████▎    | 9/17 [01:22<01:13,  9.13s/it]

class_i = 9 haze


 59%|█████▉    | 10/17 [01:31<01:05,  9.30s/it]

class_i = 10 artisinal_mine


 65%|██████▍   | 11/17 [01:39<00:52,  8.78s/it]

class_i = 11 conventional_mine


 71%|███████   | 12/17 [01:46<00:41,  8.35s/it]

class_i = 12 road


 76%|███████▋  | 13/17 [01:56<00:34,  8.70s/it]

class_i = 13 habitation


 82%|████████▏ | 14/17 [02:05<00:26,  8.93s/it]

class_i = 14 blooming


 88%|████████▊ | 15/17 [02:14<00:17,  8.80s/it]

class_i = 15 clear


 94%|█████████▍| 16/17 [02:23<00:09,  9.01s/it]

class_i = 16 primary


100%|██████████| 17/17 [02:32<00:00,  8.99s/it]


In [19]:
print(y_pred.shape)
print(y_pred[0])

(8096, 17)
[4.57968672e-05 2.57156999e-03 4.10070468e-04 4.17873434e-06
 8.73481767e-05 8.61473236e-05 3.36833145e-05 5.27518161e-04
 1.58886295e-02 1.62929704e-04 3.71965513e-07 4.47586353e-06
 1.42336788e-03 4.50305088e-05 1.68495153e-05 9.99656677e-01
 9.99892950e-01]


In [20]:
def calc_acc(y_pred, df_test, labels=labels, threshold=0.2):
    
    result = np.array(y_pred)
    result = pd.DataFrame(result, columns=labels)
    preds = []
    for i in range(result.shape[0]):
        a = result.iloc[[i]]
        a = a.apply(lambda x: x > threshold, axis=1)
        a = a.transpose()
        a = a.loc[a[i] == True]
        preds.append(' '.join(list(a.index)))
        
    df_test['pred_tags'] = preds
    test = df_test['tags'].apply(lambda x: x.split(' '))
    pred = df_test['pred_tags'].apply(lambda x: x.split(' '))
    
    mlb = MultiLabelBinarizer()
    test = pd.DataFrame(mlb.fit_transform(test), columns=mlb.classes_)
    pred = pd.DataFrame(mlb.transform(pred), columns=mlb.classes_)
    score = fbeta_score(test, pred, beta=2, average='weighted')
    
    return score

In [21]:
train_score = calc_acc(train_pred, df_train)
print(train_score)

0.931687534915448


In [22]:
test_score = calc_acc(y_pred, df_test)
print(test_score)

0.8721141837284199


In [23]:
print(classification_report(test, pred, target_names=labels))

NameError: name 'test' is not defined