In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.metrics import fbeta_score, accuracy_score

In [2]:
inv_label_map = ['blow_down',
 'bare_ground',
 'conventional_mine',
 'blooming',
 'cultivation',
 'artisinal_mine',
 'haze',
 'primary',
 'slash_burn',
 'habitation',
 'clear',
 'road',
 'selective_logging',
 'partly_cloudy',
 'agriculture',
 'water',
 'cloudy']

label_map = {'agriculture': 14,
 'artisinal_mine': 5,
 'bare_ground': 1,
 'blooming': 3,
 'blow_down': 0,
 'clear': 10,
 'cloudy': 16,
 'conventional_mine': 2,
 'cultivation': 4,
 'habitation': 9,
 'haze': 6,
 'partly_cloudy': 13,
 'primary': 7,
 'road': 11,
 'selective_logging': 12,
 'slash_burn': 8,
 'water': 15}

stage_1 = [
    'collection',
    'resnet50',
    'vgg16',
    'vgg16_scratch',
    'xception',
    'xception_scratch'
]

In [3]:
def find_f2_thresholds(y, pred):
    def test_thres(x):
        pred2 = np.zeros_like(pred)
        for i in range(17):
            pred2[:, i] = (pred[:, i] > x[i]).astype(np.int)
        score = fbeta_score(y, pred2, beta=2, average='samples')
        return score
        
    threshold = [0.2]*17
    for col in range(17):
        best_thres = 0
        best_score = 0
        for thres in tqdm(np.arange(0,1,0.005)):
            threshold[col] = thres
            score = test_thres(threshold)
            if score > best_score:
                best_thres = thres
                best_score = score
        threshold[col] = best_thres
        print(col, best_thres, best_score)
    
    return threshold

# 1. Generate Average Data

In [4]:
train = []
for folder in stage_1:
    train.append(pd.read_csv('../../Layer_1/'+folder+'/train.csv'))

In [6]:
X = pd.DataFrame(np.zeros(train[1].shape, dtype=np.float16))
count = {}
for df in train:
    for col in df:
        if col in count:
            count[col] += 1
        else:
            count[col] = 0
        X[int(col)] += df[str(col)]

for col in X:
    X[col] /= count[str(col)]

In [7]:
df_train = pd.read_csv('../../../input/train.csv')
Y = df_train.iloc[:,1:].values

In [8]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_val, y_train, y_val = train_test_split(X.values, Y, test_size=0.2, random_state=42)

In [11]:
thresholds = find_f2_thresholds(y_train, x_train)

100%|██████████| 200/200 [01:10<00:00,  2.84it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

0 0.115 0.935631310991


100%|██████████| 200/200 [01:10<00:00,  2.81it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

1 0.155 0.935714311178


100%|██████████| 200/200 [01:46<00:00,  1.50it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

2 0.175 0.935715825053


100%|██████████| 200/200 [01:19<00:00,  2.50it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

3 0.195 0.935724271427


100%|██████████| 200/200 [01:16<00:00,  2.67it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

4 0.28 0.936236926773


100%|██████████| 200/200 [01:16<00:00,  2.84it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

5 0.255 0.936240732554


100%|██████████| 200/200 [01:10<00:00,  2.85it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

6 0.285 0.936522448747


100%|██████████| 200/200 [01:10<00:00,  2.85it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

7 0.41 0.93718758527


100%|██████████| 200/200 [01:11<00:00,  2.84it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

8 0.125 0.937322375393


100%|██████████| 200/200 [01:25<00:00,  1.29it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

9 0.27 0.937592259557


100%|██████████| 200/200 [02:09<00:00,  2.88it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

10 0.505 0.938914322145


100%|██████████| 200/200 [01:09<00:00,  2.88it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

11 0.32 0.939709988319


100%|██████████| 200/200 [01:10<00:00,  2.83it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

12 0.165 0.939731981066


100%|██████████| 200/200 [01:10<00:00,  2.84it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

13 0.35 0.94022810692


100%|██████████| 200/200 [01:11<00:00,  2.84it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

14 0.355 0.941682191218


100%|██████████| 200/200 [01:10<00:00,  2.85it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

15 0.3 0.942322432211


  'precision', 'predicted', average, warn_for)
100%|██████████| 200/200 [01:10<00:00,  2.86it/s]

16 0.305 0.942404124913





In [21]:
for i in range(17):
    x_train[:, i] = (x_train[:, i] > thresholds[i]).astype(np.int)

In [22]:
print(fbeta_score(y_train, x_train, beta=2, average='samples'))

0.945891819623


# 2. Evaluation

In [24]:
for i in range(17):
    x_val[:, i] = (x_val[:, i] > thresholds[i]).astype(np.int)

In [25]:
print(fbeta_score(y_val, x_val, beta=2, average='samples'))

0.928453417875


# 3. Submit Prediction

In [None]:
test = []
for folder in stage_1:
    test.append(pd.read_csv('../../Layer_1/'+folder+'/test.csv'))

In [None]:
X_submission = pd.DataFrame(np.zeros(test[1].shape, dtype=np.float16))
count = {}
for df in train:
    for col in df:
        if col in count:
            count[col] += 1
        else:
            count[col] = 0
        X_submission[int(col)] += df[str(col)]

for col in X:
    X_submission[col] /= count[str(col)]

In [None]:
for i in range(17):
    x_submission[:, i] = (x_submission[:, i] > thresholds[i]).astype(np.int)

In [None]:
result = pd.DataFrame(x_submission)
preds = []
sorted_tags = pd.Series(inv_label_map)

for i in tqdm(range(result.shape[0]), miniters=1000):
    preds.append(' '.join(list(
        sorted_tags[np.where(result.loc[i] == 1)[0]]
    )))

In [None]:
df_submission = pd.read_csv('../input/sample_submission_v2.csv')
df_submission['tags'] = preds
df_submission.to_csv('test.csv', index=False)