# Библиотеки

In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import os
from natsort import natsorted, ns

import keras
from keras.models import Model, Sequential, model_from_json
from keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten
from keras.utils import np_utils

import cv2
from tqdm import tqdm

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Using TensorFlow backend.


# Загрузка и обработка данных

In [21]:
# Root folder of the competition data; the train/test image folders hang off it.
core_dir = 'Z:\\Kaggle Amazon Rainforest\\'
#core_dir = 'C:\\Kaggle\\Understanding the Amazon from Space\\'
train_dir = core_dir + 'train-jpg\\'
test_dir = core_dir + 'test-jpg\\'
# The additional test images are read from a separate local folder.
add_test_dir = 'C:\\Users\\horch\\Desktop\\Local_data\\test-jpg-additional\\'

# Main test file names in natural, case-insensitive order; the additional
# ones stay in whatever order os.listdir returns them.
test_data_names = natsorted(os.listdir(test_dir), key=lambda name: name.lower())
add_test_data_names = os.listdir(add_test_dir)

# Category (tag) names used throughout the notebook.
cathegories = [
    'agriculture',
    'artisinal_mine',
    'bare_ground',
    'blooming',
    'blow_down',
    'clear',
    'cloudy',
    'conventional_mine',
    'cultivation',
    'habitation',
    'haze',
    'partly_cloudy',
    'primary',
    'road',
    'selective_logging',
    'slash_burn',
    'water',
]

# Training table: image names plus their space-separated tags.
train_data = pd.read_csv(core_dir + 'train_v2.csv')
train_data.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


# Функции

подготовка тренировочной выборки по имени категории
принимает категорию, возвращает массив из 0 и 1 (1 - файл содержит категорию, 0 - иначе)

In [22]:
def TrainingSetByCathegory(cathegory):
    """Return a 0/1 indicator vector for one category over the training set.

    Reads the notebook-level ``train_data`` table and ``cathegory_dict``
    mapping (category name -> list of image names carrying that tag).

    Parameters
    ----------
    cathegory : key of ``cathegory_dict``; an unknown key raises KeyError.

    Returns
    -------
    numpy.ndarray of float64, one entry per row of ``train_data``:
    1.0 where the image name is listed under the category, 0.0 otherwise.
    """
    tagged_names = cathegory_dict[cathegory]
    is_tagged = train_data.image_name.isin(tagged_names)
    return is_tagged.values.astype(np.float64)

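формирование полной (по всем категориям) матрицы меток: строка соответствует изображению, столбец — категории (1 — категория присутствует в тегах изображения, 0 — иначе)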

In [23]:
def TestSetWhole(data, col_name = 'tags'):
    """Build the multi-label 0/1 matrix for every row of ``data``.

    Each row of ``data[col_name]`` is a string of space-separated category
    names. The result has one row per input row and ``len(cathegories)``
    columns; a cell is 1 when the row's tag string contains the category.

    Column indices come from a ``LabelEncoder`` fitted on the notebook-level
    ``cathegories`` list, i.e. they follow the encoder's ordering of those
    names.

    Parameters
    ----------
    data : pandas.DataFrame containing the column ``col_name``.
    col_name : name of the column holding the tag strings.

    Returns
    -------
    numpy.ndarray of float64 with shape (len(data), len(cathegories)).

    Raises
    ------
    ValueError
        Propagated from ``encoder.transform`` when a tag is not one of
        ``cathegories``.
    """
    encoder = LabelEncoder()
    encoder.fit(cathegories)

    # Read the tag strings positionally. The previous data[col_name][i] lookup
    # was label-based and failed on frames whose index is not 0..n-1 (for
    # example a filtered or shuffled subset).
    tag_strings = data[col_name].values

    result = np.zeros((len(tag_strings), len(cathegories)))

    for row, tag_string in enumerate(tag_strings):
        columns = encoder.transform(tag_string.split(' '))
        result[row, columns] = 1

    return result

функция, предназначенная для конвертации .jpg изображения в numpy массив
если transparency = False, то каждая точка представляется первыми 3 числами (каналами), иначе массив возвращается как есть, со всеми каналами

In [24]:
def ImageToNumpy(img_name, img_dir = train_dir, img_type = '.jpg', transparency = False):
    """Load an image file into an int32 numpy array.

    Parameters
    ----------
    img_name : file name without extension.
    img_dir : directory prefix (string concatenation is used, so it must end
        with a path separator). Defaults to the training image folder.
    img_type : file extension including the leading dot.
    transparency : when False only the first three channels are returned;
        when True the array is returned with all of its channels.

    Returns
    -------
    numpy.ndarray with dtype int32.
    """
    # Bug fix: the path is built from img_dir. Previously train_dir was
    # hard-coded here, so the img_dir argument was silently ignored.
    # The context manager also closes the file if decoding raises.
    with Image.open(img_dir + img_name + img_type) as img:
        img.load()
        data = np.asarray(img, dtype="int32")

    if transparency:
        return data
    return data[:, :, 0:3]

функция для конвертации матрицы, элементы которой принадлежат [0, 1], в матрицу из 0 и 1 при заданном пороге (treshold): элемент, больший либо равный порогу, даёт 1, иначе 0

In [25]:
def BinimialPrediction(x, treshold = 0.5):
    """Binarize scores: 1.0 where ``x >= treshold``, 0.0 elsewhere.

    Parameters
    ----------
    x : array-like of numbers (any shape; the original implementation only
        accepted 2-D numpy arrays).
    treshold : cut-off; values equal to it map to 1.

    Returns
    -------
    numpy.ndarray of float64 with the same shape as ``x``.
    """
    # One vectorized comparison replaces the element-by-element Python loop.
    # The float64 cast keeps the comparison precision independent of the
    # input dtype (e.g. float16 scores).
    scores = np.asarray(x, dtype=np.float64)
    return (scores >= treshold).astype(np.float64)

def FBettaScore(x_true, x_predicted, betta = 2):
    """F-beta score of one binary prediction vector against the truth.

    Only entries equal to 0 or 1 are counted; anything else is ignored.

    Parameters
    ----------
    x_true : sequence of 0/1 ground-truth labels.
    x_predicted : sequence of 0/1 predicted labels, same length as ``x_true``.
    betta : weight of recall relative to precision.

    Returns
    -------
    float: ``(1 + betta**2) * P * R / (betta**2 * P + R)``, or 0.0 when there
    are no true positives (precision and recall are then zero or undefined).

    Raises
    ------
    ValueError
        If the two sequences differ in length. The previous version printed
        a message and returned None, which surfaced later as an unrelated
        TypeError in the caller.
    """
    if len(x_true) != len(x_predicted):
        raise ValueError('FBettaScore error! len(x_true) != len(x_predicted)')

    tp = fp = fn = 0
    for truth, guess in zip(x_true, x_predicted):
        if guess == 1:
            if truth == 1:
                tp += 1
            elif truth == 0:
                fp += 1
        elif guess == 0 and truth == 1:
            fn += 1

    # With tp == 0 the score is 0; this also covers the empty denominators,
    # because tp + fp == 0 or tp + fn == 0 both imply tp == 0.
    if tp == 0:
        return 0.0

    # float() keeps true division even if the cell runs under Python 2.
    precision = tp / float(tp + fp)
    recall = tp / float(tp + fn)

    return (1 + betta**2) * precision * recall / (betta**2 * precision + recall)

def AvgFBettaScore(x_true, x_predicted, betta = 2, treshold = 0.5):
    """Mean per-row F-beta score of thresholded predictions.

    ``x_predicted`` is first binarized with ``BinimialPrediction`` and each
    row is then scored against the matching row of ``x_true`` with
    ``FBettaScore``.

    Parameters
    ----------
    x_true : 2-D numpy array of 0/1 labels, one row per sample.
    x_predicted : 2-D numpy array of scores with the same shape.
    betta : passed through to ``FBettaScore``.
    treshold : binarization cut-off. Previously fixed at 0.5; the default
        keeps that behaviour.

    Returns
    -------
    float: the average of the per-row scores.

    Raises
    ------
    ZeroDivisionError
        If ``x_true`` has no rows.
    """
    n = x_true.shape[0]
    hard_predictions = BinimialPrediction(x_predicted, treshold)

    total = 0
    for i in range(n):
        total += FBettaScore(x_true[i, :], hard_predictions[i, :], betta)

    return total / n


# Вычисления

создадим словарь категорий
каждой категории будет соответствовать список файлов, в которых данная категория присутствует

In [26]:
# Map every category to the list of training image names tagged with it.
cathegory_dict = {cathegory: [] for cathegory in cathegories}

# Walk names and tag strings together by position. The previous version kept
# a manual counter and used it as an index *label* (train_data.image_name[ind]),
# which only works while the frame has its default 0..n-1 index.
for file_name, tag in zip(train_data.image_name.values, train_data.tags.values):
    for cathegory in tag.split(' '):
        # A tag missing from `cathegories` raises KeyError here.
        cathegory_dict[cathegory].append(file_name)

формирование обучающей выборки

In [27]:
# Read every training image, shrink it to 32x32 and stack the results into
# one float16 array scaled by 1/255.
X = []

for img_name in tqdm(train_data.image_name.values):
    img_path = train_dir + img_name + '.jpg'
    img = cv2.imread(img_path)
    # cv2.imread signals a missing or unreadable file by returning None
    # instead of raising; fail here with the path rather than with an opaque
    # error inside cv2.resize.
    if img is None:
        raise IOError('Could not read image: ' + img_path)
    X.append(cv2.resize(img, (32, 32)))

X = np.array(X, np.float16) / 255.

100%|████████████████████████████████████| 40479/40479 [12:42<00:00, 53.08it/s]


формирование матрицы меток (целевых значений) для обучающих изображений

In [28]:
# Label matrix for the training images: one row per row of train_data,
# one column per category (built by TestSetWhole from the `tags` column).
Y = TestSetWhole(train_data)

разбиение выборки на обучение и тест

In [29]:
# Hold out 30% of the labelled images as a test split; the fixed random_state
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# Настройка и  обучение сети для всех категорий

In [40]:
batch_size = 128 # in each iteration, we consider 128 training examples at once
num_epochs = 100 # we iterate 100 times over the entire training set
kernel_size = 3 # we will use 3x3 kernels throughout
pool_size = 2 # we will use 2x2 pooling throughout
conv_depth_1 = 32 # we will initially have 32 kernels per conv. layer...
conv_depth_2 = 64 # ...switching to 64 after the first pooling layer
drop_prob_1 = 0.25 # dropout after pooling with probability 0.25
drop_prob_2 = 0.5 # dropout in the FC layer with probability 0.5
hidden_size = 512 # the FC layer will have 512 neurons

In [41]:
# Input geometry and class count for the network (no images are loaded in this cell).
# num_train is not referenced by any other cell visible in this notebook.
num_train = 40479
# These three values are passed to Input(shape=(depth, height, width)) below.
# The images are resized to 32x32 when loaded, so in effect they describe
# (rows, columns, channels) = (32, 32, 3), whatever the variable names suggest.
depth = 32 
height = 32
width = 3 

# One output unit per category.
num_classes = len(cathegories)

# Disabled single-category experiment: label 1 if the image has category 0, else 0.
#y_train = TrainingSetByCathegory(cathegories[0]) #y_train labels (1 for category_0 if it matches, 0 otherwise)
#Y_train = np_utils.to_categorical(y_train, num_classes) # One-hot encode the labels
#X_train = X

In [None]:
# TODO: rework. Draft of a per-sample F2 metric, disabled by keeping it inside
# a string literal so that it is never defined.
'''
def f2_metric(y_true, y_pred):
    result = []
    for i in range(y_true.shape[0]):
        result.append(fbeta_score(y_true[i, :], y_pred[i, :], 2))
    
    result = np.mean(result)
    
    return(result)
'''

In [42]:
# Build, compile and train the multi-label CNN.
# The statements are left exactly as written: they use the Keras 1 call style
# (border_mode=, nb_epoch=, Model(input=, output=)), and only comments are changed.
inp = Input(shape=(depth, height, width)) # evaluates to (32, 32, 3) with the constants defined above

# Conv [32] -> Conv [32] -> Pool (with dropout on the pooling layer)

conv_1 = Convolution2D(conv_depth_1, kernel_size, kernel_size, border_mode='same', activation='relu')(inp)
conv_2 = Convolution2D(conv_depth_1, kernel_size, kernel_size, border_mode='same', activation='relu')(conv_1)
pool_1 = MaxPooling2D(pool_size=(pool_size, pool_size))(conv_2)
drop_1 = Dropout(drop_prob_1)(pool_1)

# Conv [64] -> Conv [64] -> Pool (with dropout on the pooling layer)

conv_3 = Convolution2D(conv_depth_2, kernel_size, kernel_size, border_mode='same', activation='relu')(drop_1)
conv_4 = Convolution2D(conv_depth_2, kernel_size, kernel_size, border_mode='same', activation='relu')(conv_3)
pool_2 = MaxPooling2D(pool_size=(pool_size, pool_size))(conv_4)
# NOTE(review): this reuses drop_prob_1 (0.25), the same rate as after the first pool.
drop_2 = Dropout(drop_prob_1)(pool_2)

# Now flatten to 1D, apply FC -> ReLU (with dropout) -> per-class sigmoid

flat = Flatten()(drop_2)

hidden = Dense(hidden_size, activation='relu')(flat)

drop_3 = Dropout(drop_prob_2)(hidden)

# Sigmoid rather than softmax: every class gets its own independent score,
# because one image can carry several tags.
out = Dense(num_classes, activation='sigmoid')(drop_3)

model = Model(input=inp, output=out) # To define a model, just specify its input and output layers

model.compile(loss='binary_crossentropy', # binary cross-entropy loss over the per-class outputs
              optimizer='adam', # using the Adam optimiser
              metrics=['accuracy']) # reporting the accuracy

model.fit(x_train, y_train, # Train the model using the training set...
          batch_size=batch_size, nb_epoch=num_epochs,
          verbose=1, validation_split=0.2) # ...holding out 20% of the data for validation

# Evaluation on the held-out split is currently disabled.
#model.evaluate(x_test, y_test, verbose=1) # Evaluate the trained model on the test set!
#cnn_prediction = model.predict(x_test)



Train on 22668 samples, validate on 5667 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

<keras.callbacks.History at 0x23ba6710>

In [43]:
# Serialize the model architecture to JSON
model_json = model.to_json()
# Write the architecture to disk; the context manager closes the file even if
# the write fails (the previous open()/close() pair leaked the handle on error).
with open("C:\\Users\\horch\\Desktop\\Local_data\\model_add.json", "w") as json_file:
    json_file.write(model_json)

# The weights go to a separate HDF5 file next to the JSON description.
model.save_weights("C:\\Users\\horch\\Desktop\\Local_data\\model_add.h5")

## Делаем предсказание

In [44]:
# load json and create model; the context manager closes the file even if
# reading fails
with open('C:\\Users\\horch\\Desktop\\Local_data\\model_add.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights('C:\\Users\\horch\\Desktop\\Local_data\\model_add.h5')
print("Loaded model from disk")

# compile the restored model with the same loss/optimizer/metrics used for training
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Loaded model from disk


загрузка изображений тестовой выборки (сначала дополнительный набор, затем основной)

In [32]:
# Load the additional test images: every file whose name ends in "jpg" is
# resized to 32x32; the stack is converted to float16 and scaled by 1/255.
X_pred_add = []

for img_name in tqdm(add_test_data_names):
    if not img_name.endswith("jpg"):
        print("This file is not jpg: " + img_name)
        continue

    img_path = add_test_dir + img_name
    img = cv2.imread(img_path)
    # cv2.imread returns None for an unreadable file instead of raising.
    # Stop with the path: skipping the file would shift every later
    # prediction row relative to the file names.
    if img is None:
        raise IOError("Could not read image: " + img_path)
    X_pred_add.append(cv2.resize(img, (32, 32)))

X_pred_add = np.array(X_pred_add, np.float16) / 255.

100%|███████████████████████████████████| 20522/20522 [01:13<00:00, 277.89it/s]


In [33]:
# Load the main test images: every file whose name ends in "jpg" is resized
# to 32x32; the stack is converted to float16 and scaled by 1/255.
X_pred = []

for img_name in tqdm(test_data_names):
    if not img_name.endswith("jpg"):
        print("This file is not jpg: " + img_name)
        continue

    img_path = test_dir + img_name
    img = cv2.imread(img_path)
    # cv2.imread returns None for an unreadable file instead of raising.
    # Stop with the path: skipping the file would shift every later
    # prediction row relative to the file names.
    if img is None:
        raise IOError("Could not read image: " + img_path)
    X_pred.append(cv2.resize(img, (32, 32)))

X_pred = np.array(X_pred, np.float16) / 255.

100%|███████████████████████████████████▉| 40663/40670 [13:59<00:00, 59.04it/s]

This file is not jpg: Thumbs.db


100%|████████████████████████████████████| 40670/40670 [14:00<00:00, 48.42it/s]


In [45]:
# Run the restored model on the main test images; rows follow the order of X_pred.
img_prediction = loaded_model.predict(X_pred)

In [46]:
# Run the restored model on the additional test images; rows follow the order of X_pred_add.
img_prediction_add = loaded_model.predict(X_pred_add)

In [47]:
# Folder that receives the submission CSV files written by the cells below.
local_dir = 'C:\\Users\\horch\\Desktop\\Local_data\\'

In [48]:
# Write the predictions for the main test set as a CSV: one row per image,
# holding the image name and the space-separated tags whose score reaches 0.95.
csv_file_dir = local_dir + 'submission.csv'

# X_pred was built only from names ending in "jpg", so prediction row k belongs
# to the k-th such name. Indexing predictions by position in the unfiltered
# list (as before) shifts every row after a skipped file.
jpg_names = [name for name in test_data_names if name.endswith("jpg")]
if len(jpg_names) != len(img_prediction):
    raise ValueError('Got %d predictions for %d jpg files'
                     % (len(img_prediction), len(jpg_names)))

# `with` closes the file even if a row fails to format or write.
with open(csv_file_dir, 'w') as submission_csv:
    # No space after the comma, so the second column is named 'tags', not ' tags'.
    submission_csv.write('image_name,tags\n')

    for img_name, scores in tqdm(zip(jpg_names, img_prediction), total=len(jpg_names)):
        tags = [name for name, score in zip(cathegories, scores) if score >= 0.95]
        # train_v2.csv identifies images without the file extension (the loader
        # appends '.jpg'), so the same form is written here. ' '.join also
        # avoids the trailing space the old concatenation left on every row.
        submission_csv.write(os.path.splitext(img_name)[0] + ',' + ' '.join(tags) + '\n')

100%|█████████████████████████████████| 40670/40670 [00:03<00:00, 13040.06it/s]


In [49]:
# Write the predictions for the additional test set as a CSV: one row per
# image, holding the image name and the space-separated tags whose score
# reaches 0.95.
csv_file_dir = local_dir + 'submission_add.csv'

# X_pred_add was built only from names ending in "jpg", so prediction row k
# belongs to the k-th such name. Indexing predictions by position in the
# unfiltered list (as before) shifts every row after a skipped file.
jpg_names = [name for name in add_test_data_names if name.endswith("jpg")]
if len(jpg_names) != len(img_prediction_add):
    raise ValueError('Got %d predictions for %d jpg files'
                     % (len(img_prediction_add), len(jpg_names)))

# `with` closes the file even if a row fails to format or write.
with open(csv_file_dir, 'w') as submission_csv:
    # No space after the comma, so the second column is named 'tags', not ' tags'.
    submission_csv.write('image_name,tags\n')

    for img_name, scores in tqdm(zip(jpg_names, img_prediction_add), total=len(jpg_names)):
        tags = [name for name, score in zip(cathegories, scores) if score >= 0.95]
        # train_v2.csv identifies images without the file extension (the loader
        # appends '.jpg'), so the same form is written here. ' '.join also
        # avoids the trailing space the old concatenation left on every row.
        submission_csv.write(os.path.splitext(img_name)[0] + ',' + ' '.join(tags) + '\n')

100%|█████████████████████████████████| 20522/20522 [00:01<00:00, 12496.38it/s]
