## Project Goal
You will build artificial intelligence algorithms to label satellite image chips with different atmospheric conditions and the different classes of land cover/land use.  For this Multi-class Multi-Label problem, some of the labels are from the following categories: Cloud Cover (clear, partly, cloudy, haze), Primary RainForest, Water (rivers, lakes), Habitation (large city, small homes), Agriculture, Roads etc. The algorithms from this project will enable us to understand where, how and why deforestation happens.

In [1]:
# Importing necessary libraries for loading and exploring the dataset.

import pandas as pd
import numpy as np

train_set = pd.read_csv("planet-understanding-the-amazon-from-space/train_v2.csv/train_v2.csv")
train_set.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [2]:
# Checking if there are any missing tags

train_set.isnull().sum()

image_name    0
tags          0
dtype: int64

In [3]:
labels = set()

def splitting_tags(tags):
    """
    Takes in a column of tags, splits the tags, and stores unique labels in a set.

    Parameters:
    - tags (str): A string containing space-separated tags.

    Returns:
    None
    """
    [labels.add(tag) for tag in tags.split()]

# Create a copy of train_label
train_clone = train_set.copy()

# Apply the splitting_tags function to the 'tags' column
train_clone['tags'].apply(splitting_tags)

# Convert the set of unique labels to a list
labels = list(labels)

# Get unique labels
print(labels)

['clear', 'selective_logging', 'haze', 'cultivation', 'slash_burn', 'primary', 'blow_down', 'bare_ground', 'agriculture', 'habitation', 'partly_cloudy', 'artisinal_mine', 'road', 'water', 'blooming', 'cloudy', 'conventional_mine']


In [4]:
# Perform one-hot encoding on the "Unique" labels in the specified cloned train df.
for tag in labels:
    train_clone[tag] = train_clone['tags'].apply(lambda x: 1 if tag in x.split() else 0)

# Adding '.jpg' extension to the 'image_name' column for same file format
train_clone['image_name'] = train_clone['image_name'].apply(lambda x: '{}.jpg'.format(x))

train_clone.head()

Unnamed: 0,image_name,tags,clear,selective_logging,haze,cultivation,slash_burn,primary,blow_down,bare_ground,agriculture,habitation,partly_cloudy,artisinal_mine,road,water,blooming,cloudy,conventional_mine
0,train_0.jpg,haze primary,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,train_1.jpg,agriculture clear primary water,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
2,train_2.jpg,clear primary,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,train_3.jpg,clear primary,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,train_4.jpg,agriculture clear habitation primary road,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0


In [5]:
# Get and assign definition to the columns that were newly added from encoding
columns = list(train_clone.columns[2:])
columns

['clear',
 'selective_logging',
 'haze',
 'cultivation',
 'slash_burn',
 'primary',
 'blow_down',
 'bare_ground',
 'agriculture',
 'habitation',
 'partly_cloudy',
 'artisinal_mine',
 'road',
 'water',
 'blooming',
 'cloudy',
 'conventional_mine']

In [6]:
import tensorflow as tf

In [7]:
def fbeta(y_true, y_pred, beta=2, epsilon=1e-4):
    """
    Calculate the F-beta score.

    Args:
        y_true (tf.Tensor): Correct target values.
        y_pred (tf.Tensor): Predicted values returned by the classifier.
        beta (float): Weight parameter for precision in the F-beta score (default is 2).
        epsilon (float): Smoothing term to avoid division by zero (default is 1e-4).

    Returns:
        tf.Tensor: F-beta score.
    """
    beta_squared = beta**2

    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)

    tp = tf.reduce_sum(y_true * y_pred, axis=1)
    fp = tf.reduce_sum(y_pred, axis=1) - tp
    fn = tf.reduce_sum(y_true, axis=1) - tp

    precision = tp / (tp + fp + epsilon)
    recall = tp / (tp + fn + epsilon)

    fb = (1 + beta_squared) * precision * recall / (beta_squared * precision + recall + epsilon)
    return fb

In [8]:
def multi_label_acc(y_true, y_pred, epsilon=1e-4):
    """
    Returns accuracy value for multi-label classification.

    Args:
        y_true (tf.Tensor): Correct target values.
        y_pred (tf.Tensor): Predicted values returned by the classifier.
        epsilon (float): Smoothing term to avoid division by zero (default is 1e-4).

    Returns:
        tf.Tensor: Accuracy score.
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)

    tp = tf.reduce_sum(y_true * y_pred, axis=1)
    fp = tf.reduce_sum(y_pred, axis=1) - tp
    fn = tf.reduce_sum(y_true, axis=1) - tp

    y_true = tf.cast(y_true, tf.bool)
    y_pred = tf.cast(y_pred, tf.bool)

    tn = tf.reduce_sum(tf.cast(tf.logical_not(y_true), tf.float32)
                       * tf.cast(tf.logical_not(y_pred), tf.float32), axis=1)

    return (tp + tn) / (tp + tn + fp + fn + epsilon)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam

In [10]:


def build_model():
    """
    Build and compile a convolutional neural network model for multi-label image classification.

    Returns:
        tensorflow.keras.models.Sequential: The compiled model.
    """
    model = Sequential()
    model.add(BatchNormalization(input_shape=(128, 128, 3)))
    model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(256, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(17, activation='sigmoid'))

    opt = Adam(learning_rate=0.001)

    #  binary_crossentropy is used here because categorical_crossentropy l1 norms the output before calculating loss.
    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=[multi_label_acc, fbeta])

    return model

In [11]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [12]:
# ModelCheckpoint method is set to monitor the model using validation fbeta score and save the best only
save_best_checkpoint = ModelCheckpoint(
    filepath='best_model.hdf5',
    monitor='val_fbeta',
    mode='max',
    save_best_only=True,
    save_weights_only=True
)

In [13]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import warnings
warnings.filterwarnings("ignore")

In [14]:
# Initializing ImageDataGenerator method with a validation split of 0.2
train_image_gen = ImageDataGenerator(rescale=1/255, validation_split=0.2)

# Generating train data generator (80%)
train_generator = train_image_gen.flow_from_dataframe(
    dataframe=train_clone,
    directory="train-jpg.tar/train-jpg/train-jpg",
    x_col="image_name", y_col=columns, subset="training",
    batch_size=16, seed=2021, shuffle=True,
    class_mode="raw", target_size=(128, 128)
)

# Generating validation data (20%)
val_generator = train_image_gen.flow_from_dataframe(
    dataframe=train_clone,
    directory="train-jpg.tar/train-jpg/train-jpg",
    x_col="image_name", y_col=columns, subset="validation",
    batch_size=16, seed=2021, shuffle=True,
    class_mode="raw", target_size=(128, 128)
)

Found 32384 validated image filenames.
Found 8095 validated image filenames.


In [15]:
# Setting train and test data
step_train_size = int(np.ceil(train_generator.samples / train_generator.batch_size))
step_val_size = int(np.ceil(val_generator.samples / val_generator.batch_size))

# call the model
image_model = build_model()

# get over of model architecture
image_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (Batch  (None, 128, 128, 3)       12        
 Normalization)                                                  
                                                                 
 conv2d (Conv2D)             (None, 128, 128, 32)      896       
                                                                 
 conv2d_1 (Conv2D)           (None, 126, 126, 32)      9248      
                                                                 
 max_pooling2d (MaxPooling2  (None, 63, 63, 32)        0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 63, 63, 32)        0         
                                                                 
 conv2d_2 (Conv2D)           (None, 63, 63, 64)        1

In [18]:
# The image model is fitted on the pre-defined functions
image_model.fit(x=train_generator, 
           steps_per_epoch=step_train_size, 
           validation_data=val_generator, 
           validation_steps=step_val_size,
            epochs=5, 
           callbacks=[save_best_checkpoint])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x255d5bc3f90>

In [19]:
# A second model is initialised for making predictions
image_model2 = build_model()

# The second model is then loaded with the weights of the trained model (best_model.hdf5)
image_model2.load_weights('best_model.hdf5')

# adding .jpg extension to all rows in the image name column in the sample submission file
sample = pd.read_csv("planet-understanding-the-amazon-from-space/sample_submission_v2.csv/sample_submission_v2.csv")
sample_clone = sample.copy()
sample_clone['image_name'] = sample_clone['image_name'].apply(lambda x: '{}.jpg'.format(x))
sample_clone.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear agriculture road water
1,test_1.jpg,primary clear agriculture road water
2,test_2.jpg,primary clear agriculture road water
3,test_3.jpg,primary clear agriculture road water
4,test_4.jpg,primary clear agriculture road water


In [20]:
# Split the sample submission file into two

test_df = sample_clone.iloc[:40669]['image_name'].reset_index().drop('index', axis =1)
test_df.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


In [21]:
# Call ImageDataGenerator
test_image_gen = ImageDataGenerator(rescale = 1/255)

# Generator is created for the images found in the first test image files
test_generator = test_image_gen.flow_from_dataframe(dataframe=test_df, 
                                                directory="test-jpg/test-jpg", 
                                                x_col="image_name", 
                                                y_col=None, 
                                                batch_size=16, 
                                                shuffle=False, 
                                                class_mode=None, 
                                                target_size=(128,128))

step_test_size = int(np.ceil(test_generator.samples/test_generator.batch_size))

Found 40669 validated image filenames.


In [22]:
# Generator is tested to avoid shuffling of index
test_generator.reset()
pred = image_model2.predict(test_generator, steps=step_test_size, verbose=1)



In [23]:
# Get filenames in the generator using the attribute .filenames
file_names = test_generator.filenames

# Convert the predicted values into a Pandas DataFrame. Then, join the two labels together into a 
# single label if the probability of occurrence of either label is greater than 0.5. This will provide 
# a more concise and clear representation of the predicted labels.

pred_tags = pd.DataFrame(pred)
pred_tags = pred_tags.apply(lambda x: ' '.join(np.array(labels)[x > 0.5]), axis = 1)

# Store result in a DataFrame
result_df = pd.DataFrame({'image_name': file_names, 'tags': pred_tags})
result_df.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,clear primary
1,test_1.jpg,clear primary
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,clear primary
4,test_4.jpg,primary partly_cloudy


In [24]:
# Another test is added ...
test_df2 = sample_clone.iloc[40669:]['image_name'].reset_index().drop('index', axis =1)
test_df2.head()

Unnamed: 0,image_name
0,file_0.jpg
1,file_1.jpg
2,file_10.jpg
3,file_100.jpg
4,file_1000.jpg


The same operations are done on the first test DataFrame carried out on the additional test

In [26]:
test_df2_generator = test_image_gen.flow_from_dataframe(dataframe=test_df2, 
                                                    directory ="test-jpg-additional.tar/test-jpg-additional/test-jpg-additional", 
                                                    x_col="image_name", 
                                                    y_col=None, 
                                                    batch_size=16, 
                                                    shuffle=False, 
                                                    class_mode=None, 
                                                    target_size=(128,128))


step_test_size2 = int(np.ceil(test_df2_generator.samples/test_df2_generator.batch_size))

Found 20522 validated image filenames.


In [28]:
test_df2_generator.reset()
test_df2_pred = image_model2.predict(test_df2_generator, steps=step_test_size2, verbose=1)



In [31]:
file_names = test_df2_generator.filenames

add_pred_tags = pd.DataFrame(test_df2_pred)
add_pred_tags = add_pred_tags.apply(lambda x: ''.join(np.array(labels)[x > 0.5]), axis=1)

result_df2 = pd.DataFrame({'image_name': file_names, 'tags': add_pred_tags})
result_df2.head()

Unnamed: 0,image_name,tags
0,file_0.jpg,clearprimary
1,file_1.jpg,clearprimaryagricultureroad
2,file_10.jpg,primaryagriculturewater
3,file_100.jpg,clearprimaryagriculturewater
4,file_1000.jpg,clearprimary


In [34]:
# Concatenate the 1st & 2nd results...
final_df = pd.concat([result_df, result_df2])
final_df = final_df.reset_index().drop('index', axis=1)

# Get shape of final result
print(final_df.shape)

# Get overview of the final result
final_df.head()

(61191, 2)


Unnamed: 0,image_name,tags
0,test_0.jpg,clear primary
1,test_1.jpg,clear primary
2,test_2.jpg,primary partly_cloudy
3,test_3.jpg,clear primary
4,test_4.jpg,primary partly_cloudy


In [35]:
# .jpg extension is removed since all operations have been carried out
final_df['image_name'] = final_r['image_name'].apply(lambda x: x[:-4])
final_df.head()

NameError: name 'final_result' is not defined

In [None]:
final_result.to_csv('image_predictions.csv', index=False)