## In this notebook

- The first model.

In [1]:
import os
import datetime
import time

# analytics
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import numpy as np

# tif images processing
import tensorflow as tf
import tifffile as tiff

# model
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/opt/venv/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ['/opt/venv/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: cannot open shared object file: No such file or directory']


In [2]:
FEATURES_METADATA_PROCESSED_FILEPATH = "../data/features_metadata_processed.csv"
SUBMISSION_FILES_CNT = 2_773
CHIPS_TO_SELECT_CNT = 100

## Load metadata

In [3]:
df_metadata = pd.read_csv(FEATURES_METADATA_PROCESSED_FILEPATH)

# filter metadata
df_metadata = (
    df_metadata
    .loc[
        (df_metadata.split == "train")
    ]
    .reset_index(drop=True)
)

df_metadata["satellite_cnt_per_chip_month"] = (
    df_metadata
    .groupby(["chip_id", "month"])
    .satellite
    .transform("count")
)

df_metadata.tail()

Unnamed: 0,filename,filepath,label_filename,label_filepath,chip_id,satellite,split,month,satellite_cnt_per_chip_month
189073,fff05995_S2_07.tif,/usr/src/app/data/train_features/fff05995_S2_0...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S2,train,April,2
189074,fff05995_S2_08.tif,/usr/src/app/data/train_features/fff05995_S2_0...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S2,train,May,2
189075,fff05995_S2_09.tif,/usr/src/app/data/train_features/fff05995_S2_0...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S2,train,June,2
189076,fff05995_S2_10.tif,/usr/src/app/data/train_features/fff05995_S2_1...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S2,train,July,2
189077,fff05995_S2_11.tif,/usr/src/app/data/train_features/fff05995_S2_1...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S2,train,August,2


## Load data

In [4]:
def load_train_data_for_month(month: str, df_metadata: pd.DataFrame) -> pd.DataFrame:
    dfs = []

    for chip_id in np.random.choice(
        (
            df_metadata.loc[
                (df_metadata.month == month) &
                (df_metadata.satellite_cnt_per_chip_month == 2)
            ]
            .chip_id
            .unique()
        ), 
        size=CHIPS_TO_SELECT_CNT, 
        replace=False
    ):
        cols = {
            "chip_id": [chip_id] * 65_536
        }

        # load features
        for sat_i in [1, 2]:
            filepath = (
                df_metadata
                .loc[
                    (df_metadata.month == month) &
                    (df_metadata.chip_id == chip_id) &
                    (df_metadata.satellite == f"S{sat_i}")
                ]
                .filepath
                .values[0]
            )

            img = tiff.imread(filepath)
            for band_i, band in enumerate(tf.transpose(img, perm=(2, 0, 1)).numpy()):
                cols[f"s{sat_i}_band_{band_i + 1}"] = band.reshape(1, -1)[0]

        # load label
        label_filepath = (
            df_metadata
            .loc[
                (df_metadata.month == month) &
                (df_metadata.chip_id == chip_id) & 
                (df_metadata.satellite == f"S{sat_i}")
            ]
            .label_filepath
            .values[0]
        )
        img = tiff.imread(label_filepath)
        cols["label"] = img.reshape(1, -1)[0]

        # create chip dataframe        
        dfs.append(
            pd.DataFrame(cols)
        )

    df = pd.concat(dfs, ignore_index=True)
    return df

## Create models

In [5]:
Y_COLUMN = "label"
X_COLUMNS = [
    "s1_band_1", 
    "s1_band_2", 
    "s1_band_3", 
    "s1_band_4",
    "s2_band_1", 
    "s2_band_2", 
    "s2_band_3", 
    "s2_band_4", 
    "s2_band_5",
    "s2_band_6", 
    "s2_band_7", 
    "s2_band_8", 
    "s2_band_9", 
    "s2_band_10",
    "s2_band_11",
]

In [6]:
%%time

models = {}

for month in df_metadata.month.unique():
    time_start = time.time()
    print(f"▶️ Starting training {month} model.")
    
    # load data
    _df = load_train_data_for_month(month=month, df_metadata=df_metadata)
    
    # define pipeline
    pipeline = Pipeline([
        ("min_max_scaler", MinMaxScaler()),
        ("model", GradientBoostingRegressor())
    ])
    
    # fir pipeline
    pipeline.fit(
        _df.loc[:, X_COLUMNS],
        _df.loc[:, Y_COLUMN],
    )
    
    models[month] = pipeline
    
    print(f"✅ Finished training {month} model in {(time.time() - time_start) / 60:,.2f} mins.")

▶️ Starting training September model.
✅ Finished training September model in 29.06 mins.
▶️ Starting training October model.
✅ Finished training October model in 30.27 mins.
▶️ Starting training November model.
✅ Finished training November model in 30.63 mins.
▶️ Starting training December model.
✅ Finished training December model in 29.25 mins.
▶️ Starting training January model.
✅ Finished training January model in 32.00 mins.
▶️ Starting training February model.
✅ Finished training February model in 31.29 mins.
▶️ Starting training March model.
✅ Finished training March model in 32.53 mins.
▶️ Starting training April model.
✅ Finished training April model in 30.92 mins.
▶️ Starting training May model.
✅ Finished training May model in 29.57 mins.
▶️ Starting training June model.
✅ Finished training June model in 28.75 mins.
▶️ Starting training July model.
✅ Finished training July model in 29.22 mins.
▶️ Starting training August model.
✅ Finished training August model in 29.51 mins.


## Create submission

In [7]:
df_metadata = pd.read_csv(FEATURES_METADATA_PROCESSED_FILEPATH)

# filter metadata
df_metadata = (
    df_metadata
    .loc[
        # (df_metadata.month == "April") &
        (df_metadata.split == "test")
    ]
    .reset_index(drop=True)
)

df_metadata["satellite_cnt_per_chip_month"] = (
    df_metadata
    .groupby(["chip_id", "month"])
    .satellite
    .transform("count")
)

df_metadata.tail()

Unnamed: 0,filename,filepath,label_filename,label_filepath,chip_id,satellite,split,month,satellite_cnt_per_chip_month
63343,fff812c0_S2_07.tif,/usr/src/app/data/test_features/fff812c0_S2_07...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,April,2
63344,fff812c0_S2_08.tif,/usr/src/app/data/test_features/fff812c0_S2_08...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,May,2
63345,fff812c0_S2_09.tif,/usr/src/app/data/test_features/fff812c0_S2_09...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,June,2
63346,fff812c0_S2_10.tif,/usr/src/app/data/test_features/fff812c0_S2_10...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,July,2
63347,fff812c0_S2_11.tif,/usr/src/app/data/test_features/fff812c0_S2_11...,fff812c0_agbm.tif,/usr/src/app/data/test_agbm/fff812c0_agbm.tif,fff812c0,S2,test,August,2


In [8]:
%%time

# predictions mean

for i, chip_id in enumerate(df_metadata.chip_id.unique()):
    
    # get valid months
    valid_months = (
        df_metadata
        .loc[
            (df_metadata.chip_id == chip_id) & 
            (df_metadata.satellite_cnt_per_chip_month == 2)
        ]
        .month
        .unique()
    )
    
    # read label filepath
    label_filepath = (
        df_metadata
        .loc[df_metadata.chip_id == chip_id]
        .label_filepath
        .values[0]
    )
    
    preds = []
    
    for month in valid_months:
        cols = {
            "chip_id": [chip_id] * 65_536
        }

        # load features
        for sat_i in [1, 2]:
            filepath = (
                df_metadata
                .loc[(df_metadata.chip_id == chip_id) & (df_metadata.satellite == f"S{sat_i}")]
                .filepath
                .values[0]
            )

            img = tiff.imread(filepath)
            for band_i, band in enumerate(tf.transpose(img, perm=(2, 0, 1)).numpy()):
                cols[f"s{sat_i}_band_{band_i + 1}"] = band.reshape(1, -1)[0]
    
        # create chip dataframe        
        df_chip = pd.DataFrame(cols).loc[:, X_COLUMNS]
    
        # get predictions
        preds.append(
            models[month].predict(df_chip).reshape(256, 256)
        )
    
    # save predictions
    tiff.imwrite(
        label_filepath, 
        data=(
            np.divide(
                np.sum(preds, axis=0), 
                len(valid_months)
            )
            .astype(np.float32)     
        )
    )
    
    if (i % 100 == 0) & (i != 0):
        print(f"--> finished prediciton of {i:,} tifs")

--> finished prediciton of 100 tifs
--> finished prediciton of 200 tifs
--> finished prediciton of 300 tifs
--> finished prediciton of 400 tifs
--> finished prediciton of 500 tifs
--> finished prediciton of 600 tifs
--> finished prediciton of 700 tifs
--> finished prediciton of 800 tifs
--> finished prediciton of 900 tifs
--> finished prediciton of 1,000 tifs
--> finished prediciton of 1,100 tifs
--> finished prediciton of 1,200 tifs
--> finished prediciton of 1,300 tifs
--> finished prediciton of 1,400 tifs
--> finished prediciton of 1,500 tifs
--> finished prediciton of 1,600 tifs
--> finished prediciton of 1,700 tifs
--> finished prediciton of 1,800 tifs
--> finished prediciton of 1,900 tifs
--> finished prediciton of 2,000 tifs
--> finished prediciton of 2,100 tifs
--> finished prediciton of 2,200 tifs
--> finished prediciton of 2,300 tifs
--> finished prediciton of 2,400 tifs
--> finished prediciton of 2,500 tifs
--> finished prediciton of 2,600 tifs
--> finished prediciton of 2,7

In [9]:
%%time

# predictions maximum

for i, chip_id in enumerate(df_metadata.chip_id.unique()):
    
    # get valid months
    valid_months = (
        df_metadata
        .loc[
            (df_metadata.chip_id == chip_id) & 
            (df_metadata.satellite_cnt_per_chip_month == 2)
        ]
        .month
        .unique()
    )
    
    # read label filepath
    label_filepath = (
        df_metadata
        .loc[df_metadata.chip_id == chip_id]
        .label_filepath
        .values[0]
    )
    
    preds = []
    
    for month in valid_months:
        cols = {
            "chip_id": [chip_id] * 65_536
        }

        # load features
        for sat_i in [1, 2]:
            filepath = (
                df_metadata
                .loc[(df_metadata.chip_id == chip_id) & (df_metadata.satellite == f"S{sat_i}")]
                .filepath
                .values[0]
            )

            img = tiff.imread(filepath)
            for band_i, band in enumerate(tf.transpose(img, perm=(2, 0, 1)).numpy()):
                cols[f"s{sat_i}_band_{band_i + 1}"] = band.reshape(1, -1)[0]
    
        # create chip dataframe        
        df_chip = pd.DataFrame(cols).loc[:, X_COLUMNS]
    
        # get predictions
        preds.append(
            models[month].predict(df_chip).reshape(256, 256)
        )
    
    # save predictions
    tiff.imwrite(
        label_filepath, 
        data=(
            np
            .max(preds, axis=0)
            .astype(np.float32)
        )
    )
    
    if (i % 100 == 0) & (i != 0):
        print(f"--> finished prediciton of {i:,} tifs")

--> finished prediciton of 100 tifs
--> finished prediciton of 200 tifs
--> finished prediciton of 300 tifs
--> finished prediciton of 400 tifs
--> finished prediciton of 500 tifs
--> finished prediciton of 600 tifs
--> finished prediciton of 700 tifs
--> finished prediciton of 800 tifs
--> finished prediciton of 900 tifs
--> finished prediciton of 1,000 tifs
--> finished prediciton of 1,100 tifs
--> finished prediciton of 1,200 tifs
--> finished prediciton of 1,300 tifs
--> finished prediciton of 1,400 tifs
--> finished prediciton of 1,500 tifs
--> finished prediciton of 1,600 tifs
--> finished prediciton of 1,700 tifs
--> finished prediciton of 1,800 tifs
--> finished prediciton of 1,900 tifs
--> finished prediciton of 2,000 tifs
--> finished prediciton of 2,100 tifs
--> finished prediciton of 2,200 tifs
--> finished prediciton of 2,300 tifs
--> finished prediciton of 2,400 tifs
--> finished prediciton of 2,500 tifs
--> finished prediciton of 2,600 tifs
--> finished prediciton of 2,7

In [10]:
%%time

# predictions median

for i, chip_id in enumerate(df_metadata.chip_id.unique()):
    
    # get valid months
    valid_months = (
        df_metadata
        .loc[
            (df_metadata.chip_id == chip_id) & 
            (df_metadata.satellite_cnt_per_chip_month == 2)
        ]
        .month
        .unique()
    )
    
    # read label filepath
    label_filepath = (
        df_metadata
        .loc[df_metadata.chip_id == chip_id]
        .label_filepath
        .values[0]
    )
    
    preds = []
    
    for month in valid_months:
        cols = {
            "chip_id": [chip_id] * 65_536
        }

        # load features
        for sat_i in [1, 2]:
            filepath = (
                df_metadata
                .loc[(df_metadata.chip_id == chip_id) & (df_metadata.satellite == f"S{sat_i}")]
                .filepath
                .values[0]
            )

            img = tiff.imread(filepath)
            for band_i, band in enumerate(tf.transpose(img, perm=(2, 0, 1)).numpy()):
                cols[f"s{sat_i}_band_{band_i + 1}"] = band.reshape(1, -1)[0]
    
        # create chip dataframe        
        df_chip = pd.DataFrame(cols).loc[:, X_COLUMNS]
    
        # get predictions
        preds.append(
            models[month].predict(df_chip).reshape(256, 256)
        )
    
    # save predictions
    tiff.imwrite(
        label_filepath, 
        data=(
            np
            .median(preds, axis=0)
            .astype(np.float32)
        )
    )
    
    if (i % 100 == 0) & (i != 0):
        print(f"--> finished prediciton of {i:,} tifs")

--> finished prediciton of 100 tifs
--> finished prediciton of 200 tifs
--> finished prediciton of 300 tifs
--> finished prediciton of 400 tifs
--> finished prediciton of 500 tifs
--> finished prediciton of 600 tifs
--> finished prediciton of 700 tifs
--> finished prediciton of 800 tifs
--> finished prediciton of 900 tifs
--> finished prediciton of 1,000 tifs
--> finished prediciton of 1,100 tifs
--> finished prediciton of 1,200 tifs
--> finished prediciton of 1,300 tifs
--> finished prediciton of 1,400 tifs
--> finished prediciton of 1,500 tifs
--> finished prediciton of 1,600 tifs
--> finished prediciton of 1,700 tifs
--> finished prediciton of 1,800 tifs
--> finished prediciton of 1,900 tifs
--> finished prediciton of 2,000 tifs
--> finished prediciton of 2,100 tifs
--> finished prediciton of 2,200 tifs
--> finished prediciton of 2,300 tifs
--> finished prediciton of 2,400 tifs
--> finished prediciton of 2,500 tifs
--> finished prediciton of 2,600 tifs
--> finished prediciton of 2,7

## Results

- The following submissions have been made. 
    - Model 5 (mean of monthly predictions)
        - Model has been trained on data subset (12 month, 100 chips per month).
        - Model has been tested on 0 chips.
        - Average RMSE equal to 57.4.
    - Model 6 (max of monthly predictions)
        - Model has been trained on data subset (12 month, 100 chips per month).
        - Model has been tested on 0 chips.
        - Average RMSE equal to 91.
    - Model 7 (median of monthly predictions)
        - Model has been trained on data subset (12 month, 100 chips per month).
        - Model has been tested on 0 chips.
        - Average RMSE equal to 56.        

*Yes, the results don't make very much the sense.*        