## In this notebook

- The first model: a pixel-level gradient boosting regressor trained on the S1 and S2 band values of a single month.

In [1]:
import os
import datetime
import shutil

# analytics
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
import numpy as np

# tif images processing
import tensorflow as tf
import tifffile as tiff

# model
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/opt/venv/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ['/opt/venv/lib/python3.8/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: cannot open shared object file: No such file or directory']


In [2]:
# --- notebook configuration ---

# CSV that is read into `df_metadata` (one row per feature tif).
FEATURES_METADATA_PROCESSED_FILEPATH = "../data/features_metadata_processed.csv"

# Only metadata rows whose `month` column equals this value are used.
FILTER_MONTH = "April"

# Number of training chips that are randomly sampled and loaded into memory.
CHIPS_TO_SELECT_CNT = 500

# Expected number of prediction tifs in a complete submission.
SUBMISSION_FILES_CNT = 2_773

## Load metadata

In [3]:
# Read the processed metadata and keep only the training rows of the selected
# month, in a single chain so no half-filtered frame is left in the namespace.
df_metadata = (
    pd.read_csv(FEATURES_METADATA_PROCESSED_FILEPATH)
    .loc[lambda d: (d["month"] == FILTER_MONTH) & (d["split"] == "train")]
    .reset_index(drop=True)
)

df_metadata.tail()

Unnamed: 0,filename,filepath,label_filename,label_filepath,chip_id,satellite,split,month
17373,ffc7d4f2_S2_07.tif,/usr/src/app/data/train_features/ffc7d4f2_S2_0...,ffc7d4f2_agbm.tif,/usr/src/app/data/train_agbm/ffc7d4f2_agbm.tif,ffc7d4f2,S2,train,April
17374,ffd8db7a_S1_07.tif,/usr/src/app/data/train_features/ffd8db7a_S1_0...,ffd8db7a_agbm.tif,/usr/src/app/data/train_agbm/ffd8db7a_agbm.tif,ffd8db7a,S1,train,April
17375,ffd8db7a_S2_07.tif,/usr/src/app/data/train_features/ffd8db7a_S2_0...,ffd8db7a_agbm.tif,/usr/src/app/data/train_agbm/ffd8db7a_agbm.tif,ffd8db7a,S2,train,April
17376,fff05995_S1_07.tif,/usr/src/app/data/train_features/fff05995_S1_0...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S1,train,April
17377,fff05995_S2_07.tif,/usr/src/app/data/train_features/fff05995_S2_0...,fff05995_agbm.tif,/usr/src/app/data/train_agbm/fff05995_agbm.tif,fff05995,S2,train,April


## Load data

In [4]:
# Build one pixel-level dataframe from a random sample of training chips:
# one row per pixel, one column per satellite band, plus the label.

# Fixed seed so the same chips are sampled on every "Restart & Run All".
chip_sampling_rng = np.random.default_rng(42)
sampled_chip_ids = chip_sampling_rng.choice(
    df_metadata.chip_id.unique(), size=CHIPS_TO_SELECT_CNT, replace=False
)

dfs = []

for chip_id in sampled_chip_ids:
    # all metadata rows of this chip (one row per satellite)
    chip_rows = df_metadata.loc[df_metadata.chip_id == chip_id]
    cols = {}

    # load features
    for sat_i in [1, 2]:
        filepath = chip_rows.loc[chip_rows.satellite == f"S{sat_i}", "filepath"].values[0]

        # The last image axis holds the bands: flatten (rows, cols, bands) to
        # (pixels, bands). Pixels stay in row-major order, so every band column
        # and the label column line up pixel by pixel.
        img = tiff.imread(filepath)
        pixels = img.reshape(-1, img.shape[2])
        for band_i in range(pixels.shape[1]):
            cols[f"s{sat_i}_band_{band_i + 1}"] = pixels[:, band_i]

    # load label; the path is looked up per chip instead of relying on the
    # loop variable `sat_i` leaking out of the feature loop
    label_filepath = chip_rows.label_filepath.values[0]
    cols["label"] = tiff.imread(label_filepath).reshape(-1)

    # create chip dataframe; the chip id is broadcast to however many pixels
    # the chip actually has instead of a hard-coded pixel count
    df_chip = pd.DataFrame(cols)
    df_chip.insert(0, "chip_id", chip_id)
    dfs.append(df_chip)

df = pd.concat(dfs, ignore_index=True)
df.tail()

Unnamed: 0,chip_id,s1_band_1,s1_band_2,s1_band_3,s1_band_4,s2_band_1,s2_band_2,s2_band_3,s2_band_4,s2_band_5,s2_band_6,s2_band_7,s2_band_8,s2_band_9,s2_band_10,s2_band_11,label
32767995,ceffcbd0,-17.131737,-25.479885,-20.648647,-25.854658,480,494,454,421,374,315,299,220,63,48,1,0.0
32767996,ceffcbd0,-17.212259,-25.835213,-20.602642,-25.964767,502,486,438,409,353,304,293,254,62,48,1,0.0
32767997,ceffcbd0,-17.065723,-26.330378,-19.379017,-27.494797,459,426,417,409,353,304,277,254,62,48,1,0.0
32767998,ceffcbd0,-17.64909,-26.382376,-20.543293,-27.995228,447,434,391,399,346,282,279,225,62,48,1,0.0
32767999,ceffcbd0,-18.141705,-26.63443,-21.294044,-27.807858,474,451,427,399,346,282,279,225,62,48,1,0.0


In [5]:
# show label distribution

label_threshold = 500

# Count with a boolean sum instead of materialising a filtered copy of the
# frame (tens of millions of rows). The message wording matches the `>=`
# comparison, which also counts values exactly equal to the threshold.
labels_at_or_over_cnt = int((df.label >= label_threshold).sum())
print(f"Number of values at or over {label_threshold}: {labels_at_or_over_cnt:,}")

df.label.describe()

Number of values over 500: 6,544


count    3.276800e+07
mean     6.099969e+01
std      7.118694e+01
min      0.000000e+00
25%      0.000000e+00
50%      4.145000e+01
75%      9.650000e+01
max      5.753690e+03
Name: label, dtype: float64

## Create model

In [6]:
# Target column of the pixel-level dataframe.
Y_COLUMN = "label"

# Feature columns: bands 1-4 of satellite 1 followed by bands 1-11 of satellite 2.
X_COLUMNS = [
    f"s{sat_i}_band_{band_i}"
    for sat_i, bands_cnt in ((1, 4), (2, 11))
    for band_i in range(1, bands_cnt + 1)
]

In [7]:
# split data by chip into train and test sets, then into X and y

# Fixed seed so the same hold-out chips are drawn on every "Restart & Run All".
split_rng = np.random.default_rng(42)

# 1% of the sampled chips (rounded up) are held out; splitting by chip keeps
# all pixels of one chip on the same side of the split.
chips_cnt_test = np.ceil(CHIPS_TO_SELECT_CNT * 0.01).astype(int)
test_chip_ids = split_rng.choice(df.chip_id.unique(), size=chips_cnt_test, replace=False)

# compute the membership mask once instead of once per subset
is_test_row = df.chip_id.isin(test_chip_ids)

df_train = df.loc[~is_test_row].copy()
df_test = df.loc[is_test_row].copy()

X_train = df_train.loc[:, X_COLUMNS]
y_train = df_train.loc[:, Y_COLUMN]
X_test = df_test.loc[:, X_COLUMNS]
y_test = df_test.loc[:, Y_COLUMN]

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")
print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

X_train.shape=(32440320, 15)
y_train.shape=(32440320,)
X_test.shape=(327680, 15)
y_test.shape=(327680,)


In [8]:
# define and fit pipeline
# (baseline kept for reference only, not executed: the model without feature scaling)

# pipeline = Pipeline([
#     ("model", GradientBoostingRegressor())
# ])

# pipeline.fit(X_train, y_train)

In [None]:
%%time

# define numerical transformers

transformers_numerical = Pipeline(
    steps=[
        ("min_max_scaler", MinMaxScaler())
    ]
)

# define and fit pipeline

transformers = ColumnTransformer(
    transformers=[
        ("numerical", transformers_numerical, X_COLUMNS),
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("transformers", transformers),
    ("model", GradientBoostingRegressor())
])

pipeline.fit(X_train, y_train)

In [None]:
# check performance on test dataset

df_test["pred"] = pipeline.predict(X_test)

# One RMSE per hold-out chip. `sort=False` keeps the chips in order of first
# appearance, and a single groupby replaces two boolean filters per chip.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# `mean_squared_error` is deprecated in recent scikit-learn releases.
rmses = [
    float(np.sqrt(mean_squared_error(chip_rows["label"], chip_rows["pred"])))
    for _, chip_rows in df_test.groupby("chip_id", sort=False)
]

print("Results:")
print(f"- all RMSEs: {[round(rmse, 1) for rmse in rmses]}")
print(f"- average RMSEs: {np.mean(rmses):,.3f}")

## Create submission

In [None]:
# Re-read the processed metadata and keep only the test rows of the selected month.
# NOTE: this rebinds `df_metadata`, which held the training rows until this
# point; re-running earlier cells after this one would see the test rows.
df_metadata = (
    pd.read_csv(FEATURES_METADATA_PROCESSED_FILEPATH)
    .loc[lambda d: (d["month"] == FILTER_MONTH) & (d["split"] == "test")]
    .reset_index(drop=True)
)

df_metadata.tail()

In [None]:
# Sanity checks before predicting.

# two metadata rows (feature tifs) per expected submission file
metadata_rows_cnt = len(df_metadata)
assert metadata_rows_cnt == 2 * SUBMISSION_FILES_CNT

# one distinct label filename per expected submission file
label_filenames_cnt = df_metadata.label_filename.nunique()
assert label_filenames_cnt == SUBMISSION_FILES_CNT

In [None]:
%%time

for i, chip_id in enumerate(df_metadata.chip_id.unique()):
    cols = {
        "chip_id": [chip_id] * 65_536
    }
    
    # load features
    for sat_i in [1, 2]:
        filepath = (
            df_metadata
            .loc[(df_metadata.chip_id == chip_id) & (df_metadata.satellite == f"S{sat_i}")]
            .filepath
            .values[0]
        )
        
        img = tiff.imread(filepath)
        for band_i, band in enumerate(tf.transpose(img, perm=(2, 0, 1)).numpy()):
            cols[f"s{sat_i}_band_{band_i + 1}"] = band.reshape(1, -1)[0]
    
    # read label filepath
    label_filepath = (
        df_metadata
        .loc[(df_metadata.chip_id == chip_id) & (df_metadata.satellite == f"S{sat_i}")]
        .label_filepath
        .values[0]
    )
    
    # create chip dataframe        
    df_chip = pd.DataFrame(cols).loc[:, X_COLUMNS]
    
    # save predictions
    tiff.imwrite(
        label_filepath, 
        data=pipeline.predict(df_chip).reshape(256, 256).astype(np.float32)
    )
    
    if (i % 100 == 0) & (i != 0):
        print(f"--> finished prediciton of {i:,} tifs")

## Results

- The following submissions have been made. 
    - Model 3
        - Model has been trained on data subset (1 month, 22 chips).
        - Model has been tested on 3 chips only.
        - RMSE on one chip test data equal to 78.7.
        - Average RMSE equal to 46.1.
    - Model 4
        - Model has been trained on data subset (1 month, 99 chips).
        - Model has been tested on 1 chip only.
        - RMSE on one chip test data equal to 15.3.
        - Average RMSE equal to 45.6.
        
*Yes, the results don't make much sense — most likely because the local test sets are tiny (1–3 chips), which makes the local RMSE very noisy.*