In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np
import utils

# data analysis
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score



import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import pathlib
import os


#################### SET GPU ####################
print("tf.__version__:", tf.__version__)

physical_devices = tf.config.list_physical_devices("GPU")
print("Num GPUs Available: ", len(physical_devices))
# Allocate GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (instead of indexing element 0) keeps this cell
# runnable on CPU-only machines, where the list is empty, and applies the same
# setting to every visible GPU.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

# Default float type used by Keras layers and weights.
DTYPE = 'float32'
tf.keras.backend.set_floatx(DTYPE)

# Let tf.data pick the prefetch buffer size dynamically.
AUTOTUNE = tf.data.AUTOTUNE


#################### CONFIG ####################
import yaml
from yaml.loader import SafeLoader


# Hyper-parameters for the input pipeline (shuffle buffer size, batch size) live in a YAML file.
with open('CNN_config.yaml') as config_file:
    config = yaml.safe_load(config_file)

print("CNN_config:\n", config)

tf.__version__: 2.10.1
Num GPUs Available:  1
CNN_config:
 {'BUFFER_SIZE': 100000, 'BATCH_SIZE': 32768}


In [4]:
# Integer class id assigned to each `change_type` label.
change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}

# Read the GeoJSON files into GeoDataFrames.
# `index_col` is a `pandas.read_csv` option, not a `geopandas.read_file` parameter: it was
# merely forwarded to the I/O engine as an unrecognised option, so it is no longer passed.
print("--- read .csv files ---")
train_df = gpd.read_file('train.geojson')

test_df = gpd.read_file('test.geojson')

--- read .csv files ---


In [9]:
print("--- Feature engineering ---")

def get_features_terrain(df):
    """Build the per-polygon ("terrain") feature matrix.

    Four columns are produced, in this order: perimeter, area, ratio between the
    shortest and the longest of the first four exterior edges, and the
    label-encoded ``urban_type``.

    Args:
        df: GeoDataFrame with a ``geometry`` column (geometries exposing an
            exterior ring) and an ``urban_type`` column.

    Returns:
        tuple: ``(res, dic_features)`` where ``res`` is the ``(n_rows, 4)``
        concatenation of the feature columns and ``dic_features`` is a dict with
        keys ``"names"`` and ``"features"`` (one ``(n_rows, 1)`` array per name).
    """
    dic_features = {"names":[],"features":[]}

    # geometry features
    # Length and area are taken in the units of the frame's current CRS; no
    # reprojection is done here.
    perimeter = np.asarray(df['geometry'].length)
    perimeter = np.expand_dims(perimeter, axis=-1)
    dic_features["features"].append(perimeter)
    dic_features["names"].append("perimeter")

    area_values = np.asarray(df['geometry'].area)
    area_values = np.expand_dims(area_values, axis=-1)
    dic_features["features"].append(area_values)
    dic_features["names"].append("area_values")

    def length_ratio(exte):
        """Return min/max edge length over the first (up to) four edges of a ring."""
        x, y = exte.xy
        # A ring with k coordinates has k - 1 edges. Keep using the first four
        # edges, but never index past the end of a shorter ring.
        n_edges = min(4, len(x) - 1)
        if n_edges < 1:
            return 0.0
        lengths = [
            np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(n_edges)
        ]
        longest = np.max(lengths)
        # Degenerate ring (every sampled edge has zero length): avoid 0/0 -> NaN,
        # which would otherwise propagate into the feature matrix.
        if longest == 0:
            return 0.0
        return np.min(lengths) / longest


    ratios = np.asarray(df["geometry"].exterior.apply(length_ratio))
    ratios = np.expand_dims(ratios, axis=-1)
    dic_features["features"].append(ratios)
    dic_features["names"].append("ratios")

    # geography features
    # NOTE(review): the encoder is fitted on the frame passed in, so the integer
    # codes of two frames only agree when both contain the same set of
    # ``urban_type`` values -- confirm this holds for train vs. test.
    le_urban_type = LabelEncoder()
    urban_type = np.asarray(df["urban_type"])
    le_urban_type.fit(urban_type)
    urban_type = le_urban_type.transform(urban_type)
    urban_type = np.expand_dims(urban_type, axis=-1)
    dic_features["features"].append(urban_type)
    dic_features["names"].append("urban_type")


    res = np.concatenate(dic_features["features"], axis=-1)

    return res,dic_features

    

def get_features_sequence(df):
    """Build the per-date ("sequence") feature tensor.

    For every row the five acquisitions are reordered by ascending date key and
    each one is described by 7 values: mean of red/blue/green, std of
    red/blue/green, and the label-encoded change status.

    Args:
        df: DataFrame providing the columns ``date{0..4}``,
            ``img_{red,blue,green}_{mean,std}_date{1..5}`` and
            ``change_status_date{0..4}``.

    Returns:
        tuple: ``(res, dic_features)`` where ``res`` has shape ``(n_rows, 5, 7)``
        and ``dic_features`` is a dict with keys ``"names"`` and ``"features"``.
    """
    dic_features = {"names":[],"features":[]}
    # Column vector of row numbers, used to reorder the date axis of each row.
    row_index = np.arange(df.shape[0])[:, None]

    # dates/images features
    def get_sorted_date_diff_with_indices(df):
        """Return the sorted date differences and the per-row sorting permutation."""
        dates_to_add = []
        for i in range(5):
            # Only the last four characters of each value are parsed as an integer
            # (presumably the year -- TODO confirm the date format). Missing dates
            # (None) become 0 and therefore sort first.
            date = df[f'date{i}'].apply(lambda x: int(str(x)[-4:]) if x is not None else 0)
            dates_to_add.append(np.expand_dims(np.asarray(date), axis=-1))

        dates = np.concatenate(dates_to_add,axis=-1)

        indices_dates = np.argsort(dates,axis=-1)
        dates = dates[np.arange(dates.shape[0])[:, None], indices_dates]

        # Difference between each date key and the latest one of the row (always <= 0).
        date_diff = dates - dates[:, -1:]
        return date_diff, indices_dates

    # The date differences are computed but not currently used as features.
    _, indices_dates = get_sorted_date_diff_with_indices(df)

    # colors
    trad_colors = {"red":0,"blue":1,"green":2}
    def get_mean_std(df):
        """Return an ``(n_rows, 5, 3, 2)`` array: date slot x colour x (mean, std)."""
        colors = list(trad_colors.keys())
        res = np.zeros((df.shape[0],5,3,2))
        # Image columns are numbered 1..5; they are stored in slots 0..4.
        for i in range(1,6):
            for j_color,color in enumerate(colors):
                for k_stat, stat in enumerate(("mean", "std")):
                    res[:,i-1,j_color,k_stat] = np.asarray(df[f"img_{color}_{stat}_date{i}"])
                    column = res[:,i-1,j_color,k_stat]
                    # Missing values are replaced by the mean of that column in `df` itself.
                    res[:,i-1,j_color,k_stat] = np.nan_to_num(column, nan=np.nanmean(column))
        return res

    color_mean_std = get_mean_std(df)
    # Reorder the date axis of every row with that row's sorting permutation.
    color_mean_std = color_mean_std[row_index, indices_dates]
    # (n, 5, 3, 2) -> (n, 5, 6): the three means first, then the three stds.
    color_mean_std = np.concatenate([color_mean_std[:,:,:,i] for i in range(color_mean_std.shape[-1])],axis=-1)

    dic_features["features"].append(color_mean_std)
    dic_features["names"]+=[f"color_mean_{i}" for i in range(3)]
    dic_features["names"]+=[f"color_std_{i}" for i in range(3)]


    # change statuses
    # Fit ONE encoder on the values of all five columns. Re-fitting per column gives
    # the same status different integer codes whenever a column lacks one of the
    # statuses; with identical value sets in every column the codes are unchanged.
    # NOTE(review): the encoder is still fitted per call, so codes of two frames only
    # agree when both contain the same set of statuses -- confirm for train vs. test.
    status_columns = [np.asarray(df[f"change_status_date{i}"]) for i in range(5)]
    le_change_status = LabelEncoder()
    le_change_status.fit(np.concatenate(status_columns))
    change_statuses = np.zeros((df.shape[0],5),dtype=int)
    for i, status_column in enumerate(status_columns):
        change_statuses[:,i] = le_change_status.transform(status_column)

    change_statuses = change_statuses[row_index, indices_dates]
    change_statuses = np.expand_dims(change_statuses,axis=-1)
    dic_features["features"].append(change_statuses)
    dic_features["names"]+=["change_statuses"]


    # final concatenation
    res = np.concatenate(dic_features["features"], axis=-1)

    return res,dic_features

def get_features_for_full_NN(df):
    """Compute both feature families for `df`.

    Returns the terrain matrix, the sequence tensor, and a dict holding each
    family's feature description under the keys "terrain" and "sequence".
    """
    terrain_matrix, terrain_description = get_features_terrain(df)
    sequence_tensor, sequence_description = get_features_sequence(df)
    return (
        terrain_matrix,
        sequence_tensor,
        {"terrain": terrain_description, "sequence": sequence_description},
    )

    
# train_df_without_na,_,train_indices,train_dummy_values = handle_na_in_df(train_df)
# Build the two feature arrays (per-polygon "terrain", per-date "sequence") for the training set.
train_terrain_x,train_sequence_x,dic_features= get_features_for_full_NN(train_df)
print("train:")
for k in dic_features.keys():
    print(k)
    utils.display_features(dic_features[k])
    
# Integer targets: each change_type string is looked up in change_type_map
# (an unmapped label raises KeyError).
train_y = train_df['change_type'].apply(lambda x: change_type_map[x])

# NOTE(review): this is a separate call, so the label encoders inside the feature
# functions are re-fitted on the test frame -- confirm that train and test contain
# the same category values, otherwise their integer codes will not line up.
test_terrain_x,test_sequence_x,dic_features_test= get_features_for_full_NN(test_df)

print("test:")
for k in dic_features_test.keys():
    print(k)
    utils.display_features(dic_features_test[k])

--- Feature engineering ---



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


train:
terrain
['perimeter'] (296146, 1)
['area_values'] (296146, 1)
['ratios'] (296146, 1)
['urban_type'] (296146, 1)
sequence
['color_mean_0', 'color_mean_1', 'color_mean_2', 'color_std_0', 'color_std_1', 'color_std_2'] (296146, 5, 6)
['change_statuses'] (296146, 5, 1)



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


test:
terrain
['perimeter'] (120526, 1)
['area_values'] (120526, 1)
['ratios'] (120526, 1)
['urban_type'] (120526, 1)
sequence
['color_mean_0', 'color_mean_1', 'color_mean_2', 'color_std_0', 'color_std_1', 'color_std_2'] (120526, 5, 6)
['change_statuses'] (120526, 5, 1)


#### Optimize the input pipeline with a TensorFlow Dataset

In [10]:
def split_train_val(data, split_ratio=0.1, seed=42):
    """Shuffle `data` along its first axis and split it into train/validation parts.

    The permutation depends only on `seed` and on the number of rows, so calling
    this function separately on several containers of equal length (features,
    labels, ...) keeps their rows aligned.

    Args:
        data: NumPy array or pandas Series/DataFrame; rows are selected by position.
        split_ratio: Fraction of the rows assigned to the validation part.
        seed: Value passed to `np.random.seed` before shuffling (this reseeds
            NumPy's global random generator, as before).

    Returns:
        tuple: `(train_data, val_data)`. The first `int(split_ratio * n_rows)`
        shuffled rows form `val_data`, the remaining ones `train_data`. Both have
        the same container type as `data`.
    """
    n_rows = data.shape[0]
    val_size = int(split_ratio * n_rows)
    np.random.seed(seed)
    shuffle_indices = np.arange(0, n_rows)
    np.random.shuffle(shuffle_indices)

    # pandas objects: plain `[]` indexing is label-based and integer slicing of a
    # Series is deprecated, so select explicitly by position with `.iloc`.
    if hasattr(data, "iloc"):
        data_shuffled = data.iloc[shuffle_indices]
        return data_shuffled.iloc[val_size:], data_shuffled.iloc[:val_size]

    data_shuffled = data[shuffle_indices]
    return data_shuffled[val_size:], data_shuffled[:val_size]

# Every call applies the same seeded permutation, so rows stay aligned across the three splits.
new_train_terrain_x, new_val_terrain_x = split_train_val(train_terrain_x)
new_train_sequence_x, new_val_sequence_x = split_train_val(train_sequence_x)
new_train_y, new_val_y = split_train_val(train_y)

# Report the shape of every split.
for split_label, split_array in (
    ("new_train_terrain_x shape", new_train_terrain_x),
    ("new_val_terrain_x shape", new_val_terrain_x),
    ("new_train_sequence_x shape", new_train_sequence_x),
    ("new_val_sequence_x shape", new_val_sequence_x),
    ("new_train_y shape", new_train_y),
    ("new_val_y shape", new_val_y),
):
    print(split_label, split_array.shape)

new_train_terrain_x shape (266532, 4)
new_val_terrain_x shape (29614, 4)
new_train_sequence_x shape (266532, 5, 7)
new_val_sequence_x shape (29614, 5, 7)
new_train_y shape (266532,)
new_val_y shape (29614,)


  val_data = data_shuffled[:val_size]
  train_data = data_shuffled[val_size:]


In [11]:
# Pair the two model inputs (terrain, sequence) into one dataset element, then zip with the
# labels so that every element is ((terrain, sequence), label). The labels receive a
# trailing axis of size 1.
train_dataset_x = tf.data.Dataset.from_tensor_slices((new_train_terrain_x,new_train_sequence_x))
train_dataset_y = tf.data.Dataset.from_tensor_slices(np.expand_dims(new_train_y,axis=-1))
train_dataset = tf.data.Dataset.zip((train_dataset_x, train_dataset_y))

# Same element structure for the validation split.
val_dataset_x = tf.data.Dataset.from_tensor_slices((new_val_terrain_x,new_val_sequence_x))
val_dataset_y = tf.data.Dataset.from_tensor_slices(np.expand_dims(new_val_y,axis=-1))
val_dataset = tf.data.Dataset.zip((val_dataset_x, val_dataset_y))


def configure_for_performance(ds, batch_size, shuffle=True):
    """Cache, optionally shuffle, batch and prefetch a `tf.data.Dataset`.

    Args:
        ds: Unbatched dataset.
        batch_size: Number of elements per batch.
        shuffle: When True (the default, i.e. the previous behaviour) the elements
            are reshuffled on every iteration through a buffer of
            `config["BUFFER_SIZE"]` elements. Pass False whenever the output order
            must match the input order (evaluation, prediction).

    Returns:
        The configured dataset.
    """
    ds = ds.cache()
    if shuffle:
        ds = ds.shuffle(
            buffer_size=config["BUFFER_SIZE"], reshuffle_each_iteration=True)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(buffer_size=AUTOTUNE)
    return ds


train_dataset = configure_for_performance(train_dataset, config["BATCH_SIZE"])
# Validation metrics do not depend on the element order, so the shuffle is skipped here.
val_dataset = configure_for_performance(val_dataset, config["BATCH_SIZE"], shuffle=False)

# Sanity check: print the shapes of the first few training batches.
for (batch_terrain_x,batch_sequence_x),batch_y in train_dataset.take(5):
    print("batch_x.shape,batch_sequence_x,batch_y.shape\n",batch_terrain_x.shape,batch_sequence_x.shape,batch_y.shape)

batch_x.shape,batch_sequence_x,batch_y.shape
 (32768, 4) (32768, 5, 7) (32768, 1)
batch_x.shape,batch_sequence_x,batch_y.shape
 (32768, 4) (32768, 5, 7) (32768, 1)
batch_x.shape,batch_sequence_x,batch_y.shape
 (32768, 4) (32768, 5, 7) (32768, 1)
batch_x.shape,batch_sequence_x,batch_y.shape
 (32768, 4) (32768, 5, 7) (32768, 1)
batch_x.shape,batch_sequence_x,batch_y.shape
 (32768, 4) (32768, 5, 7) (32768, 1)


#### Create and train CNN

In [14]:
# Per-sample input shapes (first axis dropped) of the terrain and sequence feature arrays.
input_terrain_shape = train_terrain_x.shape[1:]
input_sequence_shape = train_sequence_x.shape[1:]

def get_CNN_model(input_shape):
    """Build the 1-D convolutional network as three nested Sequential models.

    Args:
        input_shape: Shape of one input sample, given to the first Conv1D layer.

    Returns:
        tuple: ``(model_dense_32, model_dense_16, CNN_model)``. The first is the
        convolutional stack ending in Dense(32), the second adds Dense(16) on top
        of it, and the third adds the 6-way softmax layer. The summary of the
        third model is printed as a side effect.
    """
    layers = tf.keras.layers

    # One entry per convolutional stage: (number of filters, dropout rate applied
    # after the stage's batch normalisation, or None for no dropout).
    conv_plan = [
        (256, 0.5),
        (256, 0.3),
        (128, None),
        (128, None),
        (64, None),
        (64, None),
        (32, None),
    ]

    feature_layers = []
    for position, (n_filters, drop_rate) in enumerate(conv_plan):
        conv_kwargs = dict(
            filters=n_filters,
            kernel_size=3,
            padding="same",
            activation='relu',
            kernel_initializer="HeNormal",
        )
        # Only the very first layer declares the input shape.
        if position == 0:
            conv_kwargs["input_shape"] = input_shape
        feature_layers.append(layers.Conv1D(**conv_kwargs))
        feature_layers.append(layers.BatchNormalization())
        if drop_rate is not None:
            feature_layers.append(layers.Dropout(drop_rate))
    feature_layers.append(layers.Flatten())
    feature_layers.append(layers.Dense(32, activation='relu', kernel_initializer="HeNormal"))

    model_dense_32 = tf.keras.Sequential(feature_layers, name='model_dense_32')

    model_dense_16 = tf.keras.Sequential(
        [model_dense_32, layers.Dense(16, activation='relu', kernel_initializer="HeNormal")],
        name="model_dense_16",
    )

    CNN_model = tf.keras.Sequential([model_dense_16, layers.Dense(6, activation='softmax')])

    CNN_model.summary()
    return model_dense_32, model_dense_16, CNN_model


class FullNNModel(keras.Model):
    """Two-branch classifier combining terrain features and per-date sequences.

    A dense branch processes the terrain features, the convolutional stack returned
    by `get_CNN_model` (cut at its Dense(32) layer) processes the sequences, and a
    dense head maps the concatenation of both branch outputs to 6 softmax scores.

    Args:
        input_terrain_shape: Accepted for symmetry but not used; the dense branch
            infers its input size on first call.
        input_sequence_shape: Shape of one sequence sample, forwarded to
            `get_CNN_model`.
        **kwargs: Forwarded to `keras.Model`.
    """

    def __init__(self,input_terrain_shape,input_sequence_shape,**kwargs):
        super().__init__(**kwargs)
        # Only the first of the three models is kept. Note that get_CNN_model also
        # prints the summary of its full CNN, not of this combined model.
        self.CNN_model_cut_at_dense_32, _,_ = get_CNN_model(input_sequence_shape)
        self.terrain_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(128,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(0.3),
            tf.keras.layers.Dense(128,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(64,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(64,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(32,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
        ],name='terrain_model')

        self.output_model = tf.keras.Sequential([
            tf.keras.layers.Dense(64,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(32,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dense(16,activation="elu",kernel_initializer="HeNormal"),
            tf.keras.layers.Dense(6,activation="softmax",kernel_initializer="GlorotNormal")
        ])


    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Pair ``(terrain_batch, sequence_batch)``.
            training: Training-mode flag forwarded to the sub-models (which contain
                dropout and batch normalisation). ``None`` lets Keras decide.

        Returns:
            Tensor of softmax scores with 6 entries per sample.
        """
        input_terrain, input_sequence = inputs
        hidden_terrain = self.terrain_model(input_terrain, training=training)
        hidden_sequences = self.CNN_model_cut_at_dense_32(input_sequence, training=training)
        # Plain tensor op instead of building a new Concatenate layer on every call.
        concat = tf.concat([hidden_terrain, hidden_sequences], axis=-1)
        main_output = self.output_model(concat, training=training)
        return main_output


# Instantiate the two-branch model (construction prints the summary emitted by get_CNN_model).
full_NN_model = FullNNModel(input_terrain_shape,input_sequence_shape)

# Sparse categorical cross-entropy / accuracy: the targets are integer class ids, not one-hot vectors.
full_NN_model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
                loss=keras.losses.SparseCategoricalCrossentropy(),
                metrics=['sparse_categorical_accuracy'])


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_dense_16 (Sequential)  (None, 16)               402768    
                                                                 
 dense_28 (Dense)            (None, 6)                 102       
                                                                 
Total params: 402,870
Trainable params: 401,014
Non-trainable params: 1,856
_________________________________________________________________


In [15]:
# Stop when the validation accuracy has not improved for 200 consecutive epochs
# and restore the best weights seen so far.
early_stopping_cb = keras.callbacks.EarlyStopping(
    monitor="val_sparse_categorical_accuracy",
    patience=200,
    restore_best_weights=True,
)

# The epoch count is only an upper bound; early stopping is expected to end the run.
history = full_NN_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10000,
    callbacks=[early_stopping_cb],
)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

In [25]:
# Persist the trained weights (the path has no extension).
full_NN_model.save_weights('./save/weights_full_NN_model_val_0f')

# Rebuild a fresh model instance and load the saved weights into it; building the
# model prints the get_CNN_model summary again.
full_NN_model = FullNNModel(input_terrain_shape,input_sequence_shape)
full_NN_model.load_weights('./save/weights_full_NN_model_val_0f')

Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model_dense_16 (Sequential)  (None, 16)               402768    
                                                                 
 dense_142 (Dense)           (None, 6)                 102       
                                                                 
Total params: 402,870
Trainable params: 401,014
Non-trainable params: 1,856
_________________________________________________________________


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1ff1bb86ec0>

#### Prediction and error plot

In [46]:
# List the metric names recorded in the training history.
print(history.history.keys())

dict_keys(['loss', 'sparse_categorical_accuracy'])


In [28]:
utils.plot_and_save_history(history,"./results/history_full_NN_model_val_0f.png")

# `train_dataset` is reshuffled on every iteration, so predictions made on it cannot be
# matched row by row with `new_train_y`. Predict instead on an ordered, batched view of
# the same arrays, keeping the ((terrain, sequence), label) element structure.
ordered_train_dataset = tf.data.Dataset.from_tensor_slices(
    ((new_train_terrain_x, new_train_sequence_x), np.expand_dims(new_train_y, axis=-1))
).batch(config["BATCH_SIZE"])

pred_y = full_NN_model.predict(ordered_train_dataset)
pred_y = np.argmax(pred_y,axis=-1)

utils.plot_and_save_confusion_matrix(pred_y,new_train_y,"./results/confusion_matrix_full_NN_model.png")




### Send predictions

In [29]:
# Dummy labels keep the same ((terrain, sequence), y) element structure used for training.
test_dataset_x = tf.data.Dataset.from_tensor_slices((test_terrain_x,test_sequence_x))
dummy_test_y = np.zeros((test_terrain_x.shape[0],1))
test_dataset_y = tf.data.Dataset.from_tensor_slices(dummy_test_y)
test_dataset = tf.data.Dataset.zip((test_dataset_x, test_dataset_y))

# Do NOT shuffle the test set: prediction i must belong to test row i, because the
# submission file uses the row position as `Id`. Only batch and prefetch.
test_dataset = test_dataset.batch(config["BATCH_SIZE"]).prefetch(buffer_size=AUTOTUNE)

pred_y = full_NN_model.predict(test_dataset)
pred_y = np.argmax(pred_y,axis=-1)
assert pred_y.shape[0] == test_terrain_x.shape[0], "one prediction per test row expected"
print("prediction on test set shape :", pred_y.shape)
print(pred_y)


######## Save results to submission file ########
print("--- save ---")
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
pred_df.to_csv("my_submission_full_NN.csv", index=True, index_label='Id')

prediction on test set shape : (120526,)
[2 3 2 ... 3 2 2]
--- save ---
