In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np
import utils

# data analysis
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import my_ML_algo as algo

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}



In [4]:
# Read csvs
print("--- read .csv files ---")
train_df = gpd.read_file('train.geojson', index_col=0)
# train_df = train_df.dropna()
test_df = gpd.read_file('test.geojson', index_col=0)
# test_df = test_df.dropna()

--- read .csv files ---


In [5]:
print(train_df.shape, test_df.shape)

(296146, 45) (120526, 44)


In [15]:
######## Feature engineering ########
print("--- Feature engineering ---")


def get_features(df, dataset_type):
    dic_features = {"names":[],"features":[]}

    # geometry features
    perimeter = np.asarray(df['geometry'].length)
    perimeter = np.expand_dims(perimeter, axis=-1)
    dic_features["features"].append(perimeter)
    dic_features["names"].append("perimeter")

    area_values = np.asarray(df['geometry'].area)
    area_values = np.expand_dims(area_values, axis=-1)
    dic_features["features"].append(area_values)
    dic_features["names"].append("area_values")

    def get_min_length_ratio(exte):
        x, y = exte.xy
        lengths = [
            np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(4)
        ]
        return np.min(lengths)

    def get_max_length_ratio(exte):
        x, y = exte.xy
        lengths = [
            np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(4)
        ]
        return np.max(lengths)

    
    min_lengths = np.asarray(df["geometry"].exterior.apply(get_min_length_ratio))
    min_lengths = np.expand_dims(min_lengths,axis=-1)

    max_lengths = np.asarray(df["geometry"].exterior.apply(get_max_length_ratio))
    max_lengths = np.expand_dims(max_lengths,axis=-1)

    ratio_min_max_lengths = min_lengths/max_lengths
    dic_features["features"].append(ratio_min_max_lengths)
    dic_features["names"].append("ratio_min_max_lengths")

    # dic_features["features"].append(min_lengths)
    # dic_features["names"].append("min_lengths")

    # dic_features["features"].append(max_lengths)
    # dic_features["names"].append("max_lengths")

    
    # geography features
    le_urban_type = LabelEncoder()
    urban_type = np.asarray(df["urban_type"])
    le_urban_type.fit(urban_type)
    # print("possible urban_type list :", list(le_urban_type.classes_))
    urban_type = le_urban_type.transform(urban_type)
    urban_type = np.expand_dims(urban_type, axis=-1)
    dic_features["features"].append(urban_type)
    dic_features["names"].append("urban_type")

    # le_geography_type = LabelEncoder()
    # geography_type = np.asarray(df["geography_type"])
    # le_geography_type.fit(geography_type)
    # # print("possible geography_type list :", list(le_geography_type.classes_))
    # geography_type = le_geography_type.transform(geography_type)
    # geography_type = np.expand_dims(geography_type, axis=-1)
    # features.append(geography_type)

    # add sequence features

    # kept_columns_dense_32 = np.arange(32)
    # model_dense_32_output = np.load(f"save/outputs/model_dense_32_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"][:,kept_columns_dense_32]
    # dic_features["features"].append(model_dense_32_output)
    # dic_features["names"]+=[f"model_dense_32_output_{j}" for j in kept_columns_dense_32]

    # kept_columns_dense_16 = np.arange(16)
    # model_dense_16_output = np.load(f"save/outputs/model_dense_16_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    # model_dense_16_output = model_dense_16_output[:,kept_columns_dense_16]
    # dic_features["features"].append(model_dense_16_output)
    # dic_features["names"]+=[f"model_dense_16_output_{j}" for j in kept_columns_dense_16]

    
    # PCA on sequence features :

    model_dense_32_output = np.load(f"save/outputs/model_dense_32_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    
    PCA_model_dense_32_output = algo.apply_PCA(model_dense_32_output)
    print("PCA 32 : ",PCA_model_dense_32_output.shape)
    dic_features["features"].append(PCA_model_dense_32_output)
    dic_features["names"]+=[f"PCA_model_dense_32_output_{j}" for j in np.arange(PCA_model_dense_32_output.shape[-1])]


    model_dense_16_output = np.load(f"save/outputs/model_dense_16_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    PCA_model_dense_16_output = algo.apply_PCA(model_dense_16_output)
    print("PCA 16 : ",PCA_model_dense_16_output.shape)
    dic_features["features"].append(PCA_model_dense_16_output)
    dic_features["names"]+=[f"PCA_model_dense_16_output_{j}" for j in np.arange(PCA_model_dense_16_output.shape[-1])]


    # for feat in features:
    #     print(feat.shape)

    res = np.concatenate(dic_features["features"], axis=-1)

    return res,dic_features


train_x,train_dic_features = get_features(train_df,dataset_type="train")
train_y = train_df['change_type'].apply(lambda x: change_type_map[x])

index_x = test_df["index"]
test_x,_ = get_features(test_df,dataset_type="test")

print("train_x.shape, train_y.shape, test_x.shape :\n",
      train_x.shape, train_y.shape, test_x.shape)


--- Feature engineering ---



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


PCA 32 :  (296146, 12)
PCA 16 :  (296146, 4)



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


PCA 32 :  (120526, 12)
PCA 16 :  (120526, 4)
train_x.shape, train_y.shape, test_x.shape :
 (296146, 20) (296146,) (120526, 20)


In [16]:
######## Training ########
print("--- train ---")
rnd_clf = RandomForestClassifier(verbose=1,n_jobs=-1)

rnd_clf.fit(train_x,train_y)
pred_y = rnd_clf.predict(train_x)
train_rnd_clf_f1_score = f1_score(pred_y, train_y,average='macro')
print("f1_score on training set :", train_rnd_clf_f1_score)
print("prediction on test set shape :", pred_y.shape)

utils.display_feature_importances(train_dic_features,rnd_clf)

print([estimator.get_depth() for estimator in rnd_clf.estimators_])
print([estimator.get_n_leaves() for estimator in rnd_clf.estimators_])

utils.plot_and_save_confusion_matrix(pred_y,train_y,"./results/confusion_matrix_simple_rnd_clfxCNN.png")



k_fold_number = 10
val_scores = [-1]
if k_fold_number>1:
    def display_scores(scores):
        print("Scores:",scores)
        print("Mean",scores.mean())
        print("Std:",scores.std())

    val_scores = cross_val_score(rnd_clf,train_x,train_y,scoring="f1_macro",cv=k_fold_number)
    display_scores(val_scores)


    utils.save_experiment_in_excel("./results/automatic_study_features_rndxCNN.xlsx",rnd_clf,train_rnd_clf_f1_score,k_fold_number,val_scores,train_dic_features,)


--- train ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   22.4s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.1s finished


f1_score on training set : 0.9999972175022197
prediction on test set shape : (296146,)
perimeter importance: 0.06397796689902732
area_values importance: 0.002266520493634809
ratio_min_max_lengths importance: 0.03535079426427134
urban_type importance: 0.010504239208602656
PCA_model_dense_32_output_0 importance: 0.03984066441008055
PCA_model_dense_32_output_1 importance: 0.05516322053075325
PCA_model_dense_32_output_2 importance: 0.07962050867702015
PCA_model_dense_32_output_3 importance: 0.07461725168387211
PCA_model_dense_32_output_4 importance: 0.06961474993311663
PCA_model_dense_32_output_5 importance: 0.03529056999609417
PCA_model_dense_32_output_6 importance: 0.030763754065506646
PCA_model_dense_32_output_7 importance: 0.04528849524931228
PCA_model_dense_32_output_8 importance: 0.030822618078052214
PCA_model_dense_32_output_9 importance: 0.034903815462257785
PCA_model_dense_32_output_10 importance: 0.040637224710224025
PCA_model_dense_32_output_11 importance: 0.04999402791620833
PC

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   21.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.4s
[Parallel(n

Scores: [0.55824714 0.5526009  0.56332018 0.54081452 0.5521717  0.55196117
 0.5298318  0.53255696 0.55007385 0.55472355]
Mean 0.5486301773758064
Std: 0.010294231286904973
experiment saved at ./results/automatic_study_features_rndxCNN.xlsx


  df = df_read.append(new_df, ignore_index=True)


In [23]:
######## Save results to submission file ########
pred_y = rnd_clf.predict(test_x)
print("--- save ---")
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
pred_df.to_csv("./my_submissions/my_submission_simple_rndxCNN.csv", index=True, index_label='Id')

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished


--- save ---


OSError: Cannot save file into a non-existent directory: 'my_submissions'