In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to local helper modules (e.g. `utils`) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np
import utils

# data analysis
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score

# Integer class id of every change-type label; ids follow the listing order.
change_type_map = {
    label: class_id
    for class_id, label in enumerate(
        ['Demolition', 'Road', 'Residential', 'Commercial', 'Industrial', 'Mega Projects']
    )
}



In [3]:
# Load the train / test GeoJSON files as GeoDataFrames.
print("--- read .geojson files ---")
# `index_col` is a pandas.read_csv option, not a geopandas.read_file parameter,
# so it is not passed here.
train_df = gpd.read_file('train.geojson')
test_df = gpd.read_file('test.geojson')
# Rows with missing values are kept: no dropna() is applied to either frame.

--- read .csv files ---


In [4]:
# Sanity check: shape of the train frame followed by the shape of the test frame.
print(*(frame.shape for frame in (train_df, test_df)))

(296146, 45) (120526, 44)


In [16]:
######## Feature engineering ########
# Progress banner printed before the feature matrices are built.
print("--- Feature engineering ---")


def get_features(df, dataset_type):
    """Assemble the feature matrix for one dataset split.

    Feature columns are concatenated in this order:

    * ``perimeter`` and ``area_values``: ``length`` and ``area`` of
      ``df['geometry']``;
    * ``ratio_min_max_lengths``: shortest divided by longest of the first four
      exterior edges of each geometry;
    * ``urban_type``: label-encoded ``df['urban_type']``;
    * selected columns of three pre-computed model outputs loaded from
      ``./save/*.npz`` (dense-32: columns 0, 6, 15, 19; dense-16: columns 0-15;
      CNN: columns 0-5).

    Parameters
    ----------
    df : GeoDataFrame-like
        Must provide a ``'geometry'`` column exposing ``length``, ``area`` and
        ``exterior``, and an ``'urban_type'`` column.
    dataset_type : str
        Inserted into the ``.npz`` file names to select the split.

    Returns
    -------
    res : numpy.ndarray
        2-D array with one column per entry of ``dic_features["names"]``.
    dic_features : dict
        ``{"names": [...], "features": [...]}``: the column names and the list
        of 2-D arrays that were concatenated into ``res``.
    """
    dic_features = {"names": [], "features": []}

    def add_block(block, names):
        # Keep the arrays and their column names in lock-step.
        dic_features["features"].append(block)
        dic_features["names"].extend(names)

    def as_column(values):
        # (n,) -> (n, 1) so the block can be concatenated along the last axis.
        return np.expand_dims(np.asarray(values), axis=-1)

    def load_columns(path, columns):
        # The context manager closes the .npz archive once the array is read.
        with np.load(path) as archive:
            return archive["arr_0"][:, columns]

    def edge_lengths(exte):
        # Lengths of the first four edges of an exterior ring.
        x, y = exte.xy
        return [
            np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(4)
        ]

    # geometry features
    add_block(as_column(df['geometry'].length), ["perimeter"])
    add_block(as_column(df['geometry'].area), ["area_values"])

    # Edge lengths are computed once and reused for both the min and the max.
    sides = np.asarray(
        [edge_lengths(ring) for ring in df["geometry"].exterior], dtype=float
    ).reshape(-1, 4)
    min_lengths = sides.min(axis=1, keepdims=True)
    max_lengths = sides.max(axis=1, keepdims=True)
    ratio_min_max_lengths = min_lengths / max_lengths
    add_block(ratio_min_max_lengths, ["ratio_min_max_lengths"])

    # geography features
    # NOTE(review): the encoder is fitted on whatever `df` is passed in, so the
    # train and test calls only produce matching integer codes when both splits
    # contain the same set of urban_type values -- confirm.
    urban_type = LabelEncoder().fit_transform(np.asarray(df["urban_type"]))
    add_block(as_column(urban_type), ["urban_type"])
    # (geography_type is currently not used as a feature.)

    # sequence features: pre-computed model outputs; each array needs as many
    # rows as `df` for the concatenation below to succeed.
    kept_columns_dense_32 = [0, 6, 15, 19]
    model_dense_32_output = load_columns(
        f"./save/model_dense_32_{dataset_type}_output_val_0f7151.npz", kept_columns_dense_32
    )
    add_block(
        model_dense_32_output,
        [f"model_dense_32_output_{j}" for j in kept_columns_dense_32],
    )

    kept_columns_dense_16 = np.arange(16)
    model_dense_16_output = load_columns(
        f"./save/model_dense_16_{dataset_type}_output_val_0f7151.npz", kept_columns_dense_16
    )
    add_block(
        model_dense_16_output,
        [f"model_dense_16_output_{j}" for j in kept_columns_dense_16],
    )

    kept_columns_dense_6 = np.arange(6)
    # Bug fix: slice the CNN output itself. The previous code sliced
    # `model_dense_16_output` here, which discarded the CNN file and duplicated
    # the first six dense-16 columns.
    model_dense_6_output = load_columns(
        f"./save/CNN_model_{dataset_type}_output_val_0f7151.npz", kept_columns_dense_6
    )
    add_block(
        model_dense_6_output,
        [f"model_dense_6_output_{i}" for i in range(model_dense_6_output.shape[-1])],
    )

    res = np.concatenate(dic_features["features"], axis=-1)

    return res, dic_features


# Training split: feature matrix, per-column bookkeeping and integer labels.
train_x, train_dic_features = get_features(train_df, dataset_type="train")
train_y = train_df['change_type'].apply(lambda label: change_type_map[label])

# Test split: keep the "index" column aside and build the same features.
index_x = test_df["index"]
test_x, _ = get_features(test_df, dataset_type="test")

shape_report = (train_x.shape, train_y.shape, test_x.shape)
print("train_x.shape, train_y.shape, test_x.shape :\n", *shape_report)


--- Feature engineering ---



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)

  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


train_x.shape, train_y.shape, test_x.shape :
 (296146, 8) (296146,) (120526, 8)


In [18]:
######## Training ########
print("--- train ---")
# Fixed seed so the forest, and every score derived from it, is reproducible.
RANDOM_STATE = 42
rnd_clf = RandomForestClassifier(verbose=1, n_jobs=-1, random_state=RANDOM_STATE)

rnd_clf.fit(train_x, train_y)
# Predictions on the very data the model was fitted on: this score is
# optimistic; the cross-validated score below is the meaningful estimate.
pred_y = rnd_clf.predict(train_x)
# scikit-learn metrics take (y_true, y_pred) in that order.
train_rnd_clf_f1_score = f1_score(train_y, pred_y, average='macro')
print("f1_score on training set :", train_rnd_clf_f1_score)
# These are training-set predictions (the message previously said "test set").
print("prediction on training set shape :", pred_y.shape)

utils.display_feature_importances(train_dic_features, rnd_clf)

# Depth and leaf count of every tree in the forest.
print([estimator.get_depth() for estimator in rnd_clf.estimators_])
print([estimator.get_n_leaves() for estimator in rnd_clf.estimators_])

# NOTE(review): argument order kept as in the original call -- confirm that
# utils.plot_and_save_confusion_matrix expects (predictions, labels).
utils.plot_and_save_confusion_matrix(pred_y, train_y, "./results/confusion_matrix_simple_rnd_clfxCNN.png")


def display_scores(scores):
    """Print the cross-validation scores with their mean and standard deviation."""
    print("Scores:", scores)
    print("Mean", scores.mean())
    print("Std:", scores.std())


k_fold_number = 20
# Placeholder recorded when cross-validation is skipped (k_fold_number <= 1).
val_scores = [-1]
if k_fold_number > 1:
    val_scores = cross_val_score(rnd_clf, train_x, train_y, scoring="f1_macro", cv=k_fold_number)
    display_scores(val_scores)


utils.save_experiment_in_excel(
    "./results/automatic_study_features_rndxCNN.xlsx",
    rnd_clf,
    train_rnd_clf_f1_score,
    k_fold_number,
    val_scores,
    train_dic_features,
)


--- train ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.3s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.4s finished


f1_score on training set : 0.9999175731457745
prediction on test set shape : (296146,)
perimeter importance: 0.13295973203118797
area_values importance: 0.006760517620738476
ratio_min_max_lengths importance: 0.11053582992774649
urban_type importance: 0.03307658272713351
model_dense_32_output_0 importance: 0.16015068895706575
model_dense_32_output_6 importance: 0.19214030310814378
model_dense_32_output_15 importance: 0.1781024150426918
model_dense_32_output_19 importance: 0.18627393058529218
[45, 44, 52, 43, 49, 50, 44, 59, 49, 48, 56, 46, 46, 46, 48, 47, 45, 50, 46, 49, 49, 48, 46, 45, 52, 54, 48, 49, 47, 45, 45, 47, 42, 54, 48, 45, 51, 50, 46, 50, 47, 45, 50, 53, 45, 44, 48, 48, 49, 52, 47, 49, 46, 49, 48, 44, 51, 50, 44, 47, 44, 48, 49, 43, 58, 51, 47, 48, 54, 49, 48, 49, 51, 45, 45, 51, 55, 47, 47, 48, 56, 51, 46, 44, 51, 46, 54, 48, 48, 53, 50, 43, 52, 46, 51, 46, 47, 52, 45, 56]
[55982, 55901, 55659, 56156, 54457, 55825, 55870, 55959, 55083, 55707, 55412, 55978, 56135, 55697, 5579

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.8s
[Parallel(n

Scores: [0.4873199  0.42720778 0.47981694 0.45011626 0.46210542 0.50544567
 0.45185457 0.43741543 0.46113321 0.44301686 0.47088607 0.44423029
 0.43509745 0.42040583 0.46679216 0.40375826 0.4850155  0.4592891
 0.45816423 0.46357056]
Mean 0.45563207413148576
Std: 0.023741404780578203
experiment saved at ./results/automatic_study_features_rndxCNN.xlsx


  df = df_read.append(new_df, ignore_index=True)


{'training_info': {'f1_score': 0.9999175731457745},
 'validation_info': {'k_fold_number': 20,
  'scores_mean': 0.45563207413148576,
  'scores_std': 0.023741404780578203,
  'scores_max': 0.5054456702519067,
  'scores_min': 0.4037582647932288},
 'feature_importance_info': {'perimeter importance': 0.13295973203118797,
  'area_values importance': 0.006760517620738476,
  'ratio_min_max_lengths importance': 0.11053582992774649,
  'urban_type importance': 0.03307658272713351,
  'model_dense_32_output_0 importance': 0.16015068895706575,
  'model_dense_32_output_6 importance': 0.19214030310814378,
  'model_dense_32_output_15 importance': 0.1781024150426918,
  'model_dense_32_output_19 importance': 0.18627393058529218}}

In [16]:
######## Save results to submission file ########
# Predict the class id of every test row with the fitted forest.
pred_y = rnd_clf.predict(test_x)
print("--- save ---")
# NOTE(review): the 'Id' column written below is the positional row index of
# the predictions, not test_df["index"] -- confirm this matches the expected
# submission format.
pred_df = pd.DataFrame({'change_type': pred_y})
pred_df.to_csv(
    "my_submission_simple_rndxCNN.csv",
    index=True,
    index_label='Id',
)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s


--- save ---


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.6s finished
