In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np
import utils

# data analysis
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import my_ML_algo as algo

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score

# Integer class id of each `change_type` label: the id is the label's position
# in the tuple below (Demolition -> 0, ..., Mega Projects -> 5).
change_type_map = {
    label: class_id
    for class_id, label in enumerate(
        ('Demolition', 'Road', 'Residential', 'Commercial', 'Industrial', 'Mega Projects')
    )
}



In [3]:
# Load the train / test datasets. The inputs are GeoJSON files, read into
# GeoDataFrames (one row per feature, with a `geometry` column).
print("--- read .geojson files ---")
# NOTE: `index_col` is a pandas.read_csv option, not a geopandas.read_file
# parameter. geopandas only forwards unknown keyword arguments to the I/O
# engine, so it never set an index (the test frame still exposes an "index"
# column further down). It is therefore no longer passed.
train_df = gpd.read_file('train.geojson')
# train_df = train_df.dropna()
test_df = gpd.read_file('test.geojson')
# test_df = test_df.dropna()

--- read .csv files ---


In [4]:
# Sanity check: (rows, columns) of the train frame, then of the test frame.
print(*(frame.shape for frame in (train_df, test_df)))

(296146, 45) (120526, 44)


In [92]:
######## Feature engineering ########
print("--- Feature engineering ---")


def get_features(df, dataset_type, urban_type_encoder=None):
    """Build the feature matrix of one dataset split.

    Parameters
    ----------
    df : GeoDataFrame
        Must provide a ``geometry`` column (geometries exposing ``.exterior``,
        i.e. presumably plain polygons -- TODO confirm) and an ``urban_type``
        column.
    dataset_type : str
        Split name ("train" / "test" at the call sites); it is inserted in the
        file names of the saved sequence-model outputs that are loaded below.
    urban_type_encoder : sklearn.preprocessing.LabelEncoder, optional
        Already-fitted encoder used for ``urban_type``. Pass the SAME encoder
        for every split so that a given category gets the same integer code
        everywhere. When ``None`` (default, original behaviour) a new encoder
        is fitted on ``df`` alone.

    Returns
    -------
    res : np.ndarray
        All feature blocks concatenated along the last axis.
    dic_features : dict
        ``{"names": [...], "features": [...]}``: the feature names and the
        individual feature arrays, in the same order as the columns of ``res``.
    """
    dic_features = {"names": [], "features": []}

    # ---- geometry features ----
    # NOTE(review): the recorded cell output shows warnings pointing at the
    # `.length` and `.area` lines -- presumably geopandas' "geographic CRS"
    # warning. Confirm whether the frames should be re-projected first.
    perimeter = np.asarray(df['geometry'].length)
    perimeter = np.expand_dims(perimeter, axis=-1)
    dic_features["features"].append(perimeter)
    dic_features["names"].append("perimeter")

    area_values = np.asarray(df['geometry'].area)
    area_values = np.expand_dims(area_values, axis=-1)
    dic_features["features"].append(area_values)
    dic_features["names"].append("area_values")

    def get_min_max_edge_lengths(exte):
        """Return (shortest, longest) edge length of the ring `exte`."""
        x, y = exte.xy
        # Consecutive coordinate differences give one length per edge. Every
        # edge is used, whatever the number of vertices (the previous version
        # only looked at the first four edges and failed on triangles).
        lengths = np.sqrt(np.diff(x) ** 2 + np.diff(y) ** 2)
        return lengths.min(), lengths.max()

    # Single pass over the rings; column 0 = min length, column 1 = max length.
    min_max_lengths = np.array(
        [get_min_max_edge_lengths(ring) for ring in df["geometry"].exterior],
        dtype=float,
    ).reshape(-1, 2)
    min_lengths = min_max_lengths[:, :1]
    max_lengths = min_max_lengths[:, 1:]

    ratio_min_max_lengths = min_lengths / max_lengths
    dic_features["features"].append(ratio_min_max_lengths)
    dic_features["names"].append("ratio_min_max_lengths")

    # dic_features["features"].append(min_lengths)
    # dic_features["names"].append("min_lengths")

    # dic_features["features"].append(max_lengths)
    # dic_features["names"].append("max_lengths")

    # ---- diameter (disabled experiment; helpers kept for the code below) ----
    def get_coords(geom):
        """Return the exterior coordinates of `geom` as a list."""
        return list(geom.exterior.coords)

    def get_diameters(coord):
        """Return the largest value of utils.get_distances between the points."""
        arr_coord = np.array(coord)
        distances = utils.get_distances(arr_coord, arr_coord)
        return np.max(distances)

    # coords = df.geometry.apply(get_coords)
    # diameters = coords.apply(get_diameters)
    # diameters = np.asarray(diameters)
    # diameters = np.expand_dims(diameters,axis=-1)
    # dic_features["features"].append(diameters)
    # dic_features["names"].append("diameters")

    # (names fixed: they were swapped between the two ratios below)
    # ratio_perimeter_over_diameter = perimeter/diameters
    # dic_features["features"].append(ratio_perimeter_over_diameter)
    # dic_features["names"].append("ratio_perimeter_over_diameter")

    # ratio_area_over_diameter = area_values/diameters
    # dic_features["features"].append(ratio_area_over_diameter)
    # dic_features["names"].append("ratio_area_over_diameter")

    # ---- geography features ----
    # mlb_urban_type = MultiLabelBinarizer()
    # urban_type = np.asarray(df["urban_type"].apply(lambda x: x.split(",") if x!="N,A" else [x]))
    # urban_type = [urban_type[row] for row in range(urban_type.shape[0])]
    # urban_type = mlb_urban_type.fit_transform(urban_type)
    # dic_features["features"].append(urban_type)
    # dic_features["names"]+=list(mlb_urban_type.classes_)

    urban_type = np.asarray(df["urban_type"])
    if urban_type_encoder is None:
        # Backward-compatible fallback: encoder fitted on this split only.
        urban_type_encoder = LabelEncoder().fit(urban_type)
    # print("possible urban_type list :", list(urban_type_encoder.classes_))
    urban_type = urban_type_encoder.transform(urban_type)
    urban_type = np.expand_dims(urban_type, axis=-1)
    dic_features["features"].append(urban_type)
    dic_features["names"].append("urban_type")

    # mlb_geography_type = MultiLabelBinarizer()
    # geography_type = np.asarray(df["geography_type"].apply(lambda x: x.split(",") if x!="N,A" else [x]))
    # geography_type = mlb_geography_type.fit_transform(geography_type)
    # dic_features["features"].append(geography_type)
    # dic_features["names"]+=list(mlb_geography_type.classes_)

    # ---- sequence features ----
    # Saved model outputs, one .npz archive per (model, split); only the listed
    # columns of each archive's "arr_0" array are kept.
    kept_columns_per_model = (
        ("model_dense_32", [0, 6, 15, 19]),
        ("model_dense_16", [15, 6, 1]),
    )
    for model_name, kept_columns in kept_columns_per_model:
        output_path = f"save/outputs/{model_name}_{dataset_type}_output_5000_train_steps_no_val.npz"
        # Context manager closes the archive; the fancy-indexed result is a copy.
        with np.load(output_path) as archive:
            model_output = archive["arr_0"][:, kept_columns]
        dic_features["features"].append(model_output)
        dic_features["names"] += [f"{model_name}_output_{j}" for j in kept_columns]

    # PCA on sequence features :
    # PCA_kept_columns_dense_32 = [6, 3, 4, 2, 5][:3]
    # model_dense_32_output = np.load(f"save/old/model_dense_32_{dataset_type}_output_val_0f7151.npz")["arr_0"]
    # PCA_model_dense_32_output = algo.apply_PCA(model_dense_32_output)[:,PCA_kept_columns_dense_32]
    # print("PCA 32 : ",PCA_model_dense_32_output.shape)
    # dic_features["features"].append(PCA_model_dense_32_output)
    # dic_features["names"]+=[f"PCA_model_dense_32_output_{j}" for j in np.arange(PCA_model_dense_32_output.shape[-1])]

    # kept_PCA_columns_dense_16 = np.arange(4)
    # model_dense_16_output = np.load(f"save/old/model_dense_16_{dataset_type}_output_val_0f7151.npz")["arr_0"]
    # PCA_model_dense_16_output = algo.apply_PCA(model_dense_16_output)[:,kept_PCA_columns_dense_16]
    # print("PCA 16 : ",PCA_model_dense_16_output.shape)
    # dic_features["features"].append(PCA_model_dense_16_output)
    # dic_features["names"]+=[f"PCA_model_dense_16_output_{j}" for j in np.arange(PCA_model_dense_16_output.shape[-1])]

    # for feat in dic_features["features"]:
    #     print(feat.shape)

    res = np.concatenate(dic_features["features"], axis=-1)

    return res, dic_features


# One encoder fitted on the categories of BOTH splits, so that a given
# urban_type value is mapped to the same integer in train_x and in test_x.
urban_type_encoder = LabelEncoder().fit(
    np.concatenate([np.asarray(train_df["urban_type"]), np.asarray(test_df["urban_type"])])
)

train_x, train_dic_features = get_features(
    train_df, dataset_type="train", urban_type_encoder=urban_type_encoder
)
train_y = train_df['change_type'].apply(lambda x: change_type_map[x])

index_x = test_df["index"]
test_x, _ = get_features(
    test_df, dataset_type="test", urban_type_encoder=urban_type_encoder
)

print("train_x.shape, train_y.shape, test_x.shape :\n",
      train_x.shape, train_y.shape, test_x.shape)


--- Feature engineering ---



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)

  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


train_x.shape, train_y.shape, test_x.shape :
 (296146, 24) (296146,) (120526, 24)


In [93]:
# Forward feature selection, seeded with the first eight feature columns
# (ids 0..7); the value returned by algo.forward_selection is printed.
best_features_init = [*np.arange(8)]
print(
    "best features to keep:",
    algo.forward_selection(
        train_x,
        train_y,
        significance_level=0.01,
        best_features_init=best_features_init,
    ),
)

*****
---- best_features so far: [0, 1, 2, 3, 4, 5, 6, 7]  ----
---- perf so far: 0  ----
---- remaining_features so far: [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]  ----

---- ---- ratio features explored: 0.0
---- ---- features_kept: [0, 1, 2, 3, 4, 5, 6, 7, 8]
---- ---- perf: 0.33963783239292017
---- ---- time execution iteration: 47.23449945449829

---- ---- ratio features explored: 0.0625
---- ---- features_kept: [0, 1, 2, 3, 4, 5, 6, 7, 9]
---- ---- perf: 0.3775475345531448
---- ---- time execution iteration: 57.2589430809021

---- ---- ratio features explored: 0.125
---- ---- features_kept: [0, 1, 2, 3, 4, 5, 6, 7, 10]
---- ---- perf: 0.3400499843885996
---- ---- time execution iteration: 50.68463182449341

---- ---- ratio features explored: 0.1875
---- ---- features_kept: [0, 1, 2, 3, 4, 5, 6, 7, 11]
---- ---- perf: 0.3438807490223732
---- ---- time execution iteration: 50.55091571807861

---- ---- ratio features explored: 0.25
---- ---- features_kept: [0, 1

In [90]:
# Column ids to exclude from the model input (empty = keep every feature).
features_id_to_drop = set()
# Kept ids are built in ascending order explicitly: the previous set difference
# relied on set iteration order, which is not guaranteed, while the feature
# names collected in get_features follow the ascending column order.
# NOTE(review): if columns are dropped here, `train_dic_features` still lists
# every feature -- check what the utils helpers expect.
features_id_to_keep = [
    feature_id
    for feature_id in range(train_x.shape[-1])
    if feature_id not in features_id_to_drop
]
extracted_train_x = train_x[:, features_id_to_keep]
extracted_test_x = test_x[:, features_id_to_keep]

In [91]:
######## Training ########
print("--- train ---")
# Fixed seed: makes the forest, and therefore every score below, reproducible.
RANDOM_STATE = 42
rnd_clf = RandomForestClassifier(verbose=1, n_jobs=-1, random_state=RANDOM_STATE)

rnd_clf.fit(extracted_train_x, train_y)
# Predictions on the TRAINING data: this score only measures how well the
# forest fits the data it was trained on, not how well it generalises.
pred_y = rnd_clf.predict(extracted_train_x)
# scikit-learn metrics take (y_true, y_pred), in that order.
train_rnd_clf_f1_score = f1_score(train_y, pred_y, average='macro')
print("f1_score on training set :", train_rnd_clf_f1_score)
print("prediction on training set shape :", pred_y.shape)

utils.display_feature_importances(train_dic_features, rnd_clf)

# Depth and number of leaves of every tree of the forest.
print([estimator.get_depth() for estimator in rnd_clf.estimators_])
print([estimator.get_n_leaves() for estimator in rnd_clf.estimators_])

utils.plot_and_save_confusion_matrix(pred_y, train_y, "./results/confusion_matrix_simple_rnd_clfxCNN.png")


def display_scores(scores):
    """Print the per-fold scores followed by their mean and standard deviation."""
    print("Scores:", scores)
    print("Mean", scores.mean())
    print("Std:", scores.std())


# Number of cross-validation folds; cross-validation is skipped when <= 1.
k_fold_number = 10
# Placeholder kept when cross-validation is skipped.
val_scores = [-1]
if k_fold_number > 1:
    # Macro-F1 of the same estimator configuration on each of the k folds.
    val_scores = cross_val_score(rnd_clf, extracted_train_x, train_y, scoring="f1_macro", cv=k_fold_number)
    display_scores(val_scores)

    utils.save_experiment_in_excel(
        "./results/automatic_study_features_rndxCNN.xlsx",
        rnd_clf,
        train_rnd_clf_f1_score,
        k_fold_number,
        val_scores,
        train_dic_features,
    )


--- train ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   11.3s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    1.6s finished


f1_score on training set : 0.9999235361075586
prediction on test set shape : (296146,)
perimeter importance: 0.17927484213846892
area_values importance: 0.006445515156503034
ratio_min_max_lengths importance: 0.143966692240987
urban_type importance: 0.047291272069124195
model_dense_32_output_0 importance: 0.1361580941790445
model_dense_32_output_6 importance: 0.1843413663405821
model_dense_32_output_15 importance: 0.04420786344941158
model_dense_32_output_19 importance: 0.037097514885244764
model_dense_16_output_1 importance: 0.1780104094960045
model_dense_16_output_4 importance: 0.007312474340219226
model_dense_16_output_8 importance: 0.034157933233957445
model_dense_16_output_9 importance: 0.0017360224704526918
[48, 49, 55, 51, 53, 48, 55, 52, 49, 51, 53, 53, 50, 47, 49, 46, 47, 51, 46, 59, 60, 49, 51, 52, 53, 48, 51, 49, 49, 54, 53, 51, 48, 50, 49, 50, 52, 51, 50, 50, 46, 46, 51, 48, 52, 52, 51, 52, 49, 54, 50, 52, 51, 52, 49, 50, 48, 52, 52, 50, 48, 48, 48, 55, 50, 55, 69, 50, 49, 5

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    2.9s
[Parallel(n

Scores: [0.40903499 0.40323326 0.41983164 0.39189278 0.40243189 0.39471981
 0.37076696 0.38227393 0.41861951 0.42228183]
Mean 0.4015086606573693
Std: 0.01604397074603381
experiment saved at ./results/automatic_study_features_rndxCNN.xlsx


  df = df_read.append(new_df, ignore_index=True)


In [87]:
######## Save results to submission file ########
# Predict a class id for every row of the test feature matrix.
pred_y = rnd_clf.predict(extracted_test_x)
print("--- save ---")
# Single `change_type` column; the default positional row index is written
# to the CSV as the `Id` column.
# NOTE(review): `index_x` (test_df["index"]) is not used here -- confirm that
# the positional index is the expected submission Id.
pred_df = pd.DataFrame({'change_type': pred_y})
pred_df.to_csv("my_submissions/greedy.csv", index_label='Id', index=True)

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s


--- save ---


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.4s finished
