In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np
import utils

# data analysis
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}



In [3]:
# Read csvs
print("--- read .csv files ---")
train_df = gpd.read_file('train.geojson', index_col=0)
# train_df = train_df.dropna()
test_df = gpd.read_file('test.geojson', index_col=0)
# test_df = test_df.dropna()

--- read .csv files ---


In [4]:
print(train_df.shape, test_df.shape)

(296146, 45) (120526, 44)


In [7]:
######## Feature engineering ########
print("--- Feature engineering ---")


def get_features(df, dataset_type):
    dic_features = {"names":[],"features":[]}

    # geometry features
    perimeter = np.asarray(df['geometry'].length)
    perimeter = np.expand_dims(perimeter, axis=-1)
    dic_features["features"].append(perimeter)
    dic_features["names"].append("perimeter")

    area_values = np.asarray(df['geometry'].area)
    area_values = np.expand_dims(area_values, axis=-1)
    dic_features["features"].append(area_values)
    dic_features["names"].append("area_values")

    def get_min_length_ratio(exte):
        x, y = exte.xy
        lengths = [
            np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(4)
        ]
        return np.min(lengths)

    def get_max_length_ratio(exte):
        x, y = exte.xy
        lengths = [
            np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(4)
        ]
        return np.max(lengths)

    
    min_lengths = np.asarray(df["geometry"].exterior.apply(get_min_length_ratio))
    min_lengths = np.expand_dims(min_lengths,axis=-1)

    max_lengths = np.asarray(df["geometry"].exterior.apply(get_max_length_ratio))
    max_lengths = np.expand_dims(max_lengths,axis=-1)

    ratio_min_max_lengths = min_lengths/max_lengths
    dic_features["features"].append(ratio_min_max_lengths)
    dic_features["names"].append("ratio_min_max_lengths")

    # dic_features["features"].append(min_lengths)
    # dic_features["names"].append("min_lengths")

    # dic_features["features"].append(max_lengths)
    # dic_features["names"].append("max_lengths")

    
    # geography features
    le_urban_type = LabelEncoder()
    urban_type = np.asarray(df["urban_type"])
    le_urban_type.fit(urban_type)
    # print("possible urban_type list :", list(le_urban_type.classes_))
    urban_type = le_urban_type.transform(urban_type)
    urban_type = np.expand_dims(urban_type, axis=-1)
    dic_features["features"].append(urban_type)
    dic_features["names"].append("urban_type")

    # le_geography_type = LabelEncoder()
    # geography_type = np.asarray(df["geography_type"])
    # le_geography_type.fit(geography_type)
    # # print("possible geography_type list :", list(le_geography_type.classes_))
    # geography_type = le_geography_type.transform(geography_type)
    # geography_type = np.expand_dims(geography_type, axis=-1)
    # features.append(geography_type)

    # add sequence features

    # kept_columns_dense_32 = np.arange(32)
    # model_dense_32_output = np.load(f"save/outputs/model_dense_32_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"][:,kept_columns_dense_32]
    # dic_features["features"].append(model_dense_32_output)
    # dic_features["names"]+=[f"model_dense_32_output_{j}" for j in kept_columns_dense_32]

    kept_columns_dense_16 = np.arange(16)
    model_dense_16_output = np.load(f"save/outputs/model_dense_16_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    model_dense_16_output = model_dense_16_output[:,kept_columns_dense_16]
    # dic_features["features"].append(model_dense_16_output)
    # dic_features["names"]+=[f"model_dense_16_output_{j}" for j in kept_columns_dense_16]

    kept_columns_dense_6 = np.arange(6)
    model_dense_6_output = np.load(f"save/outputs/CNN_model_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    model_dense_6_output = model_dense_16_output[:,kept_columns_dense_6]
    dic_features["features"].append(model_dense_6_output)
    dic_features["names"]+=[f"model_dense_6_output_{i}" for i in range(model_dense_6_output.shape[-1])]
    
    # PCA on sequence features :

    n_kept_32 = 10
    n_kept_16 = 5

    model_dense_32_output = np.load(f"save/outputs/model_dense_32_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    mean_column = np.zeros(np.shape(model_dense_32_output)[1])
    for i in range(np.shape(model_dense_32_output)[0]) :
        mean_column+=model_dense_32_output[i,0]
    mean_column = mean_column/np.shape(model_dense_32_output)[1]
    mean_matrix = [mean_column for i in range(np.shape(model_dense_32_output)[0])]
    Centered_data = model_dense_32_output-mean_matrix
    Covariance_matrix = np.dot(Centered_data.T,Centered_data)
    S,V = np.linalg.eig(Covariance_matrix)
    order = np.argsort(S)
    U=V[order][::-1][:n_kept_32]
    PCA_model_dense_32_output = np.dot(Centered_data,U.T)
    print("PCA 32 : ",PCA_model_dense_32_output.shape)
    dic_features["features"].append(PCA_model_dense_32_output)
    dic_features["names"]+=[f"PCA_model_dense_32_output_{j}" for j in np.arange(n_kept_32)]


    model_dense_16_output = np.load(f"save/outputs/model_dense_16_{dataset_type}_output_5000_train_steps_no_val.npz")["arr_0"]
    mean_column = np.zeros(np.shape(model_dense_16_output)[1])
    for i in range(np.shape(model_dense_16_output)[0]) :
        mean_column+=model_dense_16_output[i,0]
    mean_column = mean_column/np.shape(model_dense_16_output)[1]
    mean_matrix = [mean_column for i in range(np.shape(model_dense_16_output)[0])]
    Centered_data = model_dense_16_output-mean_matrix
    Covariance_matrix = np.dot(Centered_data.T,Centered_data)
    S,V = np.linalg.eig(Covariance_matrix)
    order = np.argsort(S)
    U=V[order][::-1][:n_kept_16]
    PCA_model_dense_16_output = np.dot(Centered_data,U.T)
    print("PCA 16 : ",PCA_model_dense_16_output.shape)
    dic_features["features"].append(PCA_model_dense_16_output)
    dic_features["names"]+=[f"PCA_model_dense_16_output_{j}" for j in np.arange(n_kept_16)]


    # for feat in features:
    #     print(feat.shape)

    res = np.concatenate(dic_features["features"], axis=-1)

    return res,dic_features


train_x,train_dic_features = get_features(train_df,dataset_type="train")
train_y = train_df['change_type'].apply(lambda x: change_type_map[x])

index_x = test_df["index"]
test_x,_ = get_features(test_df,dataset_type="test")

print("train_x.shape, train_y.shape, test_x.shape :\n",
      train_x.shape, train_y.shape, test_x.shape)


--- Feature engineering ---



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


PCA 32 :  (296146, 10)
PCA 16 :  (296146, 5)



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


PCA 32 :  (120526, 10)
PCA 16 :  (120526, 5)
train_x.shape, train_y.shape, test_x.shape :
 (296146, 25) (296146,) (120526, 25)


In [8]:
######## Training ########
print("--- train ---")
rnd_clf = RandomForestClassifier(verbose=1,n_jobs=-1)

rnd_clf.fit(train_x,train_y)
pred_y = rnd_clf.predict(train_x)
train_rnd_clf_f1_score = f1_score(pred_y, train_y,average='macro')
print("f1_score on training set :", train_rnd_clf_f1_score)
print("prediction on test set shape :", pred_y.shape)

utils.display_feature_importances(train_dic_features,rnd_clf)

print([estimator.get_depth() for estimator in rnd_clf.estimators_])
print([estimator.get_n_leaves() for estimator in rnd_clf.estimators_])

utils.plot_and_save_confusion_matrix(pred_y,train_y,"./results/confusion_matrix_simple_rnd_clfxCNN.png")



k_fold_number = 0
val_scores = [-1]
if k_fold_number>1:
    def display_scores(scores):
        print("Scores:",scores)
        print("Mean",scores.mean())
        print("Std:",scores.std())

    val_scores = cross_val_score(rnd_clf,train_x,train_y,scoring="f1_macro",cv=k_fold_number)
    display_scores(val_scores)


    utils.save_experiment_in_excel("./results/automatic_study_features_rndxCNN.xlsx",rnd_clf,train_rnd_clf_f1_score,k_fold_number,val_scores,train_dic_features,)


--- train ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    2.0s finished


f1_score on training set : 0.9999965254593876
prediction on test set shape : (296146,)
perimeter importance: 0.04712156328974852
area_values importance: 0.00229579494676868
ratio_min_max_lengths importance: 0.029843709821277757
urban_type importance: 0.009329512399329871
model_dense_6_output_0 importance: 0.0006091320663268304
model_dense_6_output_1 importance: 0.039569299405721875
model_dense_6_output_2 importance: 0.0006112461979450328
model_dense_6_output_3 importance: 0.007078542665558596
model_dense_6_output_4 importance: 0.0006152245782863676
model_dense_6_output_5 importance: 0.051929617704291775
PCA_model_dense_32_output_0 importance: 0.023114637516062873
PCA_model_dense_32_output_1 importance: 0.06969745868081968
PCA_model_dense_32_output_2 importance: 0.03320287241901874
PCA_model_dense_32_output_3 importance: 0.029475831000314763
PCA_model_dense_32_output_4 importance: 0.030590855597817067
PCA_model_dense_32_output_5 importance: 0.05939241045402901
PCA_model_dense_32_output_

In [23]:
######## Save results to submission file ########
pred_y = rnd_clf.predict(test_x)
print("--- save ---")
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
pred_df.to_csv("./my_submissions/my_submission_simple_rndxCNN.csv", index=True, index_label='Id')

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished


--- save ---


OSError: Cannot save file into a non-existent directory: 'my_submissions'