In [1]:
"""
This script can be used as skeleton code to read the challenge train and test
geojsons, to train a trivial model, and write data to the submission file.
"""
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np

# data analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, f1_score

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}

# Read csvs
print("--- read .csv files ---")
train_df = gpd.read_file('train.geojson', index_col=0)

test_df = gpd.read_file('test.geojson', index_col=0)


--- read .csv files ---


In [2]:
def handle_na_in_df(df):
    print("number of lines at first:",df.shape[0])
    df_without_na = df.dropna()
    print("number of lines without na:",df_without_na.shape[0])
    indices_without_na = np.asarray(df_without_na.index)
    df_with_na = df[df.isna().any(axis=1)]
    indices_with_na = np.asarray(df_with_na.index)

    indices = np.concatenate([indices_without_na,indices_with_na],axis=0)

    dummy_values = 2*np.ones((indices_with_na.shape[0],))

    return df_without_na,indices,dummy_values

df_without_na,indices,dummy_values = handle_na_in_df(test_df)
print(dummy_values.shape)
print(indices.shape)
print(test_df.shape)

number of lines at first: 120526
number of lines without na: 119176
(1350,)
(120526,)
(120526, 44)


In [None]:
diameter = train_df[]

In [3]:
######## Feature engineering ########
print("--- Feature engineering ---")


def get_features(df):
    dic_features = {"names":[],"features":[]}

    # geometry features
    perimeter = np.asarray(df['geometry'].length)
    perimeter = np.expand_dims(perimeter, axis=-1)
    dic_features["features"].append(perimeter)
    dic_features["names"].append("perimeter")

    area_values = np.asarray(df['geometry'].area)
    area_values = np.expand_dims(area_values, axis=-1)
    dic_features["features"].append(area_values)
    dic_features["names"].append("area")

    ratio_length_over_area = perimeter/area_values
    dic_features["features"].append(ratio_length_over_area)
    dic_features["names"].append("ratio_length_over_area")

    diameter = 


    # geography features
    mlb_urban_type = MultiLabelBinarizer()
    urban_type = np.asarray(df["urban_type"].apply(lambda x: x.split(",") if x!="N,A" else [x]))
    mlb_urban_type.fit(urban_type)
    print("possible urban_type list :", list(mlb_urban_type.classes_))
    urban_type = mlb_urban_type.transform(urban_type)
    dic_features["features"].append(urban_type)
    dic_features["names"]+=list(mlb_urban_type.classes_)

    mlb_geography_type = MultiLabelBinarizer()
    geography_type = np.asarray(df["geography_type"].apply(lambda x: x.split(",") if x!="N,A" else [x]))
    mlb_geography_type.fit(geography_type)
    print("possible geography_type list :", list(mlb_geography_type.classes_))
    geography_type = mlb_geography_type.transform(geography_type)
    print("geography_type",geography_type)
    dic_features["features"].append(geography_type)
    dic_features["names"]+=list(mlb_geography_type.classes_)

    # dates/images features
    def get_sorted_date_diff_with_indices(df):
        dates_to_add = []
        for i in range(5):
            date = df[f'date{i}'].apply(lambda x: int(str(x)[-4:]))
            date = np.asarray(date)
            date = np.expand_dims(date,axis=-1)
            dates_to_add.append(date)

        dates = np.concatenate(dates_to_add,axis=-1)

        indices_dates = np.argsort(dates,axis=-1)
        dates = np.array([dates[i,indices_dates[i,:]] for i in range(dates.shape[0])])

        date_diff = dates - dates[:,-1:]@np.ones((1,5),dtype=int)
        return date_diff, indices_dates

    date_diff,indices_dates=get_sorted_date_diff_with_indices(df)
    dic_features["features"].append(date_diff)
    dic_features["names"]+=[f"new_date_diff{i}" for i in range(5)]

    
    trad_colors = {"red":0,"blue":1,"green":2}
    def get_mean_std(df):
        colors = list(trad_colors.keys())
        res = np.zeros((df.shape[0],5,3,2))
        for i in range(1,6):
            for j_color,color in enumerate(colors):
                res[:,i-1,j_color,0] = np.asarray(df[f"img_{color}_mean_date{i}"])
                res[:,i-1,j_color,1] = np.asarray(df[f"img_{color}_std_date{i}"])
                
        return res

    color_mean_std = get_mean_std(df)
    color_mean_std = np.array([color_mean_std[i,indices_dates[i,:],:,:] for i in range(color_mean_std.shape[0])])

    for i in range(color_mean_std.shape[2]):
        for j in range(color_mean_std.shape[3]):
            dic_features["features"].append(color_mean_std[:,:,i,j])
            dic_features["names"]+=[f"color_mean_std_{date}_{i}_{j}" for date in range(5)]

    res = np.concatenate(dic_features["features"], axis=-1)

    return res,dic_features

def display_features(dic_features):
    i_names = 0
    for i_feat in range(len(dic_features["features"])):
        feat = dic_features["features"][i_feat]
        name = dic_features["names"][i_names:i_names+feat.shape[-1]]
        i_names+=feat.shape[-1]
        print(name,feat.shape)

train_df_without_na,train_indices,train_dummy_values = handle_na_in_df(train_df)
train_x,train_dic_features= get_features(train_df_without_na)
display_features(train_dic_features)
train_y = train_df_without_na['change_type'].apply(lambda x: change_type_map[x])

test_df_without_na,test_indices,test_dummy_values = handle_na_in_df(test_df)
test_x,_ = get_features(test_df_without_na)

print("train_x.shape, train_y.shape, test_x.shape :\n",
      train_x.shape, train_y.shape, test_x.shape)


--- Feature engineering ---
number of lines at first: 296146
number of lines without na: 292758



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)
  ratio_length_over_area = perimeter/area_values


possible urban_type list : ['Dense Urban', 'Industrial', 'N,A', 'Rural', 'Sparse Urban', 'Urban Slum']
possible geography_type list : ['Barren Land', 'Coastal', 'Dense Forest', 'Desert', 'Farms', 'Grass Land', 'Hills', 'Lakes', 'N,A', 'River', 'Snow', 'Sparse Forest']
geography_type [[0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]]
['perimeter'] (292758, 1)
['area'] (292758, 1)
['ratio_length_over_area'] (292758, 1)
['Dense Urban', 'Industrial', 'N,A', 'Rural', 'Sparse Urban', 'Urban Slum'] (292758, 6)
['Barren Land', 'Coastal', 'Dense Forest', 'Desert', 'Farms', 'Grass Land', 'Hills', 'Lakes', 'N,A', 'River', 'Snow', 'Sparse Forest'] (292758, 12)
['new_date_diff0', 'new_date_diff1', 'new_date_diff2', 'new_date_diff3', 'new_date_diff4'] (292758, 5)
['color_mean_std_0_0_0', 'color_mean_std_1_0_0', 'color_mean_std_2_0_0', 'color_mean_std_3_0_0', 'color_mean_std_4_0_0'] (292758, 5)
['color_mean_std_0_0_1', 'color_mean_s

In [27]:
print(train_x[0,:])

[ 4.13700987e-03  8.17460143e-07  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
  0.00000000e+00  1.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -6.00000000e+00 -3.00000000e+00 -2.00000000e+00 -1.00000000e+00
  0.00000000e+00  1.25773062e+02  1.50766726e+02  1.04614233e+02
  9.33717748e+01  9.22913469e+01  2.82699838e+01  5.57453108e+01
  3.38131603e+01  2.98120405e+01  3.98199487e+01  1.34900701e+02
  1.49356684e+02  1.02844339e+02  8.98273794e+01  7.95700644e+01
  2.50080316e+01  4.27232176e+01  3.48180115e+01  2.53242936e+01
  2.81896038e+01  1.39833243e+02  1.58964529e+02  1.00950353e+02
  1.07291113e+02  8.87942502e+01  2.82649070e+01  4.75763832e+01
  3.30640141e+01  2.83283680e+01  3.08642300e+01]


In [38]:

######## Training ########

print("--- train ---")
rnd_clf = RandomForestClassifier(n_estimators=500,max_depth=50,max_leaf_nodes=30000, bootstrap =True, verbose=True, n_jobs=-1)

rnd_clf.fit(train_x,train_y)
pred_y = rnd_clf.predict(train_x)
print("f1_score on training set :", f1_score(pred_y, train_y,average='macro'))

# knn_clf = KNeighborsClassifier(n_neighbors=3,n_jobs=-1)
# knn_clf.fit(train_x,train_y)
# pred_y = knn_clf.predict(train_x)
# print("f1_score on training set :", f1_score(pred_y, train_y,average='macro'))

if True:
    def display_scores(scores):
        print("Scores:",scores)
        print("Mean",scores.mean())
        print("Std:",scores.std())

    scores = cross_val_score(rnd_clf,train_x,train_y,scoring="f1_macro",cv=3)
    display_scores(scores)



--- train ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   56.6s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.7min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    1.8s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    4.3s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    5.1s finished


f1_score on training set : 0.8695494151963375


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.7min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    1.3s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    1.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   34.3s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.7min fi

Scores: [0.23546004 0.21843254 0.24000189]
Mean 0.23129815687661548
Std: 0.00928440102862603


In [None]:
import optuna

def objective(trial):
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 20000, 40000)
    n_estimators = trial.suggest_int("n_estimators", 100, 500)
    rnd_clf = RandomForestClassifier(n_estimators=n_estimators,max_leaf_nodes=max_leaf_nodes, bootstrap=True, verbose=True, n_jobs=-1)

    scores = cross_val_score(rnd_clf,train_x,train_y,scoring="f1_macro",cv=4)
    return scores.mean()


study = optuna.create_study()
study.optimize(objective, n_trials=10)
best_params = study.best_params
found_n_estimators = best_params["n_estimators"]
found_max_leaf_nodes = best_params["max_leaf_nodes"]


import yaml

with open('hp_optim.yaml', 'w') as outfile:
    yaml.dump({"found_max_leaf_nodes":found_max_leaf_nodes,"found_n_estimators":found_n_estimators}, outfile, default_flow_style=False)


In [None]:
print(found_n_estimators,found_max_leaf_nodes)

In [52]:
def display_feature_importances(dic_features,clf):
    for i,feat_name in enumerate(dic_features["names"]):
        print(f"{feat_name} importance:",clf.feature_importances_[i]) 

display_feature_importances(train_dic_features,rnd_clf)
print([estimator.get_depth() for estimator in rnd_clf.estimators_])
print([estimator.get_n_leaves() for estimator in rnd_clf.estimators_])

perimeter importance: 0.10219737044553817
area importance: 0.0036836961081480263
Dense Urban importance: 0.006730438623292298
Industrial importance: 0.012330979066035577
N,A importance: 0.0020832256373468742
Rural importance: 0.0014650795241021807
Sparse Urban importance: 0.0030162967204882163
Urban Slum importance: 0.000928947942529226
Barren Land importance: 0.0034075354337502455
Coastal importance: 0.0011276901200453322
Dense Forest importance: 0.004083278926514398
Desert importance: 0.001313713824412587
Farms importance: 0.0035944781613635123
Grass Land importance: 0.004243327447610884
Hills importance: 0.0002834329644954271
Lakes importance: 0.0040487115892368294
N,A importance: 0.0009093203569023029
River importance: 0.0033164005014205272
Snow importance: 1.8774482851913879e-06
Sparse Forest importance: 0.0037208802476631103
new_date_diff0 importance: 0.01354480228506595
new_date_diff1 importance: 0.012647771068156727
new_date_diff2 importance: 0.011039582022025603
new_date_diff3

In [43]:
pred_y = rnd_clf.predict(test_x)
print("prediction on test set shape :", pred_y.shape)
print(pred_y)

pred_y = np.concatenate([pred_y,test_dummy_values],axis=0)
print("pred_y.shape after:",pred_y.shape)
print(test_dummy_values.shape)
print(test_indices.shape)
new_test_indices = np.expand_dims(test_indices,axis=-1)
new_pred = np.expand_dims(pred_y,axis=-1)
print(new_pred.shape,new_test_indices.shape)
new_pred_y = np.concatenate([new_test_indices,new_pred],axis=-1)
new_pred_y = new_pred_y[np.argsort(new_pred[:,0],axis=0),1]
new_pred_y = new_pred_y.astype(int)
print(new_pred_y)


######## Save results to submission file ########
print("--- save ---")
pred_df = pd.DataFrame(new_pred_y, columns=['change_type'])
pred_df.to_csv("my_submission.csv", index=True, index_label='Id')

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    1.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    2.6s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    3.0s finished


prediction on test set shape : (119176,)
[3 2 2 ... 3 3 3]
pred_y.shape after: (120526,)
(1350,)
(120526,)
(120526, 1) (120526, 1)
[0 0 0 ... 5 5 5]
--- save ---
