In [8]:
%load_ext autoreload
%autoreload 2

In [9]:
# data handling
import geopandas as gpd
import pandas as pd
import numpy as np
import utils

# data analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score, f1_score

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
                   'Mega Projects': 5}

# Read csvs
print("--- read .csv files ---")
train_df = gpd.read_file('train.geojson', index_col=0)
# train_df = train_df.dropna()
test_df = gpd.read_file('test.geojson', index_col=0)
# test_df = test_df.dropna()


--- read .csv files ---


In [10]:
print(train_df.shape, test_df.shape)

(296146, 45) (120526, 44)


In [11]:
def length_ratio(exte):
    x, y = exte.xy
    lengths = [
        np.sqrt((x[i] - x[i + 1]) ** 2 + (y[i] - y[i + 1]) ** 2) for i in range(4)
    ]
    return np.min(lengths) / np.max(lengths)


In [12]:
######## Feature engineering ########
print("--- Feature engineering ---")


def get_features(df, begin_index = 3):
    # columns = df.columns
    # features = [np.expand_dims(np.asarray(df[column]), axis = -1) for column in columns[begin_index : -12]] #TODO modify this line to sort images chronologically

    features = []
    # geometry features
    perimeter = np.asarray(df['geometry'].length)
    perimeter = np.expand_dims(perimeter, axis=-1)
    features.append(perimeter)

    area_values = np.asarray(df['geometry'].area)
    area_values = np.expand_dims(area_values, axis=-1)
    features.append(area_values)

    ratios = np.asarray(df["geometry"].exterior.apply(length_ratio))
    ratios = np.expand_dims(ratios, axis=-1)
    features.append(ratios)
    
    # geography features
    le_urban_type = LabelEncoder()
    urban_type = np.asarray(df["urban_type"])
    le_urban_type.fit(urban_type)
    # print("possible urban_type list :", list(le_urban_type.classes_))
    urban_type = le_urban_type.transform(urban_type)
    urban_type = np.expand_dims(urban_type, axis=-1)
    features.append(urban_type)

    # le_geography_type = LabelEncoder()
    # geography_type = np.asarray(df["geography_type"])
    # le_geography_type.fit(geography_type)
    # # print("possible geography_type list :", list(le_geography_type.classes_))
    # geography_type = le_geography_type.transform(geography_type)
    # geography_type = np.expand_dims(geography_type, axis=-1)
    # features.append(geography_type)

    # for feat in features:
    #     print(feat.shape)

    res = np.concatenate(features, axis=-1)

    return res


train_x = get_features(train_df)
train_y = train_df['change_type'].apply(lambda x: change_type_map[x])

index_x = test_df["index"]
test_x = get_features(test_df, begin_index=2)

print("train_x.shape, train_y.shape, test_x.shape :\n",
      train_x.shape, train_y.shape, test_x.shape)


--- Feature engineering ---



  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)

  perimeter = np.asarray(df['geometry'].length)

  area_values = np.asarray(df['geometry'].area)


train_x.shape, train_y.shape, test_x.shape :
 (296146, 4) (296146,) (120526, 4)


In [13]:
######## Training ########
print("--- train ---")
rnd_clf = RandomForestClassifier(verbose=1,n_jobs=-1)

rnd_clf.fit(train_x,train_y)
pred_y = rnd_clf.predict(train_x)
print("f1_score on training set :", f1_score(pred_y, train_y,average='macro'))
print("prediction on test set shape :", pred_y.shape)

for i in range(len(rnd_clf.feature_importances_)):
    print("feat_name importance:", rnd_clf.feature_importances_[i])

print([estimator.get_depth() for estimator in rnd_clf.estimators_])
print([estimator.get_n_leaves() for estimator in rnd_clf.estimators_])

utils.plot_and_save_confusion_matrix(pred_y,train_y,"./results/confusion_matrix_simple_rnd_clf.png")

######## Save results to submission file ########
pred_y = rnd_clf.predict(test_x)
print("--- save ---")
pred_df = pd.DataFrame(pred_y, columns=['change_type'])
pred_df.to_csv("my_submission_paul_root.csv", index=True, index_label='Id')


--- train ---


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.5s
[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    2.0s finished


f1_score on training set : 0.9536415125584977
prediction on test set shape : (296146,)
feat_name importance: 0.47392575177934826
feat_name importance: 0.011533393361079215
feat_name importance: 0.4752358348571511
feat_name importance: 0.03930502000242144
[63, 90, 60, 58, 63, 66, 64, 63, 69, 63, 63, 64, 67, 60, 65, 60, 71, 72, 64, 63, 63, 63, 65, 63, 64, 62, 65, 70, 68, 61, 68, 70, 84, 76, 67, 59, 66, 64, 68, 60, 62, 68, 75, 65, 65, 66, 66, 63, 62, 67, 70, 67, 69, 72, 76, 63, 71, 60, 65, 64, 59, 62, 77, 62, 71, 65, 70, 62, 72, 76, 69, 65, 63, 69, 60, 66, 61, 57, 67, 63, 70, 58, 73, 62, 59, 66, 63, 64, 60, 71, 66, 73, 63, 75, 63, 66, 67, 69, 61, 61]
[61083, 83217, 77565, 79244, 84507, 82887, 81025, 78822, 79523, 80517, 81999, 72085, 81643, 76369, 81352, 79626, 86832, 86156, 79930, 78705, 83485, 73427, 85299, 84924, 74924, 82933, 84723, 85855, 80116, 83725, 74961, 81821, 82194, 83737, 78417, 82487, 84623, 84282, 83112, 85271, 81091, 80919, 32134, 87623, 82584, 83486, 82965, 82152, 82872, 

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.1s


--- save ---


[Parallel(n_jobs=16)]: Done 100 out of 100 | elapsed:    0.7s finished
