In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler

from joblib import dump, load
import geopandas as gpd
import pandas as pd
import numpy as np

In [2]:
train = gpd.read_file( "./data/train_data_final.geojson")
test =  gpd.read_file("./data/test_data_final.geojson")

In [3]:
X_train = train.loc[:,'NDVI_2000':'NDVI_2019']
y_train = train['label']
X_test = test.loc[:,'NDVI_2000':'NDVI_2019']
y_test = test['label']

In [4]:
X_test

Unnamed: 0,NDVI_2000,NDVI_2001,NDVI_2002,NDVI_2003,NDVI_2004,NDVI_2005,NDVI_2006,NDVI_2007,NDVI_2008,NDVI_2009,NDVI_2010,NDVI_2011,NDVI_2012,NDVI_2013,NDVI_2014,NDVI_2015,NDVI_2016,NDVI_2017,NDVI_2018,NDVI_2019
0,0.234313,0.357835,0.424702,0.483381,0.449997,0.461925,0.449416,0.383269,0.441211,0.424893,0.409912,0.384427,0.444399,0.435384,0.281297,0.250821,0.391019,0.365752,0.364020,0.398754
1,0.259537,0.422734,0.467305,0.466144,0.484087,0.412429,0.452404,0.443558,0.498046,0.431269,0.428367,0.451218,0.357088,0.427859,0.327946,0.288966,0.424729,0.360664,0.392814,0.380607
2,0.477138,0.453352,0.498541,0.578853,0.564869,0.526432,0.529288,0.503792,0.519499,0.556097,0.497590,0.522317,0.552941,0.476867,0.426623,0.403017,0.536244,0.510117,0.455523,0.373811
3,0.254954,0.377850,0.433457,0.462341,0.436086,0.407982,0.501653,0.427673,0.450609,0.427829,0.449063,0.435687,0.474683,0.436132,0.428942,0.260044,0.429690,0.434661,0.427567,0.399792
4,0.369445,0.393338,0.427074,0.425301,0.400655,0.372527,0.348613,0.337494,0.340376,0.344852,0.372999,0.396595,0.411514,0.403225,0.448769,0.370953,0.376665,0.384846,0.352982,0.407736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631,0.189081,0.302253,0.357573,0.389088,0.337776,0.353739,0.343483,0.319770,0.312302,0.342891,0.330212,0.366863,0.362699,0.337353,0.217665,0.257839,0.360269,0.364850,0.245910,0.315479
2632,0.271267,0.241062,0.316804,0.290820,0.327361,0.321276,0.324242,0.302519,0.306803,0.279131,0.318295,0.292774,0.315454,0.310930,0.354042,0.271838,0.252772,0.340212,0.301725,0.338962
2633,0.365754,0.364973,0.367954,0.299517,0.356399,0.357291,0.378934,0.407844,0.409869,0.349353,0.396730,0.419728,0.410604,0.410315,0.384567,0.379902,0.402073,0.496995,0.401466,0.375143
2634,0.361940,0.366046,0.409233,0.381233,0.313657,0.344250,0.285034,0.263516,0.236294,0.225042,0.310419,0.260082,0.279730,0.247344,0.315774,0.348482,0.350645,0.318772,0.337674,0.373489


In [4]:
pipe = Pipeline(
    [
        ('preproc_scaling', StandardScaler()),
        ('rf', RandomForestClassifier())
    
    ])

In [5]:
parameters = {'rf__n_estimators':[100, 200, 300, 400, 500],
              'rf__max_features':np.arange(1, X_train.shape[1]+1),
              'rf__max_depth':[1,10,25,30, 50,75]}

In [6]:
grid_search = GridSearchCV(pipe, 
                        parameters,
                        verbose=1,
                        cv=KFold(n_splits=5, shuffle=True, random_state=123),
                        n_jobs=16,
                        #return_train_score=True,
                        scoring='f1_macro',
                        #refit=False,
                       )


In [7]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 600 candidates, totalling 3000 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=123, shuffle=True),
             estimator=Pipeline(steps=[('preproc_scaling', StandardScaler()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=16,
             param_grid={'rf__max_depth': [1, 10, 25, 30, 50, 75],
                         'rf__max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20]),
                         'rf__n_estimators': [100, 200, 300, 400, 500]},
             scoring='f1_macro', verbose=1)

In [8]:
y_pred = grid_search.predict(X_test)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.59      0.65       877
           1       0.88      0.91      0.89       900
           2       0.70      0.81      0.75       859

    accuracy                           0.77      2636
   macro avg       0.77      0.77      0.76      2636
weighted avg       0.77      0.77      0.76      2636



In [10]:
dump(grid_search, './models/cv_rf.joblib') 

['./models/cv_rf.joblib']