In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from joblib import dump, load
import geopandas as gpd

import numpy as np

In [2]:
train = gpd.read_file( "./data/train_data_final.geojson")
test =  gpd.read_file("./data/test_data_final.geojson")

In [3]:
X = train.loc[:,'NDVI_2000':'NDVI_2019']
y = train['label_0']
X_test = test.loc[:,'NDVI_2000':'NDVI_2019']

In [4]:
pipe = Pipeline([('rf', RandomForestClassifier())])

In [5]:
parameters = {'rf__n_estimators':[100, 200, 300, 400, 500],
              'rf__max_features':np.arange(1, X.shape[1]+1),
              'rf__max_depth':[1,10,25,30, 50,75]
             }

pipe = GridSearchCV(pipe, 
                    parameters,
                    verbose=1,
                    cv=KFold(n_splits=5, shuffle=True, random_state=123),
                    n_jobs=16,
                    #return_train_score=True,
                    scoring='f1_macro'                    
                    #refit=False
                   )

In [6]:
%%time
pipe.fit(X, y)

Fitting 5 folds for each of 570 candidates, totalling 2850 fits
CPU times: user 7.94 s, sys: 748 ms, total: 8.68 s
Wall time: 26min 23s


In [7]:
dump(pipe, './models/cv_rf.joblib') 

['./models/cv_rf.joblib']

In [8]:
pipe.cv_results_

{'mean_fit_time': array([ 0.18639579,  0.44042616,  0.61967816,  0.81395392,  0.97937078,
         0.23236799,  0.44968553,  0.666748  ,  0.90043125,  1.10211539,
         0.25529695,  0.50085044,  0.74125786,  0.99287467,  1.22541714,
         0.27382832,  0.54213581,  0.81162553,  1.11142821,  1.37846704,
         0.31571689,  0.59709682,  0.88592429,  1.1756053 ,  1.4668417 ,
         0.32748151,  0.64226089,  0.96178112,  1.27954512,  1.59270215,
         0.351788  ,  0.69421358,  1.03447475,  1.37446094,  1.71337757,
         0.37403278,  0.74123626,  1.10255957,  1.47138143,  1.82914681,
         0.39505706,  0.78491616,  1.17298636,  1.5632153 ,  1.95781417,
         0.41994433,  0.83203492,  1.24831305,  1.65365725,  2.07403626,
         0.44153647,  0.88113217,  1.31550698,  1.75546775,  2.19428387,
         0.46523633,  0.92654696,  1.387639  ,  1.84644957,  2.3136857 ,
         0.49277291,  0.97584014,  1.46282144,  1.94542832,  2.4295774 ,
         0.51450515,  1.02397599, 

In [9]:
pipe.best_estimator_

In [10]:
pipe.best_params_

{'rf__max_depth': 30, 'rf__max_features': 2, 'rf__n_estimators': 500}

In [11]:
pipe.best_score_

0.7572894906101438

In [12]:
pipe.score(X_test, test.label_0)

0.762544470120179

In [None]:
test.to_file("/content/drive/MyDrive/fundar_deforestacion/data/predictions_sklearn.geojson")