In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler

from joblib import dump, load
import geopandas as gpd

import numpy as np

In [6]:
train = gpd.read_file( "./data/train_data_final.geojson")
test =  gpd.read_file("./data/test_data_final.geojson")

In [7]:
X = train.loc[:,'NDVI_2000':'NDVI_2019']
y = train['label_0']
X_test = test.loc[:,'NDVI_2000':'NDVI_2019']

In [8]:
pipe = Pipeline([('preproc_scaling', StandardScaler()),
                 ('rf', RandomForestClassifier())])

In [9]:
parameters = {'rf__n_estimators':[100, 200, 300, 400, 500],
              'rf__max_features':np.arange(1, X.shape[1]+1),
              'rf__max_depth':[1,10,25,30, 50,75]
             }

pipe = GridSearchCV(pipe, 
                    parameters,
                    verbose=1,
                    cv=KFold(n_splits=5, shuffle=True, random_state=123),
                    n_jobs=16,
                    #return_train_score=True,
                    scoring='f1_macro'                   
                    #refit=False
                   )

In [None]:
pipe.fit(X, y)

Fitting 5 folds for each of 570 candidates, totalling 2850 fits


In [14]:
dump(pipe, './models/cv_rf.joblib') 

['./models/cv_rf.joblib']

In [16]:
pipe.best_estimator_

Pipeline(steps=[('rf',
                 RandomForestClassifier(max_depth=30, max_features=2,
                                        n_estimators=500))])

In [17]:
pipe.best_params_

{'rf__max_depth': 30, 'rf__max_features': 2, 'rf__n_estimators': 500}

In [18]:
pipe.best_score_

0.7569701913345194

In [19]:
pipe.score(X_test, test.label_0)

0.7617997135700626

In [None]:
test.to_file("/content/drive/MyDrive/fundar_deforestacion/data/predictions_sklearn.geojson")