In [6]:
!pip install scikit-learn scipy matplotlib

Collecting scikit-learn
  Downloading scikit_learn-1.0.2-cp37-cp37m-win_amd64.whl (7.1 MB)
Collecting scipy
  Downloading scipy-1.7.3-cp37-cp37m-win_amd64.whl (34.1 MB)
Collecting matplotlib
  Downloading matplotlib-3.5.3-cp37-cp37m-win_amd64.whl (7.2 MB)
Collecting joblib>=0.11
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting cycler>=0.10
  Downloading cycler-0.11.0-py3-none-any.whl (6.4 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.38.0-py3-none-any.whl (965 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.4.4-cp37-cp37m-win_amd64.whl (54 kB)
Collecting typing-extensions; python_version < "3.8"
  Downloading typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Installing collected packages: joblib, threadpoolctl, scipy, scikit-learn, cycler, fonttools, typing-extensions, kiwisolver, matplotlib
Successfully installed cycler-0.11.0 fonttools-4.38.0 joblib-1.2.0 

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from joblib import dump, load
import geopandas as gpd

import numpy as np

In [8]:
train = gpd.read_file( "./data/train_data_final.geojson")
test =  gpd.read_file("./data/test_data_final.geojson")

In [9]:
X = train.loc[:,'NDVI_2000':'NDVI_2019']
y = train['label_0']
X_test = test.loc[:,'NDVI_2000':'NDVI_2019']

In [10]:
pipe = Pipeline([('rf', RandomForestClassifier())])

In [11]:
parameters = {'rf__n_estimators':[100, 200, 300, 400, 500],
              'rf__max_features':np.arange(1, X.shape[1]+1),
              'rf__max_depth':[1,10,25,30, 50,75]
             }

pipe = GridSearchCV(pipe, 
                    parameters,
                    verbose=1,
                    cv=KFold(n_splits=5, shuffle=True, random_state=123),
                    n_jobs=16,
                    #return_train_score=True,
                    scoring='f1_macro'                    
                    #refit=False
                   )

In [12]:
pipe.fit(X, y)

Fitting 5 folds for each of 570 candidates, totalling 2850 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=123, shuffle=True),
             estimator=Pipeline(steps=[('rf', RandomForestClassifier())]),
             n_jobs=16,
             param_grid={'rf__max_depth': [1, 10, 25, 30, 50, 75],
                         'rf__max_features': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                         'rf__n_estimators': [100, 200, 300, 400, 500]},
             scoring='f1_macro', verbose=1)

In [14]:
dump(pipe, './models/cv_rf.joblib') 

['./models/cv_rf.joblib']

In [15]:
pipe.cv_results_

{'mean_fit_time': array([  1.33036275,   2.13338156,   3.52481341,   3.82889085,
          5.03514614,   1.43687577,   2.46068769,   3.90868559,
          5.08323226,   6.91808085,   1.60487885,   3.13357224,
          4.06107888,   5.65531716,   7.25304775,   1.59904037,
          3.17591653,   4.66135759,   6.43047352,   7.84734864,
          1.78446884,   3.52643409,   5.17508168,   7.14109092,
          8.94859166,   2.03589482,   3.82827744,   5.48160272,
          7.35204787,   9.20309749,   1.93482733,   3.85940876,
          5.64241638,   7.69271431,   9.8111794 ,   1.96313939,
          4.24626365,   6.28480592,   8.30175047,  10.17937174,
          2.21922569,   4.50383554,   6.56732268,   8.66636152,
         11.10870562,   2.30482864,   4.79254298,   7.11687355,
          9.45099063,  12.08668065,   2.49060736,   5.02385306,
          7.66444097,  10.12401009,  12.68457494,   2.66612124,
          5.265592  ,   7.79279552,  10.33489504,  12.86540079,
          2.70498538,  

In [16]:
pipe.best_estimator_

Pipeline(steps=[('rf',
                 RandomForestClassifier(max_depth=30, max_features=2,
                                        n_estimators=500))])

In [17]:
pipe.best_params_

{'rf__max_depth': 30, 'rf__max_features': 2, 'rf__n_estimators': 500}

In [18]:
pipe.best_score_

0.7569701913345194

In [19]:
pipe.score(X_test, test.label_0)

0.7617997135700626

In [None]:
test.to_file("/content/drive/MyDrive/fundar_deforestacion/data/predictions_sklearn.geojson")