In [1]:
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the training and test tables from their GeoJSON files (training file
# first, then test), each via geopandas.
train, test = (
    gpd.read_file("./data/train_data_final.geojson"),
    gpd.read_file("./data/test_data_final.geojson"),
)

In [3]:
# Split each table into a feature matrix and a target vector.
# Features: the label-based slice 'NDVI_2000':'NDVI_2019', which in pandas
# includes both endpoint columns and every column stored between them.
# NOTE(review): this relies on the NDVI columns being contiguous and ordered
# in the GeoJSON files -- confirm no unrelated column sits inside that range.
# Target: the 'label_0' column.
X_train, y_train = train.loc[:, 'NDVI_2000':'NDVI_2019'], train['label_0']
X_test, y_test = test.loc[:, 'NDVI_2000':'NDVI_2019'], test['label_0']

In [4]:
# Two-step model pipeline: standardize the features, then classify with a
# random forest. The step names ('preproc_scaling', 'rf') are the prefixes used
# to address hyper-parameters in the grid search (e.g. 'rf__max_depth').
# NOTE(review): tree ensembles are generally insensitive to per-feature scaling,
# so the scaler is probably redundant; it is kept to preserve the pipeline layout.
pipe = Pipeline(
    [
        ('preproc_scaling', StandardScaler()),
        # Seed the forest so the search results, the reported metrics and the
        # saved model are reproducible on re-run. 123 matches the seed already
        # used by the KFold splitter passed to GridSearchCV.
        ('rf', RandomForestClassifier(random_state=123)),
    ]
)

In [5]:
# Hyper-parameter grid for the 'rf' pipeline step. The number of candidates is
# 5 (n_estimators) x n_features (max_features) x 6 (max_depth).
parameters = {
    # Forest sizes: 100, 200, 300, 400, 500 trees.
    'rf__n_estimators': list(range(100, 501, 100)),
    # Try every possible feature count per split: 1 .. number of feature columns.
    'rf__max_features': np.arange(X_train.shape[1]) + 1,
    # Maximum tree depths, from a single split up to 75 levels.
    'rf__max_depth': [1, 10, 25, 30, 50, 75],
}

In [6]:
# Exhaustive search over `parameters`, scoring each candidate by macro-averaged
# F1 on a shuffled 5-fold split (fixed seed 123 so the folds are repeatable).
# `return_train_score` and `refit` are deliberately left at their defaults;
# the later predict/dump cells rely on the search refitting the best candidate.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring='f1_macro',
    cv=KFold(n_splits=5, shuffle=True, random_state=123),
    n_jobs=16,  # number of parallel workers used for the fits
    verbose=1,  # print the one-line "Fitting ... folds" summary
)


In [None]:
# Run the cross-validated grid search on the training data. This is the
# expensive cell: the recorded run below reports 2850 individual fits.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 570 candidates, totalling 2850 fits


In [None]:
# Predict labels for the held-out test features with the fitted search object.
y_pred = grid_search.predict(X=X_test)

In [None]:
# Print the text classification report comparing the test labels with the
# predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Persist the whole fitted GridSearchCV object (not just the best estimator)
# with joblib. Create the target directory first: without it, dump() raises
# FileNotFoundError only after the long grid search has already finished.
Path('./models').mkdir(parents=True, exist_ok=True)
dump(grid_search, './models/cv_rf.joblib')