In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler

from joblib import dump, load
import geopandas as gpd

import numpy as np

In [10]:
train = gpd.read_file( "./data/train_data_final.geojson")
test =  gpd.read_file("./data/test_data_final.geojson")

In [44]:
X = train.loc[:,'NDVI_2000':'NDVI_2019']
y = train['label.x']
X_test = test.loc[:,'NDVI_2000':'NDVI_2019']

In [45]:
pipe = Pipeline([('preproc_scaling', StandardScaler()),
                 ('lr', LogisticRegression())])

In [46]:
parameters = {'lr__C':np.arange(0.1, 1.1, 0.05),
             'lr__penalty': ['l1', 'l2', 'elasticnet']}

pipe = GridSearchCV(pipe, 
                    parameters,
                    verbose=1,
                    cv=KFold(n_splits=5, shuffle=True, random_state=123),
                    n_jobs=16,
                    #return_train_score=True,
                    scoring='f1_macro'                   
                    #refit=False
                   )

In [47]:
pipe.fit(X, y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


200 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
100 fits failed with the following error:
Traceback (most recent call last):
  File "/home/laia/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/laia/anaconda3/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/laia/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/laia/anaconda3/lib/python3.9/site-packages/sklearn/linear_mo

GridSearchCV(cv=KFold(n_splits=5, random_state=123, shuffle=True),
             estimator=Pipeline(steps=[('preproc_scaling', StandardScaler()),
                                       ('lr', LogisticRegression())]),
             n_jobs=16,
             param_grid={'lr__C': array([0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 ,
       0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  , 1.05]),
                         'lr__penalty': ['l1', 'l2', 'elasticnet']},
             scoring='f1_macro', verbose=1)

In [38]:
dump(pipe, './models/cv_lr.joblib') 

['./models/cv_lr.joblib']

In [48]:
pipe.best_estimator_

Pipeline(steps=[('preproc_scaling', StandardScaler()),
                ('lr', LogisticRegression(C=0.3500000000000001))])

In [49]:
pipe.best_params_

{'lr__C': 0.3500000000000001, 'lr__penalty': 'l2'}

In [50]:
pipe.best_score_

0.6611110386794122

In [51]:
pipe.score(X_test, test.label_0)

0.6640504270689774

In [43]:
test.to_file("data/predictions_reglog.geojson")