In [None]:
import os
import os.path as pa
import numpy as np
import pandas as pd
import glob
import logging
import time
import collections
import matplotlib.pyplot as plt
%matplotlib inline

import datetime as dt
import geopandas as gpd
from pandas.api.types import is_string_dtype
# import seaborn as sn

import math
from tqdm import tqdm

from math import sqrt
import json
import geopandas as gpd
import pickle
import tsfresh
from shapely.geometry import mapping
from tqdm.contrib.concurrent import process_map 



In [1]:
from pygeosys.timeserie.smoothers import  whitw

C:\Users\qle\AppData\Local\miniconda3\envs\text\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
C:\Users\qle\AppData\Local\miniconda3\envs\text\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [None]:
# Project root: the relative "data/..." paths used in the cells below are resolved from
# the working directory set here.
# The location can be overridden with the TILLAGE_PROJECT_DIR environment variable so the
# notebook can run on another machine; without it the original hard-coded path is used.
PROJECT_DIR = os.environ.get(
    'TILLAGE_PROJECT_DIR',
    r'C:/Users/lwh/Documents/PROJECTS/tillage_detection/truterra_carbon',
)
os.chdir(PROJECT_DIR)

## Training

### Prepare TS

In [None]:
def apply_smoother(x, beta=10000):
    """Smooth one time-series column with ``whitw``, flagging missing samples via weights.

    Parameters
    ----------
    x : tuple
        A ``(name, Series)`` pair as yielded by ``DataFrame.items()``; only the
        Series at position 1 is read.
    beta : numeric, default 10000
        Forwarded unchanged to ``whitw`` (presumably the smoothing strength;
        confirm against the pygeosys documentation).

    Returns
    -------
    Whatever ``whitw`` returns (assumed to be an array aligned with the input
    samples; TODO confirm).
    """
    series = x[1]
    # Weight vector: 1 where a value is present, 0 where it is NaN.
    observed = series.notna().astype(int)
    # NaNs are replaced by 0.0 before the call; their weight is 0.
    gap_filled = series.fillna(0.0)
    return whitw(gap_filled.values, observed.values, alpha=3, beta=beta)

In [None]:
def decibel_to_linear(band):
    """Convert decibel values to linear units using ``10 ** (dB / 10)``.

    Parameters
    ----------
    band : array-like
        Values in decibels (list, NumPy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        Linear-scale values; any pandas index on the input is not carried over.
    """
    exponent = np.asarray(band) / 10
    return np.power(10, exponent)

In [None]:
## Load the dataframe covering several years

# Stored as a pickle.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
training_pickle_path = "data/02_clean/2016-to-2020-SAR_training_corrected_angle_indices_remaped.pkl"
with open(training_pickle_path, "rb") as pkl_file:
    gdf = pickle.load(pkl_file)

In [None]:
# Columns to convert from decibels to linear units: five statistics for each of VH and VV.
cols_to_convert = [
    f'{polarisation}-{statistic}'
    for polarisation in ('VH', 'VV')
    for statistic in ('max', 'mean', 'median', 'min', 'stdDev')
]

In [None]:
# Replace each column listed in cols_to_convert by its linear-unit equivalent.
# NOTE: not idempotent; re-running this cell converts the already-converted values again.
for db_col in cols_to_convert:
    linear_values = decibel_to_linear(gdf[db_col])
    gdf[db_col] = linear_values

In [None]:
# Parse the 'date' column (YYYYMMDD) into pandas datetimes so it can be compared with
# date strings and later used as a resampling index.
parsed_dates = pd.to_datetime(gdf['date'], format='%Y%m%d')
gdf['date'] = parsed_dates


In [None]:
# Split the observations into one frame per season; each season ends on 30 June.
# Upper bounds are inclusive so that a row dated exactly on a boundary day (e.g.
# 2017-06-30) belongs to the season ending that day; with strict comparisons on both
# sides such rows match no season and are silently lost.
# set_index() returns a new frame rather than mutating a filtered slice in place, which
# avoids pandas' SettingWithCopyWarning.
gdf_2017 = gdf[gdf['date'] <= '2017-06-30'].set_index('date')
gdf_2018 = gdf[(gdf['date'] > '2017-06-30') & (gdf['date'] <= '2018-06-30')].set_index('date')
gdf_2019 = gdf[(gdf['date'] > '2018-06-30') & (gdf['date'] <= '2019-06-30')].set_index('date')
gdf_2020 = gdf[gdf['date'] > '2019-06-30'].set_index('date')

In [None]:
# Columns to smooth/interpolate: five statistics for each band. Change your columns here.
cols_to_interpolate = [
    f'{band_name}-{statistic}'
    for band_name in ('DiffVVVH', 'VH', 'VV', 'angle')
    for statistic in ('max', 'mean', 'median', 'min', 'stdDev')
]

In [None]:
def _smooth_field(field_obs, cols_to_interpolate, beta):
    """Upsample one field's observations to a daily grid and smooth the requested columns."""
    daily = field_obs.resample('D').asfreq()  # 'Y' for yearly
    for col in cols_to_interpolate:
        # apply_smoother expects a (name, Series) pair, as yielded by DataFrame.items()
        daily[col] = apply_smoother((col, daily[col]), beta=beta)
    return daily


def treat_gdf(gdf, cols_to_interpolate, beta=1000):
    """Resample every field to daily frequency and smooth the selected columns.

    Parameters
    ----------
    gdf : DataFrame
        Observations indexed by date (the index must support ``resample('D')``),
        with an 'id' column identifying the field.
    cols_to_interpolate : list of str
        Columns to pass through ``apply_smoother``.
    beta : numeric, default 1000
        Value forwarded to ``apply_smoother``.

    Returns
    -------
    DataFrame
        The per-field daily frames stacked in order of first appearance of each id.
        Columns not listed in ``cols_to_interpolate`` are NaN on the inserted days.
        An empty frame with the input's columns is returned when there is nothing
        to process or every field failed.

    Notes
    -----
    Processing is best-effort: a field that raises is reported and skipped rather than
    aborting the run. Only ``Exception`` is caught, so Ctrl-C still interrupts the loop.
    """
    smoothed_fields = []
    for field in tqdm(gdf['id'].unique()):
        try:
            field_obs = gdf.loc[gdf['id'] == field]
            smoothed_fields.append(_smooth_field(field_obs, cols_to_interpolate, beta))
        except Exception as exc:
            # Report why the field was skipped instead of printing only its id.
            print(f'{field}: skipped ({type(exc).__name__}: {exc})')

    if not smoothed_fields:
        # pd.concat raises on an empty list; hand back an empty frame instead.
        return gdf.iloc[0:0].copy()

    # Concatenate once at the end: growing a frame inside the loop is quadratic.
    return pd.concat(smoothed_fields, axis=0)

In [None]:
# Smooth each season separately, then stack the four seasons into a single frame.
seasonal_frames = (gdf_2017, gdf_2018, gdf_2019, gdf_2020)
resampled_2017, resampled_2018, resampled_2019, resampled_2020 = [
    treat_gdf(season_gdf, cols_to_interpolate) for season_gdf in seasonal_frames
]
gdf_resampled = pd.concat(
    [resampled_2017, resampled_2018, resampled_2019, resampled_2020],
    axis=0,
)

In [None]:
# The daily upsampling left NaNs in the qualitative columns on the inserted rows;
# carry the last known value forward so each inserted row inherits the preceding row's values.
qualitative_cols = ['id', 'TillageType', 'Tillagedate', 'CoverCrop', 'field_id', 'State',
                    'Tillage_newType']
gdf_resampled[qualitative_cols] = gdf_resampled[qualitative_cols].ffill()
# Move the index back into a regular column (in place).
gdf_resampled.reset_index(inplace=True)

### Prepare datasets

In [None]:
# Dataframe work
gdf = gdf_resampled
# filter data by date if required
gdf['date'] = pd.to_datetime(gdf['date'], format='%Y%m%d')

# Temporal split. The test side is inclusive so that rows dated exactly on the split day
# are kept: with strict comparisons on both sides they end up in neither set.
SPLIT_DATE = '2019-10-30'
gdf_train = gdf[gdf['date'] < SPLIT_DATE]
gdf_test = gdf[gdf['date'] >= SPLIT_DATE]

# Drop unwanted columns (including the target) from the time-series frames.
# NOTE(review): 'TillageType', 'Tillagedate' and 'CoverCrop' remain in these frames;
# confirm they are not proxies for the target before trusting the scores.
COLS_TO_DROP = ['field_id', 'State', 'Tillage_newType']
timeseries_train = gdf_train.drop(columns=COLS_TO_DROP)
timeseries_test = gdf_test.drop(columns=COLS_TO_DROP)
timeseries = gdf.drop(columns=COLS_TO_DROP)


def _labels_per_field(frame):
    """Return the target as a Series indexed by field id (first occurrence of each id)."""
    labels = frame[['id', 'Tillage_newType']]  # field id and target columns
    labels = labels.drop_duplicates(subset=['id'], keep='first')
    return labels.set_index('id')['Tillage_newType']


# store labels (could store several different labels if needed)
y_train = _labels_per_field(gdf_train)
y_test = _labels_per_field(gdf_test)

# prepare time series: index the train/test frames by date
timeseries_train = timeseries_train.set_index(['date'])
timeseries_test = timeseries_test.set_index(['date'])

### Plot time series

In [None]:
# Visual sanity check: draw the columns of one field (id 90) as stacked subplots
# sharing the x axis.
single_field = timeseries.loc[timeseries['id'] == 90]
single_field.plot(subplots=True, sharex=True, figsize=(10, 20))
plt.show()

### Handle NaNs

In [None]:
def _fill_nans_with_column_mean(frame):
    """Return a copy of ``frame`` in which each column's NaNs are replaced by that column's mean."""
    filled = frame.copy()
    for col in filled.columns:
        # Assign the result back rather than calling fillna(inplace=True) on the column
        # selection: that chained in-place form is deprecated in pandas and does not
        # modify the frame when Copy-on-Write is enabled.
        filled[col] = filled[col].fillna(filled[col].mean())
    return filled


# in the whole time series
# 'date' is already a regular column of this frame, so the row index is only renumbered;
# reset_index without drop=True would add a spurious 'index' column here.
timeseries = _fill_nans_with_column_mean(timeseries).reset_index(drop=True)

# in the train / test time series: fill the NaNs, then move the date index back to a column
timeseries_train = _fill_nans_with_column_mean(timeseries_train).reset_index(0)
timeseries_test = _fill_nans_with_column_mean(timeseries_test).reset_index(0)


### Training classifier with a pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tsfresh.transformers import RelevantFeatureAugmenter
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Fixed seed so that the fitted forest, and therefore the reported scores, are
# reproducible from one run of the notebook to the next.
RANDOM_STATE = 42

# Two-step pipeline: tsfresh feature extraction/selection per field id, then a random forest.
pipeline = Pipeline([
    ('augmenter', RelevantFeatureAugmenter(column_id='id', column_sort='date')),
    ('classifier', RandomForestClassifier(random_state=RANDOM_STATE)),
])

# The feature matrix starts empty: it only carries the field ids as its index.
X_train = pd.DataFrame(index=y_train.index)

## /!\ very important step : make sure the right time series is used here
pipeline.set_params(augmenter__timeseries_container=timeseries_train)
pipeline.fit(X_train, y_train)


In [None]:
# saving model pipeline
import pickle
model_path = "data/07_models/SAR-2019correctedangle_pipeline_resampled_dates_interpol_linear"
# Serialise the fitted pipeline so it can be reloaded later.
# NOTE(review): the augmenter still references timeseries_train at this point, so that
# frame is presumably written into the pickle as well; confirm if file size matters.
with open(model_path, "wb") as model_file:
    pickle.dump(pipeline, model_file)

### Check results

In [None]:
# Predict on the test data.
# The feature matrix only carries the test field ids as its index.
X_test = pd.DataFrame(index=y_test.index)

## /!\ very important step : make sure the right time series is used here
pipeline.set_params(augmenter__timeseries_container=timeseries_test)
y_pred = pipeline.predict(X_test)


In [None]:
from sklearn.metrics import classification_report

# Build the classification report for the test fields, then display it
test_report = classification_report(y_test.values, y_pred)
print(test_report)

In [None]:
# Plot confusion matrix
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

# Use the pipeline's class order both to compute the matrix and to label its axes.
class_labels = pipeline.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()

plt.show()


In [None]:
# Compute class probabilities for the test fields
pipeline.set_params(augmenter__timeseries_container=timeseries_test)
X_test = pd.DataFrame(index=y_test.index)
proba = pipeline.predict_proba(X_test)
# Store the result in a dataframe; its columns get default integer labels.
proba_df = pd.DataFrame(proba)

In [None]:
 # rename columns with actual pipeline classes
# Label all probability columns in a single assignment. Renaming one column at a time
# corrupts the labels when the classes are integers that collide with the default
# positional column names (classes [1, 2]: renaming 0 -> 1 gives columns [1, 1], and the
# following 1 -> 2 then renames both). A direct assignment is also safe to re-run.
# predict_proba returns its columns in the order of pipeline.classes_.
proba_df.columns = list(pipeline.classes_)

In [None]:
# Assemble one row per test field: predicted class, field id with its true label, and
# the class probabilities. The three pieces are joined on their default integer index
# (assumes predictions and probabilities come back in the order of y_test; verify).
df_pred = pd.DataFrame(y_pred, columns=['Prediction'])
truth = y_test.reset_index()
df = df_pred.join(truth).join(proba_df)
df.head()

In [None]:
# Write the per-field results table to CSV
results_csv = 'data/09_processed/results_test_newmodel.csv'
df.to_csv(results_csv)