In [1]:
# Visualization
import ipyleaflet
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd
# Feature Engineering
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Machine Learning
from sklearn.metrics import f1_score, accuracy_score,classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from lazypredict.Supervised import LazyClassifier

# Planetary Computer Tools
import pystac
import pystac_client
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
import odc
import toolz
from odc.stac import stac_load
import planetary_computer as pc
# Do not hard-code the Planetary Computer subscription key in the notebook:
# read it from the PC_SDK_SUBSCRIPTION_KEY environment variable. When the
# variable is not set, no key is configured here.
import os
_pc_subscription_key = os.environ.get("PC_SDK_SUBSCRIPTION_KEY")
if _pc_subscription_key:
    pc.settings.set_subscription_key(_pc_subscription_key)

# Others
import requests
import rich.table
from itertools import cycle
from tqdm import tqdm
tqdm.pandas()


#Get vh_vv function

def get_sentinel_data(latlong,time_slice,assets):
    '''
    Returns mean(VV) + mean(VH) around a location for the requested period.

    A small bounding box centred on the location is searched in the
    "sentinel-1-rtc" collection of the Planetary Computer STAC API. The "vv"
    and "vh" bands of the matching items are loaded, averaged over the
    latitude/longitude dimensions of the box, and added together.

    Attributes:
    latlong - Latitude and longitude, either as a string such as
              "(10.18, 105.32)" or as a 2-element tuple/list (latitude, longitude)
    time_slice - Timeframe for the search, passed as the STAC `datetime`
                 parameter (e.g. "2020-01-01/2020-12-31")
    assets - A list of bands to be extracted. NOTE: currently ignored; "vv" and
             "vh" are always loaded because the result is defined as vv + vh.
             The parameter is kept so existing callers keep working.

    Returns:
    The values of mean(vv) + mean(vh) as an array (presumably one value per
    acquisition in the time slice - confirm against the loaded dataset).

    Raises:
    ValueError - if the search returns no items for the location and period.
    '''
    # Accept both the textual "(lat, lon)" form stored in the CSV files and an
    # actual (lat, lon) pair.
    if isinstance(latlong, str):
        latlong = latlong.replace('(','').replace(')','').replace(' ','').split(',')
    lat = float(latlong[0])
    lon = float(latlong[1])

    # Side of the surrounding box in degrees. With ~111320 m per degree this is
    # about 100 m, i.e. roughly 10x10 pixels at the 10 m resolution used below.
    box_size_deg = 0.0009
    half_box = box_size_deg/2

    # STAC bounding boxes are ordered (min_lon, min_lat, max_lon, max_lat).
    bbox_of_interest = (lon-half_box, lat-half_box, lon+half_box, lat+half_box)

    catalog = pystac_client.Client.open(
        "https://planetarycomputer.microsoft.com/api/stac/v1",
        modifier=pc.sign_inplace,
    )
    search = catalog.search(
        collections=["sentinel-1-rtc"], bbox=bbox_of_interest, datetime=time_slice
    )
    items = list(search.get_all_items())
    if not items:
        raise ValueError(
            "No sentinel-1-rtc items found for bbox {0} and datetime {1}".format(
                bbox_of_interest, time_slice
            )
        )

    # Pixel resolution of the final product. The data is loaded in EPSG:4326,
    # so the resolution has to be expressed in degrees per pixel.
    resolution = 10  # meters per pixel
    scale = resolution / 111320.0 # degrees per pixel for crs=4326

    # Load the data using Open Data Cube
    data = stac_load(items,bands=["vv", "vh"], patch_url=pc.sign, bbox=bbox_of_interest, crs="EPSG:4326", resolution=scale)

    # Calculate the mean of the data across the sample region
    mean = data.mean(dim=['latitude','longitude']).compute()

    # Combined backscatter V = VV + VH
    v = mean.vv + mean.vh

    return v.values

def get_new_features(data,L):
    '''
    Computes summary statistics for every column of `data`.

    Attributes:
    data - DataFrame with one series per column (e.g. one time series per
           location)
    L - Collection of feature names to compute. Recognised names, in the
        order in which they appear in the output:
        'mean', 'std', 'CV', 'max', 'min', 'median', 'amplitude',
        'absolute energy', 'mean diff', 'autocorrelation lag 1'

    Returns:
    A DataFrame with one row per column of `data` and one column per requested
    feature. The feature columns are labelled 0, 1, 2, ... and always follow
    the order listed above, whatever the order of the names in L.

    Notes:
    - 'CV' is computed as mean/std (the reciprocal of the textbook coefficient
      of variation); this definition is kept so existing features do not
      change. A zero std yields inf/nan instead of raising.
    - 'mean diff' is the mean of the consecutive differences of each column.
    '''

    def _safe_ratio(numerator, denominator):
        # IEEE division: a zero denominator gives inf/nan instead of raising.
        with np.errstate(divide='ignore', invalid='ignore'):
            return float(np.float64(numerator) / np.float64(denominator))

    def _mean_diff(series):
        # Mean of x[t+1] - x[t], computed for the series itself.
        values = series.values
        return float(np.mean(values[1:] - values[:-1]))

    # (feature name, function applied to one column), in output order.
    extractors = (
        ('mean', lambda s: float(np.mean(s))),
        ('std', lambda s: float(np.std(s))),
        ('CV', lambda s: _safe_ratio(np.mean(s), np.std(s))),
        ('max', lambda s: float(max(s))),
        ('min', lambda s: float(min(s))),
        ('median', lambda s: float(np.median(s))),
        ('amplitude', lambda s: abs(float(max(s)) - float(min(s)))),
        ('absolute energy', lambda s: float(sum(s**2))),
        ('mean diff', _mean_diff),
        ('autocorrelation lag 1', lambda s: sm.tsa.acf(s)[1]),
    )

    # One list per requested feature, holding one value per column of `data`.
    new_features = [
        [extract(data[col]) for col in data.columns]
        for name, extract in extractors
        if name in L
    ]

    # Transpose so that rows are samples and columns are features.
    return np.transpose(pd.DataFrame(new_features))
        
        
def combine_two_datasets(dataset1,dataset2):
    '''
    Returns the two datasets joined side by side (column-wise).
    The columns of dataset2 are placed after the columns of dataset1; rows
    are matched on their index labels.
    Attributes:
    dataset1 - Dataset whose columns come first
    dataset2 - Dataset whose columns are appended
    '''
    frames = [dataset1, dataset2]
    return pd.concat(frames, axis=1)

def plot_confusion_matrix(true_value,predicted_value,title,labels):
    '''
    Plots a confusion matrix as an annotated heatmap.

    Attributes:
    true_value - The ground truth values for comparison.
    predicted_value - The values predicted by the model.
    title - Title of the plot.
    labels - The tick labels of the x and y axes. When they are the actual
             class values found in the data, the matrix rows/columns are put
             in this order so that every tick names the right class.

    Returns:
    The matplotlib Axes the heatmap was drawn on.
    '''
    tick_labels = list(labels)
    true_classes = set(true_value)
    observed_classes = true_classes | set(predicted_value)

    # confusion_matrix sorts the class values by default, which need not be
    # the order of `labels` (e.g. ['Rice', 'Non Rice']). Force the order of
    # `labels` whenever they cover every observed class; otherwise `labels`
    # are treated as display names only and the default order is kept.
    if true_classes and observed_classes.issubset(tick_labels):
        cm = confusion_matrix(true_value,predicted_value,labels=tick_labels)
    else:
        cm = confusion_matrix(true_value,predicted_value)

    ax = plt.subplot()
    sns.heatmap(cm, annot=True, fmt='g', ax=ax, cmap='Blues')
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title(title)
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels)
    return ax
    
if __name__ == "__main__":

    # Labelled locations; later cells read its 'Latitude and Longitude' and
    # 'Class of Land' columns.
    crop_presence_data = pd.read_csv("Crop_Location_Data.csv")
    # A bare expression inside an `if` body is not echoed by the notebook, so
    # the preview has to be displayed explicitly.
    display(crop_presence_data.head())
    
    

In [105]:
 # Function call to extract VV,VH Values
# Querying the STAC API once per location is very slow, so the extracted table
# is cached on disk: reuse the saved CSV when it exists and only run the
# extraction (and save its result) when the file is missing.
time_slice = "2020-01-01/2020-12-31"
assests = ['vh','vv']
try:
    list_v = pd.read_csv('list_vh+vv_mean10x10.csv')
except FileNotFoundError:
    list_v = []
    for coordinates in tqdm(crop_presence_data['Latitude and Longitude']):
        v=get_sentinel_data(coordinates,time_slice,assests)
        list_v.append(v)
    # Transpose so that every location becomes one column.
    list_v = np.transpose(pd.DataFrame(list_v))
    list_v.to_csv('list_vh+vv_mean10x10.csv', index=False)

100%|██████████| 600/600 [7:09:15<00:00, 42.93s/it]  


In [2]:
# Reload the extracted values from disk instead of recomputing them.
# The commented line below is how the file was written.
#list_v.to_csv('list_vh+vv_mean10x10.csv', index=False)
list_v = pd.read_csv('list_vh+vv_mean10x10.csv')


In [109]:
#list_v=list_v.drop(index=list_v.index[181:])
#list_v=list_v.drop(labels=['Unnamed: 0'], axis=1)

In [3]:
# Inspect the loaded table of extracted values.
list_v

In [4]:
# get_new_features emits its columns in a fixed order, independent of the order
# of the names given here. For these nine names the columns are:
# 0 mean, 1 std, 2 CV, 3 max, 4 min, 5 median, 6 amplitude,
# 7 absolute energy, 8 autocorrelation lag 1.
list_vf = get_new_features(
    list_v,
    [
        'mean',
        'std',
        'absolute energy',
        'median',
        'amplitude',
        'max',
        'min',
        'CV',
        'autocorrelation lag 1',
    ],
)

In [None]:
# Inspect the engineered feature table (one row per column of list_v).
list_vf

In [184]:
# Number of rows in the feature table.
len(list_vf)

600

In [123]:
#anomaly detection
# Drop every sample that has at least one feature with |value| >= the
# threshold, and drop the same row positions from crop_presence_data so that
# the features and the labels stay aligned.
OUTLIER_THRESHOLD = 70
outlier_rows = (list_vf.abs() >= OUTLIER_THRESHOLD).any(axis=1).to_numpy()
to_drop = np.flatnonzero(outlier_rows).tolist()  # unique row positions
list_vf=list_vf.drop(index=list_vf.index[to_drop])
crop_presence_data=crop_presence_data.drop(index=crop_presence_data.index[to_drop])
list_vf=list_vf.reset_index(drop=True)
crop_presence_data=crop_presence_data.reset_index(drop=True)

In [None]:
# Limit DataFrame displays to 10 rows (global pandas option), then show the
# feature table.
pd.set_option('display.max_rows', 10)
list_vf

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.15,0.08,1.85,0.42,0.03,0.14,0.38,2.66,0.67
1,0.16,0.09,1.72,0.40,0.02,0.14,0.38,3.17,0.50
2,0.17,0.09,1.91,0.42,0.02,0.16,0.40,3.54,0.56
3,0.15,0.08,1.82,0.37,0.02,0.14,0.35,2.71,0.66
4,0.17,0.10,1.69,0.47,0.03,0.14,0.44,3.62,0.46
...,...,...,...,...,...,...,...,...,...
595,0.36,0.05,6.67,0.46,0.24,0.37,0.22,11.93,-0.28
596,0.39,0.05,7.14,0.50,0.25,0.40,0.25,14.01,-0.27
597,0.39,0.05,7.43,0.52,0.26,0.39,0.26,13.77,-0.17
598,0.38,0.05,7.76,0.52,0.29,0.37,0.23,13.28,-0.07


In [None]:
# Plot every feature column against the sample index. The previous
# `list_vf[].plot()` had an empty column selection, which is a syntax error.
list_vf.plot()

In [85]:
#list_vf=list_vf.drop(labels=[7], axis=1)

In [5]:
# Join the labelled locations with their engineered features side by side;
# combine_two_datasets concatenates along the columns, matching rows on index.
crop_data = combine_two_datasets(crop_presence_data,list_vf)

In [6]:
#Model building
# Every column except the label and the coordinate string is used as a feature.
X = (
    crop_data
    .drop(columns=['Class of Land','Latitude and Longitude'])
    .values
)
y = crop_data['Class of Land'].values

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=40,
)



In [7]:
#Feature scaling
# Fit exactly ONE scaler on the training split and reuse it for the test split
# (and later for the submission features). Running several scaling cells one
# after another re-scales already scaled data and leaves `sc` fitted on
# transformed values, so the alternatives are offered as a choice instead.
# To try another scaler: change SCALER_NAME, re-run the train/test split cell
# (this cell overwrites X_train/X_test) and then re-run this cell.
SCALERS = {
    'standard': StandardScaler,
    'minmax': MinMaxScaler,
    'maxabs': MaxAbsScaler,
    'robust': RobustScaler,
}
SCALER_NAME = 'standard'

sc = SCALERS[SCALER_NAME]()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [8]:

# Fit all LazyClassifier models on the training split and score them on the
# test split.
# NOTE: this sets the global pandas option so DataFrames are shown without a
# row limit from here on.
pd.set_option('display.max_rows', None)
clf = LazyClassifier(predictions=True)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
    
    #Out sample evaluation
#model=models.axes[0][0]
#outsample_predictions = model.predict(X_test)
#print("Accuracy {0:.2f}%".format(100*accuracy_score(outsample_predictions, y_test)))
    #print(classification_report(y_test, outsample_predictions))
    #plot_confusion_matrix(y_test, outsample_predictions,"Model Level 1: Logistic\nRegression Model Out-Sample Results",['Rice', 'Non Rice'])
    

100%|██████████| 29/29 [00:01<00:00, 26.55it/s]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 1.00               1.00     1.00      1.00   
LabelSpreading                     1.00               1.00     1.00      1.00   
RandomForestClassifier             1.00               1.00     1.00      1.00   
QuadraticDiscriminantAnalysis      1.00               1.00     1.00      1.00   
ExtraTreeClassifier                1.00               1.00     1.00      1.00   
ExtraTreesClassifier               1.00               1.00     1.00      1.00   
LabelPropagation                   1.00               1.00     1.00      1.00   
SVC                                0.99               0.99     0.99      0.99   
PassiveAggressiveClassifier        0.99               0.99     0.99      0.99   
LogisticRegression                 0.99               0.99     0.99      0.99   
BaggingClassifier           




In [None]:
# Candidate model: logistic regression. Overwrites the global `model` that the
# evaluation and submission cells use.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train,y_train)

In [75]:
# Candidate model: SVM with a linear kernel. Overwrites the global `model` that
# the evaluation and submission cells use.
from sklearn import svm
model = svm.SVC(kernel='linear')
model.fit(X_train,y_train)

In [92]:
# Candidate model: AdaBoost. Overwrites the global `model` that the evaluation
# and submission cells use. random_state is fixed (same seed as the train/test
# split) so that re-running the cell reproduces the same model.
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=500,learning_rate=0.01,random_state=40)
model.fit(X_train,y_train)

In [11]:
# Candidate model: random forest. Overwrites the global `model` that the
# evaluation and submission cells use. random_state is fixed (same seed as the
# train/test split) so that re-running the cell reproduces the same model.
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=400,random_state=40)
model.fit(X_train,y_train)

In [137]:
# Candidate model: extremely randomized trees. Overwrites the global `model`
# that the evaluation and submission cells use. random_state is fixed (same
# seed as the train/test split) so that re-running reproduces the same model.
from sklearn.ensemble import ExtraTreesClassifier
model=ExtraTreesClassifier(n_estimators=400,random_state=40)
model.fit(X_train,y_train)

In [26]:
# Candidate model: k-nearest neighbours with k=4. Overwrites the global `model`
# that the evaluation and submission cells use.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=4)
model.fit(X_train,y_train)

In [12]:
#Out sample evaluation
# Score whichever estimator is currently bound to `model` on the held-out split.
outsample_predictions = model.predict(X_test)
print(f"Accuracy {100*accuracy_score(y_test, outsample_predictions):.2f}%")
#print(classification_report(y_test, outsample_predictions))
#plot_confusion_matrix(y_test, outsample_predictions,"Model Level 1: Logistic\nRegression Model Out-Sample Results",['Rice', 'Non Rice'])

Accuracy 98.89%


In [None]:
# Load the submission template; its 'id' column holds the "(lat, lon)" strings
# that are passed to get_sentinel_data and written to the submission file.
test_file = pd.read_csv('challenge_1_submission_template_correct_columns_fixed.csv')
test_file

In [None]:
    
## Get Sentinel-1-RTC Data
# Querying the STAC API once per location is slow, so the extracted table is
# cached on disk: reuse the saved CSV when it exists and only run the
# extraction (and save its result) when the file is missing.
# NOTE(review): this window covers 2019-2020, while the training locations are
# extracted for 2020 only; length-dependent features such as 'absolute energy'
# are then not comparable between the two sets - confirm this is intended.
time_slice = "2019-01-01/2020-12-31"
assests = ['vh','vv']
try:
    submission_v_data = pd.read_csv('submission_vh+vv_ampli_mean10x10.csv')
except FileNotFoundError:
    list_vs = []
    for coordinates in tqdm(test_file['id']):
        vs=get_sentinel_data(coordinates,time_slice,assests)
        list_vs.append(vs)
    # Transpose so that every location becomes one column.
    submission_v_data = np.transpose(pd.DataFrame(list_vs))
    submission_v_data.to_csv('submission_vh+vv_ampli_mean10x10.csv', index=False)


In [34]:
# Reload the extracted submission values from disk instead of recomputing them.
# The commented line below is how the file was written.
#submission_v_data.to_csv('submission_vh+vv_ampli_mean10x10.csv', index=False)
submission_v_data= pd.read_csv('submission_vh+vv_ampli_mean10x10.csv')

In [None]:
# Inspect the loaded submission values. The commented line is a disabled
# column drop that refers to the name `submission_vh_vv_data`, which no cell
# in this notebook defines.
#submission_vh_vv_data = submission_vh_vv_data.drop(columns=[8,9,10])
submission_v_data

In [73]:
# Same feature list as for the training data. get_new_features emits its
# columns in a fixed order, independent of the order of the names given here:
# 0 mean, 1 std, 2 CV, 3 max, 4 min, 5 median, 6 amplitude,
# 7 absolute energy, 8 autocorrelation lag 1.
submission_v_data_f = get_new_features(
    submission_v_data,
    [
        'mean',
        'std',
        'absolute energy',
        'median',
        'amplitude',
        'max',
        'min',
        'CV',
        'autocorrelation lag 1',
    ],
)

In [None]:
# Inspect the engineered submission features.
submission_v_data_f

In [98]:
# Keep exactly the feature columns of the training feature table, in the same
# order. Unconditionally dropping column 7 here while the training features
# still contain it makes the feature counts disagree at sc.transform, and it
# fails with a KeyError when the cell is re-run.
# NOTE(review): assumes the model inputs are the columns of list_vf - confirm
# that crop_presence_data contributes no extra feature columns.
submission_v_data_f=submission_v_data_f[list_vf.columns]

In [99]:
# Feature Scaling
# Apply the scaler that was fitted on the training split.
transformed_submission_data = sc.transform(submission_v_data_f)

#Making predictions
final_predictions = model.predict(transformed_submission_data)
final_prediction_series = pd.Series(final_predictions)

#Combining the results into dataframe
submission_df = pd.DataFrame(
    {
        'id': test_file['id'].values,
        'target': final_prediction_series.values,
    }
)
#Displaying the sample submission dataframe
display(submission_df)


Unnamed: 0,id,target
0,"(10.18019073690894, 105.32022315786804)",Rice
1,"(10.561107033461816, 105.12772097986661)",Rice
2,"(10.623790611954897, 105.13771401411867)",Rice
3,"(10.583364246115156, 105.23946127195805)",Non Rice
4,"(10.20744446668854, 105.26844107128906)",Rice
5,"(10.3101001821917, 105.50872812216863)",Non Rice
6,"(10.823197068175638, 105.2026687367572)",Non Rice
7,"(10.579730415477876, 105.15179510783753)",Rice
8,"(10.583364246115156, 105.2390070431284)",Non Rice
9,"(10.620156781317617, 105.11681948795524)",Rice


In [100]:
#Submission file
# Write the id/target predictions to a CSV file, without the DataFrame index.
submission_df.to_csv("challenge_1_submission_rice_crop_prediction.csv",index = False)