In [1]:
import rasterio
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgbm

from typing import Any, Dict, Optional, List
from tqdm import tqdm as tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import utils
import main
import sys

## Arguments to be passed

In [2]:

# Input location: the directory searched by utils.files_in_dir for the
# "label.tif" / "feat.tif" rasters.
data_dir = "gs://earth-engine-seminar/urbanization/data/export_22122024"

# Sampling / split settings, forwarded unchanged to main.pre_process.
block_coverage = 0.3
total_blocks = 100
test_size = 0.3

# Output settings, forwarded unchanged to main.save_prediction.
filter_size = 5
output_path = "prediction_tiff.tiff"

## Loading the input files 

In [3]:

# List the label and feature rasters found under data_dir. One listing per
# suffix is enough: both the training and the test scene are picked from them.
labels = utils.files_in_dir(data_dir, "label.tif")
features = utils.files_in_dir(data_dir, "feat.tif")

# Later cells read `.data` and `.metadata` from train_labels / train_features,
# so these must be loaded rasters (as the test ones are), not file listings.
# NOTE(review): the training scene index is an assumption -- index 0 is the
# scene used for testing, so the next file is taken for training. Confirm which
# exported file is meant for training, and that both listings share one order.
TEST_FILE_INDEX = 0
TRAIN_FILE_INDEX = 1
assert len(labels) > TRAIN_FILE_INDEX and len(features) > TRAIN_FILE_INDEX, (
    "expected separate training and test rasters under data_dir"
)

train_labels = utils.load_tif_data(labels[TRAIN_FILE_INDEX])
train_features = utils.load_tif_data(features[TRAIN_FILE_INDEX])
test_labels = utils.load_tif_data(labels[TEST_FILE_INDEX])
test_features = utils.load_tif_data(features[TEST_FILE_INDEX])




## Pre-processing

### Removing the duplicate feature from the train and test feature sets

In [None]:

def drop_duplicate_band(bands, duplicate_index=1):
    """Return a new band stack without the band at ``duplicate_index``.

    ``bands`` is indexed along axis 0; the remaining axes are kept as they are.
    Slicing (rather than indexing a single band and reshaping it) preserves the
    leading axis, so the raster's height/width metadata is not needed.

    Parameters
    ----------
    bands : array-like that supports slicing along axis 0
    duplicate_index : int, position of the band to leave out (default 1)

    Returns
    -------
    The bands before ``duplicate_index`` followed by the bands after it,
    concatenated along axis 0.
    """
    return np.concatenate(
        (bands[:duplicate_index], bands[duplicate_index + 1:]), axis=0
    )


# Band 1 is dropped from both scenes; band 0 and bands 2.. are kept in order.
train_feature = drop_duplicate_band(train_features.data)
test_feature = drop_duplicate_band(test_features.data)

In [6]:
# Run the project's pre-processing on both scenes and unpack its nine results:
# feature/label pairs for train, test and validation, followed by their masks.
(
    x_train, y_train,
    x_test, y_test,
    x_val, y_val,
    train_mask, test_mask, val_mask,
) = main.pre_process(
    train_feature,
    train_labels.data,
    test_feature,
    test_labels.data,
    block_coverage,
    total_blocks,
    test_size,
)


block size 121


  new_data.append(np.concatenate(((np.nanmean(mean_data,axis=(1,2))),static_data),axis=0 ))
100%|██████████| 226/226 [07:41<00:00,  2.04s/it] 
100%|██████████| 98/98 [00:08<00:00, 11.17it/s]
  new_data.append(np.concatenate(((np.nanmean(mean_data,axis=(1,2))),static_data),axis=0 ))
100%|██████████| 2207/2207 [04:13<00:00,  8.72it/s]


(26442, 15) (26442,)
(4932645, 15) (4932645,)
(11466, 15) (11466,)
(18200, 15) (18200,)
(3304360, 15) (3304360,)
(3304360, 15) (3304360,)


### Correlation matrix

In [None]:
# Stack the features and the label into one table (label in the last column).
# A separate column view of the labels is used so that y_train itself keeps its
# shape for the training cell further down.
y_train_column = y_train.reshape(len(y_train), 1)
train = np.concatenate((x_train, y_train_column), axis=1)

# Normalizing features (printed as a sanity check: means ~0, stds ~1)
col_means = np.mean(train, axis=0)
train_normalized = (train - col_means) / np.std(train, axis=0)
print(np.mean(train_normalized, axis=0))
print(np.std(train_normalized, axis=0))

# Pearson correlation is unchanged by per-column shifting and scaling, so it is
# computed directly on the un-normalized table.
coerr = np.corrcoef(train, rowvar=False)

# Human-readable names for the table columns.
# NOTE(review): this list has 9 entries, while the pre-processing output above
# reports 15 feature columns (+1 label). Extend it to one name per column.
column_names = ['Palmer Drought Severity Index','Precipitation accumulation',
'min temp','max temp','16-day NDVI avg','16-day EVI avg','LC_Type1','population_density','urban']

# Only use the names when there is exactly one per column; a shorter list would
# silently label just the first rows/columns of the heatmap with unverified names.
if len(column_names) == coerr.shape[0]:
    tick_labels = column_names
else:
    print(
        f"{len(column_names)} column names for {coerr.shape[0]} columns; "
        "falling back to column indices on the heatmap axes"
    )
    tick_labels = "auto"

plt.figure(figsize=(12, 8))
sns.heatmap(coerr, annot=True, xticklabels=tick_labels, yticklabels=tick_labels,
            cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

## Training

In [7]:

# Four-class LightGBM classifier, fitted through the project's training helper
# with the validation split passed alongside the training split.
model = lgbm.LGBMClassifier(
    objective="multiclass",
    num_class=4,
)
main.train(model, x_train, y_train, x_val, y_val)




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000569 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3483
[LightGBM] [Info] Number of data points in the train set: 18200, number of used features: 15
[LightGBM] [Info] Start training from score -6.551080
[LightGBM] [Info] Start training from score -7.611952
[LightGBM] [Info] Start training from score -6.198259
[LightGBM] [Info] Start training from score -0.003964


## Testing

In [8]:
# Score the fitted model on the test split; the helper returns the predictions
# together with a confusion matrix and a report, shown in the cells below.
test_pred, cm, report = main.predict(model, x_test, y_test)



## Results

In [9]:
# Per-class precision / recall / F1 report returned by main.predict for the test split.
print(report)

              precision    recall  f1-score   support

           0       0.01      0.04      0.02      8586
           1       0.01      0.06      0.01      2260
           2       0.02      0.11      0.04      6524
           3       1.00      0.98      0.99   3286990

    accuracy                           0.98   3304360
   macro avg       0.26      0.30      0.27   3304360
weighted avg       0.99      0.98      0.98   3304360



In [10]:
# Confusion matrix returned by main.predict for the test split.
# In the recorded output each row sums to that class's support in the report,
# i.e. rows are the true classes and columns the predicted ones.
print(cm)

[[    318     489     152    7627]
 [    127     143     358    1632]
 [    213     115     717    5479]
 [  20568   16523   28307 3221592]]


## Saving

In [14]:
# Write the test-scene predictions to output_path via the project's save helper.
main.save_prediction(
    test_labels,
    filter_size,
    test_mask,
    test_pred,
    output_path,
)