In [1]:
import numpy as np
from collections import Counter
import pandas as pd
from osgeo import gdal
from timeit import default_timer as timer
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

In [2]:
def open_image(path, data1):
    """Read the raster ``path + data1 + ".tif"`` into a NumPy array.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated as-is, so it must already end
        with a path separator.
    data1 : str
        File name without the ``.tif`` extension.

    Returns
    -------
    numpy.ndarray
        Whatever ``ReadAsArray()`` yields for the dataset, wrapped in
        ``np.array``.

    Raises
    ------
    OSError
        If GDAL returns no dataset for the file (missing or unreadable).
    """
    filename = path + data1 + ".tif"
    dataset = gdal.Open(filename)
    # gdal.Open signals failure by returning None (unless GDAL exceptions are
    # enabled); fail here with the file name instead of an AttributeError
    # on the next line.
    if dataset is None:
        raise OSError("GDAL could not open raster: " + filename)
    imarray = np.array(dataset.ReadAsArray())
    return imarray

In [3]:
# Mask and image must describe the same pixels; img_to_df verifies the pixel counts.
def img_to_df(mask, img):
    """Flatten a label mask and an image stack into one row-per-pixel DataFrame.

    Parameters
    ----------
    mask : numpy.ndarray
        Label raster; flattened into the ``label`` column.
    img : numpy.ndarray
        Image indexed as ``img[band, ...]``. A 2-D array is treated as a
        single band.

    Returns
    -------
    pandas.DataFrame
        Columns ``label, b0, b1, ...`` with one row per pixel.

    Raises
    ------
    ValueError
        If a band does not have the same number of pixels as the mask.
    """
    if img.ndim == 2:
        # Single-band raster: add the band axis so the loop below applies.
        img = img[np.newaxis, :, :]

    print(Counter(mask.flatten()))
    labels = mask.flatten()

    # Collect all columns first and build the frame once; inserting 200+
    # columns one at a time fragments the DataFrame.
    columns = {'label': labels}
    for i in range(0, img.shape[0]):
        band = img[i].ravel()
        if band.size != labels.size:
            raise ValueError(
                "band %d has %d pixels but the mask has %d"
                % (i, band.size, labels.size)
            )
        columns['b' + str(i)] = band
    return pd.DataFrame(columns)

In [4]:
def remove_nans(df00):
    """Return a cleaned copy of ``df00``: labels recoded, NaN rows dropped.

    The ``label`` column is cast to int and 0 is recoded to -1
    (-1 background, 1 fire). Rows containing a NaN in any column are then
    removed and ``label`` is placed first, followed by the remaining columns
    in their original order. The input frame is left unmodified.

    Parameters
    ----------
    df00 : pandas.DataFrame
        Frame with a ``label`` column plus feature columns.

    Returns
    -------
    pandas.DataFrame
        New frame; surviving rows keep their original index labels.
    """
    # -1 background, 1 fire
    labels = df00['label'].astype(int).replace(0, -1)

    # drop() returns a new frame, so the caller's DataFrame is not mutated.
    cleaned = df00.drop(columns='label')
    cleaned.insert(0, 'label', labels.values)

    # drops NaNs
    return cleaned.dropna()

In [5]:
def path_to_df(ipath, iname, mpath, mname):
    """Load one mask/image pair and return the cleaned per-pixel DataFrame.

    The mask ``mpath + mname`` is opened first, then the image
    ``ipath + iname``; both go through ``img_to_df`` and ``remove_nans``.
    The mask name and the resulting shape are printed along the way.
    """
    print (mname)
    mask = open_image(mpath, mname)
    bands = open_image(ipath, iname)
    frame = remove_nans(img_to_df(mask, bands))
    print (frame.shape)
    return frame

In [6]:
# Root folders of the yearly MODIS stacks and of the matching fire masks.
ipath = 'C:/Users/lql/Desktop/rs_petsc/ca_data/modis/'
mpath = 'C:/Users/lql/Desktop/rs_petsc/ca_data/fire_rasters/'

# One cleaned DataFrame per year that loaded successfully.
data = []

for i in range(0, 21):
    # Two-digit year suffix: modis_2000 .. modis_2020 / fire00 .. fire20.
    iname = 'modis_20{:02d}'.format(i)
    mname = 'fire{:02d}'.format(i)
    print (iname)
    try:
        df = path_to_df(ipath, iname, mpath, mname)
    except Exception as err:
        # Best-effort load: skip the year, but report why it failed.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long loop.
        print ('Error', i, repr(err))
    else:
        data.append(df)



modis_2000
fire00
Counter({0: 4824892, 1: 7008})
(452478, 208)
modis_2001
fire01
Error 1
modis_2002
fire02
Counter({0: 4818747, 1: 13153})
(1870710, 208)
modis_2003
fire03
Counter({0: 4808116, 1: 23784})
(1261062, 208)
modis_2004
fire04
Counter({0: 4823540, 1: 8360})
(1868492, 208)
modis_2005
fire05
Counter({0: 4823959, 1: 7941})
(1471772, 208)
modis_2006
fire06
Counter({0: 4812433, 1: 19467})
(1827588, 208)
modis_2007
fire07
Counter({0: 4806641, 1: 25259})
(1964512, 208)
modis_2008
fire08
Counter({0: 4795777, 1: 36123})
(1848880, 208)
modis_2009
fire09
Counter({0: 4820543, 1: 11357})
(1854346, 208)
modis_2010
fire10
Counter({0: 4828290, 1: 3610})
(1405527, 208)
modis_2011
fire11
Counter({0: 4825293, 1: 6607})
(1600628, 208)
modis_2012
fire12
Counter({0: 4811191, 1: 20709})
(1643565, 208)
modis_2013
fire13
Counter({0: 4817168, 1: 14732})
(1963539, 208)
modis_2014
fire14
Counter({0: 4817405, 1: 14495})
(1925184, 208)
modis_2015
fire15
Counter({0: 4811530, 1: 20370})
(1894947, 208)
modis

In [7]:
# Stack the per-year frames row-wise into one table (row-wise is concat's default axis).
mod = pd.concat(data)

In [10]:
# Only a cell's last expression is displayed, so the shape is printed explicitly.
print (mod.shape)
mod['label'].value_counts()

-1    33202600
 1      347980
Name: label, dtype: int64

# Train/Test Sets
* Contiguous random samples of 2%, 5%, 10%, and 20% of the data

In [21]:
import random

def random_grid(mod, percent, seed=None):
    """Return a random contiguous block of rows covering ``percent`` of ``mod``.

    Parameters
    ----------
    mod : pandas.DataFrame
        Source table; must contain a ``label`` column (used for the printed
        class counts).
    percent : float
        Fraction of rows to keep, between 0 and 1.
    seed : int, optional
        When given, the window start is drawn from a private
        ``random.Random(seed)`` so the same window can be reproduced.
        When omitted, the global ``random`` state is used as before.

    Returns
    -------
    pandas.DataFrame
        Rows ``start:end`` of ``mod`` by position.

    Raises
    ------
    ValueError
        If ``percent`` is outside ``[0, 1]``.
    """
    if not 0 <= percent <= 1:
        raise ValueError("percent must be between 0 and 1, got %r" % (percent,))

    ls = mod.shape[0]
    am = round(ls * percent)
    rng = random if seed is None else random.Random(seed)
    # randint is inclusive on both ends, so the window always fits.
    start = rng.randint(0, ls - am)
    end = start + am
    print (start, end)
    # Positional slice: the concatenated frame may carry repeated index labels.
    mod2 = mod.iloc[start:end]
    print (mod2.shape)
    print (mod2['label'].value_counts())
    return mod2

# 5% contiguous sample of the stacked table.
mod_05 = random_grid(mod, percent=0.05)

8124582 9802111
(1677529, 208)
-1    1662655
 1      14874
Name: label, dtype: int64


In [29]:
# 10% contiguous sample of the stacked table.
mod_10 = random_grid(mod, percent=0.10)

8119854 11474912
(3355058, 208)
-1    3304210
 1      50848
Name: label, dtype: int64


In [94]:
# NOTE(review): `sample` is not referenced by any cell visible here - confirm it is still needed.
sample = 30000

# 2% contiguous sample of the stacked table.
mod_02 = random_grid(mod, percent=0.02)



28218341 28889353
(671012, 208)
-1    646584
 1     24428
Name: label, dtype: int64


# Get Train/Test Grids

In [34]:
# split into train/test groups
from sklearn.model_selection import train_test_split

def train_test_grids(mod11):
    """Split ``mod11`` into the first 75% of rows (train) and the last 25% (test).

    The split is not shuffled, so row order is kept. The ``label`` column
    stays in both parts; its class counts are printed for each part.

    Parameters
    ----------
    mod11 : pandas.DataFrame
        Table containing a ``label`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(Xtrain, Xtest)``.
    """
    # Split the frame directly: no defensive copy and no separate label array
    # are needed because labels travel inside the frame. random_state is
    # omitted on purpose - it has no effect when shuffle=False.
    Xtrain, Xtest = train_test_split(mod11, test_size=0.25, shuffle=False)
    print ('Xtrain ********************')
    print (Xtrain['label'].value_counts())
    print ('Xtest ********************')
    print (Xtest['label'].value_counts())

    return Xtrain, Xtest
    
    

In [95]:
# Ordered 75/25 split of the 2% sample.
(tr_mod02, te_mod02) = train_test_grids(mod11=mod_02)

Xtrain ********************
-1    489899
 1     13360
Name: label, dtype: int64
Xtest ********************
-1    156685
 1     11068
Name: label, dtype: int64


In [96]:
# Export the 2% train/test grids as PETSc binary files.
# df_to_petsc is defined further down in this file; that cell must have run first.
path = 'C:\\Users\\lql\\Desktop\\rs_petsc\\ca_data\\permon_data\\'

name = path + 'ca_modis02_train.bin'
df_to_petsc(df=tr_mod02, name=name)

name = path + 'ca_modis02_test.bin'
df_to_petsc(df=te_mod02, name=name)

(<503259x207 sparse matrix of type '<class 'numpy.float32'>'
	with 104173150 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))
(<167753x207 sparse matrix of type '<class 'numpy.float32'>'
	with 34724292 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))


In [36]:
# Ordered 75/25 splits of the 5% and 10% samples.
(tr_mod05, te_mod05) = train_test_grids(mod11=mod_05)
(tr_mod10, te_mod10) = train_test_grids(mod11=mod_10)



Xtrain ********************
-1    1245594
 1      12552
Name: label, dtype: int64
Xtest ********************
-1    417061
 1      2322
Name: label, dtype: int64
Xtrain ********************
-1    2489572
 1      26721
Name: label, dtype: int64
Xtest ********************
-1    814638
 1     24127
Name: label, dtype: int64


In [42]:
# 20% sample: draw it, split it, and export both parts as PETSc binary files.
mod_20 = random_grid(mod, percent=0.20)
(tr_mod20, te_mod20) = train_test_grids(mod11=mod_20)

path = 'C:\\Users\\lql\\Desktop\\rs_petsc\\ca_data\\permon_data\\'

name = path + 'ca_modis20_train.bin'
df_to_petsc(df=tr_mod20, name=name)

name = path + 'ca_modis20_test.bin'
df_to_petsc(df=te_mod20, name=name)


1537312 8247428
(6710116, 208)
-1    6662542
 1      47574
Name: label, dtype: int64
Xtrain ********************
-1    4995136
 1      37451
Name: label, dtype: int64
Xtest ********************
-1    1667406
 1      10123
Name: label, dtype: int64
(<5032587x207 sparse matrix of type '<class 'numpy.float32'>'
	with 1041743833 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))
(<1677529x207 sparse matrix of type '<class 'numpy.float32'>'
	with 347247272 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))


# Convert to PETSc

In [37]:
import sys
import numpy as np

sys.path.insert(0, 'C:/Users/lql/Desktop/rs_petsc/repos/petsc/lib/petsc/bin')
import PetscBinaryIO

In [38]:
from scipy import sparse
from sklearn.preprocessing import MinMaxScaler

In [39]:
def df_to_petsc(df, name, scaler=None):
    """Min-max scale the features of ``df`` and write them with PetscBinaryIO.

    The ``label`` column becomes the vector, every other column becomes a
    CSR feature matrix, and the ``(matrix, vector)`` pair is printed and then
    written to ``name`` through ``PetscBinaryIO.writeBinaryFile``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``label`` column plus numeric feature columns.
    name : str
        Output file path.
    scaler : fitted scaler, optional
        Object exposing ``transform``. Pass the scaler fitted on the
        training features when exporting the matching test set so both files
        share one scale. When omitted, a new ``MinMaxScaler`` is fitted on
        ``df`` itself (the previous behavior), which scales each file
        independently.
    """
    labels = df['label'].values
    # drop() returns a new frame, so the caller's DataFrame keeps its label column.
    features = df.drop(columns='label').values

    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(features)
    features = scaler.transform(features)

    sparse_features = sparse.csr_matrix(features)
    # Reinterpret the label array as a PetscBinaryIO.Vec without copying.
    label_vec = labels.view(PetscBinaryIO.Vec)
    permon_data = (sparse_features, label_vec)
    print (permon_data)

    io = PetscBinaryIO.PetscBinaryIO()
    io.writeBinaryFile(name, permon_data)

In [40]:
# Export the 5% train/test grids as PETSc binary files.
path = 'C:\\Users\\lql\\Desktop\\rs_petsc\\ca_data\\permon_data\\'

name = path + 'ca_modis05_train.bin'
df_to_petsc(df=tr_mod05, name=name)

name = path + 'ca_modis05_test.bin'
df_to_petsc(df=te_mod05, name=name)


(<1258146x207 sparse matrix of type '<class 'numpy.float32'>'
	with 260435137 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))
(<419383x207 sparse matrix of type '<class 'numpy.float32'>'
	with 86811805 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))


In [41]:
# Export the 10% train/test grids; `path` must already be set by an earlier cell.
name = path + 'ca_modis10_train.bin'
df_to_petsc(df=tr_mod10, name=name)

name = path + 'ca_modis10_test.bin'
df_to_petsc(df=te_mod10, name=name)

(<2516293x207 sparse matrix of type '<class 'numpy.float32'>'
	with 520871486 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))
(<838765x207 sparse matrix of type '<class 'numpy.float32'>'
	with 173623216 stored elements in Compressed Sparse Row format>, Vec([-1, -1, -1, ..., -1, -1, -1]))


# XGBoost

In [30]:
# test xgboost 
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

def xgboost_pipeline(mod11, shuffle):
    """Train a default XGBClassifier on ``mod11`` and print a classification report.

    Parameters
    ----------
    mod11 : pandas.DataFrame
        Table with a ``label`` column plus feature columns.
    shuffle : bool
        Forwarded to ``train_test_split``. ``False`` keeps row order, so the
        last 33% of rows form the test set.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    y = mod11['label'].values
    # drop() returns a new frame; no full copy of the input is needed.
    X = mod11.drop(columns='label')

    Xtrain, Xtest, y_train, y_test = train_test_split(
        X, y, test_size=0.33, shuffle=shuffle, random_state=42
    )

    # Current XGBoost releases require class labels 0..k-1 and reject {-1, 1}.
    # Encode the training labels and keep `classes` to decode predictions.
    classes, y_train_encoded = np.unique(y_train, return_inverse=True)

    # fit model on training data
    model = XGBClassifier()
    model.fit(Xtrain, y_train_encoded)

    # Map predicted class indices back to the original label values.
    predicted_index = np.asarray(model.predict(Xtest)).astype(int)
    ypred = classes[predicted_index]
    print (classification_report(y_test, ypred))
    
    

In [31]:
# 10% sample, shuffled split.
xgboost_pipeline(mod_10, shuffle=True)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00   1090404
           1       0.96      0.76      0.85     16766

    accuracy                           1.00   1107170
   macro avg       0.98      0.88      0.92   1107170
weighted avg       1.00      1.00      1.00   1107170



In [43]:
# 5% sample, ordered (unshuffled) split.
xgboost_pipeline(mod_05, shuffle=False)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00    550717
           1       0.67      0.21      0.32      2868

    accuracy                           1.00    553585
   macro avg       0.83      0.60      0.66    553585
weighted avg       0.99      1.00      0.99    553585



In [44]:
# 5% sample, shuffled split.
xgboost_pipeline(mod_05, shuffle=True)

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00    548701
           1       0.96      0.74      0.84      4884

    accuracy                           1.00    553585
   macro avg       0.98      0.87      0.92    553585
weighted avg       1.00      1.00      1.00    553585



# Viz Code!