# Random forest algorithm model creator

In [1]:
#importing libraries

from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from osgeo import gdal, osr, ogr
np.random.seed(0) #setting random seed
import matplotlib.pyplot as plt
import pickle 
from sklearn.metrics import accuracy_score, cohen_kappa_score

**Processes starts from here**





In [2]:
#loading training data
#converting all training data into np array

#first, loading all training samples
#training data is given as tif files with 10 samples for each class

import glob
dataList=[]
folderPath='/content/drive/MyDrive/trainingData5'
fileList=sorted(glob.glob(folderPath+'/*.tif'))
for filePath in fileList:
  ds=gdal.Open(filePath)
  dataList.append(ds)
  ds=None



In [3]:
#converting each samples into np array
npDataList=[]
for i in dataList:
  tmpLst=[]
  for j in range(1,7):
    bnd=i.GetRasterBand(j).ReadAsArray()
    tmpLst.append(bnd)

  npDataList.append(np.stack(tmpLst,axis=-1))



In [None]:
print(len(npDataList)) #checking total no of samples

20


In [4]:
#converting training samples into dict with key is class name and value is the samples pixel values

classes=['blt','brn','veg','wtr','noData']
dataDict={}
n=0
cnt=0
for i in npDataList:
  shpAry=i.shape
  val=[]
  for j in range(0,shpAry[0]):
    for k in range(0,shpAry[1]):
      val.append(list(i[j,k,:]))


  dataDict.update({classes[n]:val})

  if (cnt+1)%5==0:
    n+=1

  cnt+=1

  
  


In [5]:
#assigning dictionary of samples into pandas dataframe for training

dfTrain=pd.DataFrame(columns=['label','blue','green','red','nir','swir1','swir2'])
pv=0
for p in range(0,len(dataDict)):
  clsName=list(dataDict.keys())[p]
  for q in range(0,len(dataDict[clsName])):
    itm=dataDict[clsName][q]
    data={'label':clsName,
      'blue':itm[0], 
      'green':itm[1], 
      'red':itm[2], 
      'nir':itm[3], 
      'swir1':itm[4],
      'swir2':itm[5]
      }
    pv+=1
    dfTrain.loc[pv]=data


In [6]:
dfTrain.head() #printing and checking

Unnamed: 0,label,blue,green,red,nir,swir1,swir2
1,blt,8776.0,9777.0,10219.0,15405.0,16678.0,13288.0
2,blt,9435.0,9927.0,11452.0,15856.0,20162.0,16180.0
3,blt,9435.0,10455.0,12133.0,16007.0,21233.0,17298.0
4,blt,9655.0,10530.0,12133.0,16007.0,20698.0,17018.0
5,blt,9507.0,10754.0,12201.0,16158.0,20788.0,17112.0


In [7]:
print(len(dfTrain.index))

922


In [8]:
#adding data into training dataframe for 0 values for identifying it as noData class

for s in range(1,20):
  r=len(dfTrain.index)+s
  data0={'label':classes[4],
      'blue':-999.99, 
      'green':-999.99, 
      'red':-999.99, 
      'nir':-999.99, 
      'swir1':-999.99 ,
      'swir2':-999.99
      }
  dfTrain.loc[r]=data0

In [9]:
print(len(dfTrain.index))

941


In [None]:
#dfTrain.dropna(inplace=True)

In [10]:
dfTrain['is_train']=np.random.uniform(0,1,len(dfTrain))<=.80  #dividing into train and test samples
dfTrain.head()

Unnamed: 0,label,blue,green,red,nir,swir1,swir2,is_train
1,blt,8776.0,9777.0,10219.0,15405.0,16678.0,13288.0,True
2,blt,9435.0,9927.0,11452.0,15856.0,20162.0,16180.0,True
3,blt,9435.0,10455.0,12133.0,16007.0,21233.0,17298.0,True
4,blt,9655.0,10530.0,12133.0,16007.0,20698.0,17018.0,True
5,blt,9507.0,10754.0,12201.0,16158.0,20788.0,17112.0,True


In [None]:
dfTrain.to_csv("/content/sample_data/dfTrain1.csv", sep='\t')

In [11]:
#creating dataframes with test rows and training rows
train,test=dfTrain[dfTrain['is_train']==True],dfTrain[dfTrain['is_train']==False]

#show the number of observations for the test and training dataframes
print('no: of obs in training data: ',len(train))
print('no: of obs in testing data: ',len(test))

no: of obs in training data:  751
no: of obs in testing data:  190


In [12]:
lab= pd.factorize(train["label"])[0]
print(lab)
lab1=pd.factorize(test["label"])[0]
features=dfTrain.columns[1:7]  #creating feature vector for training from dataframe columns
features

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 

Index(['blue', 'green', 'red', 'nir', 'swir1', 'swir2'], dtype='object')

In [13]:
#creating a random forest classifier

clf=RandomForestClassifier(n_jobs=2,n_estimators=100, random_state=42)
#Training the classifier
clf.fit(train[features],lab)

In [14]:
testPred=clf.predict(test[features])  #predicting the test features
testPred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4])

In [15]:
#checking accuracy of classifier

accuracy= accuracy_score(lab1,testPred)
kappa = cohen_kappa_score(lab1, testPred)
print(accuracy)
print(kappa)

1.0
1.0


In [16]:
clsAry=np.array(classes)
print(clsAry)

['blt' 'brn' 'veg' 'wtr' 'noData']


In [17]:
preds=clsAry[clf.predict(test[features])]
preds

array(['blt', 'blt', 'blt', 'blt', 'blt', 'blt', 'blt', 'blt', 'brn',
       'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn',
       'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn',
       'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn',
       'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn',
       'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn', 'brn',
       'brn', 'brn', 'brn', 'brn', 'brn', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg', 'veg',
       'veg', 'veg',

In [21]:
crstb=pd.crosstab(test['label'],preds,rownames=['actual labels'],colnames=['predicted labels'])  #creating confusion matrix out of test data
crstb.to_csv("/content/sample_data/confMat.csv", sep='\t')
crstb.head()

predicted labels,blt,brn,noData,veg,wtr
actual labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
blt,8,0,0,0,0
brn,0,51,0,0,0
noData,0,0,5,0,0
veg,0,0,0,116,0
wtr,0,0,0,0,10


In [None]:
#saving the model

modelName="/content/drive/MyDrive/BigGeoProj/rfekm02.sav"
pickle.dump(clf, open(modelName, 'wb'))

In [None]:

test=None
train=None
dfTrain=None

In [None]:
clf=None
