# Read in computed data for new polygons

The previously optimized MLP classifier will predict MIVI levels for these polygons. Then, after reading the newly predicted target variable into ArcGIS Pro, I will create a map color-coded by level of MIVI invasion.

In [143]:
#import pandas and numpy for future usage

import pandas as pd
import numpy as np

In [144]:
# Load every table exported from ArcGIS Pro for the new polygons.
# The paths are listed in the same order as the names they are unpacked into.
mivi_csv_paths = (
    "MIVI_tables/MIVI_EAF.csv",
    "MIVI_tables/MIVI_Dist_farm_RB_FPft.csv",
    "MIVI_tables/MIVI_avg_DistFP.csv",
    "MIVI_tables/MIVI_1937.csv",
    "MIVI_tables/MIVI_1957.csv",
    "MIVI_tables/MIVI_1974.csv",
    "MIVI_tables/MIVI_evergreen.csv",
)

(
    EAF,
    dist_from_RB_farm,
    dist_FP,
    conversion_1937_1957,
    conversion_1957_1974,
    conversion_1974_1996,
    evergreen,
) = (pd.read_csv(csv_path) for csv_path in mivi_csv_paths)

In [145]:
# Preview the first rows of the table read from MIVI_Dist_farm_RB_FPft.csv.
dist_from_RB_farm.head()

Unnamed: 0,OID_,Join_Count,Dist_FPft,TARGET_FID,Join_Count_1,Dist_RB,TARGET_FID_1,Join_Count_12,Dist_Farmland,TARGET_FID_12,Id,Id_1,Shape_Leng,Id_12,Id_12_13,Shape_Length_1,Shape_Length,Shape_Area
0,1,1,0.0,0,1,0.0,1,1,137.437725,1,0,0.0,1544.024081,0,0,2993.084765,441.665673,56728.850531
1,2,1,0.0,1,1,173.033638,2,1,165.194898,2,0,0.0,1544.024081,0,0,2993.084765,514.520701,56728.850531
2,3,1,0.0,2,1,343.606191,3,1,309.282734,3,0,0.0,1544.024081,0,0,2993.084765,653.003349,56728.850531
3,4,1,0.0,3,1,685.86854,4,1,331.353408,4,0,0.0,2556.666935,0,0,2993.084765,602.603455,251285.730617
4,5,1,0.0,4,1,897.800429,5,1,144.466065,5,0,0.0,2556.666935,0,0,2993.084765,538.862241,251285.730617


In [146]:
# Start the feature table for the new polygons. Its first (and, for now, only)
# column is MIVI_ID, taken from the ArcGIS Pro-generated TARGET_FID; the cells
# that follow append the model features to it.
Xdf = pd.DataFrame({'MIVI_ID': dist_from_RB_farm['TARGET_FID']})

In [147]:
# Collect the per-polygon floodplain distance and EAF from the exported tables.
#
# NOTE(review): rows are read by label 0..n-1, which assumes dist_FP and EAF are
# in the same row order as Xdf -- confirm against the ArcGIS exports.

# Multiplying by 3.28 converts meters to feet (same factor and direction as the
# training-data cell). Presumably MEAN_Dist_stream is exported in meters -- TODO confirm.
METERS_TO_FEET = 3.28

polygon_rows = range(Xdf.shape[0])

# mean distance per polygon, scaled by the conversion factor
dist_floodplain = [
    dist_FP.at[row, 'MEAN_Dist_stream'] * METERS_TO_FEET
    for row in polygon_rows
]

# mean EAF per polygon (no conversion applied)
EAF_list = [EAF.at[row, 'MEAN_EAF'] for row in polygon_rows]

In [148]:
# Override Dist_Floodplain for the PW buffer target IDs:
#   16 <= MIVI_ID <= 36  -> 300
#   MIVI_ID <= 15        -> 0
# All other polygons keep the value computed from the exported table.
mivi_ids = Xdf['MIVI_ID'].to_numpy()
dist_floodplain = np.asarray(dist_floodplain, dtype=float)
dist_floodplain[(mivi_ids >= 16) & (mivi_ids <= 36)] = 300
dist_floodplain[mivi_ids <= 15] = 0

# Cap the riparian buffer distance at 300: any value >= 300 becomes 300 and
# smaller values are kept as exported. The column is selected by name
# ('Dist_RB') instead of by position so a change in the export's column order
# cannot silently pick up a different field.
dist_RB_list = dist_from_RB_farm['Dist_RB'].clip(upper=300).to_numpy()

# Append riparian-buffer distance, floodplain distance and EAF to the feature table.
Xdf = Xdf.assign(
    Dist_RB=dist_RB_list,
    Dist_Floodplain=dist_floodplain,
    EAF=EAF_list,
)
Xdf.head()

Unnamed: 0,MIVI_ID,Dist_RB,Dist_Floodplain,EAF
0,0,0.0,0.0,5.666667
1,1,173.033638,0.0,4.285714
2,2,300.0,0.0,6.787879
3,3,300.0,0.0,15.319149
4,4,300.0,0.0,26.518519


In [149]:
# Add the converted-forest overlay percentages for each conversion period.
#
# Each conversion table is matched to the polygons on OBJECTID_1, which is
# compared against MIVI_ID + 1. A polygon with no row in a table did not overlap
# any converted forest of that period, so its percentage is 0.
object_ids = Xdf['MIVI_ID'] + 1

conversion_tables = {
    'conversion_1937_1957': conversion_1937_1957,
    'conversion_1957_1974': conversion_1957_1974,
    'conversion_1974_1996': conversion_1974_1996,
}

conversion_pct = {}
for column, table in conversion_tables.items():
    # PERCENTAGE indexed by OBJECTID_1 so it can be used as a lookup
    pct_by_id = table.set_index('OBJECTID_1')['PERCENTAGE']
    if not pct_by_id.index.is_unique:
        # one row per polygon is required; a repeated ID would make the match ambiguous
        raise ValueError(
            f"{column}: OBJECTID_1 contains duplicate values; expected one row per polygon"
        )

    has_overlay = object_ids.isin(pct_by_id.index)
    conversion_pct[column] = (
        object_ids.map(pct_by_id)      # percentage for polygons present in the table
        .where(has_overlay, 0.0)       # 0 percent for polygons absent from the table
        .astype(float)
        .to_numpy()
    )

# one new column per conversion period, in the order listed above
Xdf = Xdf.assign(**conversion_pct)

In [150]:
# Copy the farmland distance from the exported join table into the feature table.
farmland_distance = dist_from_RB_farm['Dist_Farmland']
Xdf = Xdf.assign(Dist_Farmland=farmland_distance)

In [151]:
# Add the evergreen-forest overlay percentage.
#
# The evergreen table is matched on OBJECTID_1, compared against MIVI_ID + 1.
# Polygons with no row in the table get 0 percent.
object_ids = Xdf['MIVI_ID'] + 1

evergreen_pct_by_id = evergreen.set_index('OBJECTID_1')['PERCENTAGE']
if not evergreen_pct_by_id.index.is_unique:
    # one row per polygon is required; a repeated ID would make the match ambiguous
    raise ValueError("evergreen: OBJECTID_1 contains duplicate values; expected one row per polygon")

eg = (
    object_ids.map(evergreen_pct_by_id)                         # percentage where the polygon is listed
    .where(object_ids.isin(evergreen_pct_by_id.index), 0.0)     # 0 percent where it is not
    .astype(float)
    .to_numpy()
)
Xdf = Xdf.assign(evergreen_forest=eg)

# Read in supervised polygons for training

In [152]:
# Load the training-side exports: EAF, distance from the riparian buffer, and
# the forest age/type overlays.
#
# NOTE(review): EAF, conversion_* and evergreen rebind names that the
# new-polygon section above also uses, so those earlier cells cannot be re-run
# after this one without first reloading their own CSVs.
path = 'Beyond450_EAF_avg_ExportTable.csv'

training_csv_paths = (
    path,
    "Compartment_avgEAF.csv",
    "Compartment_avgDistance_RB.csv",
    "conversion_1937_1957.csv",
    "conversion_1957_1974.csv",
    "conversion_1974_1996.csv",
    "evergreen.csv",
)

(
    Beyond450_EAF,
    EAF,
    dist_from_RB,
    conversion_1937_1957,
    conversion_1957_1974,
    conversion_1974_1996,
    evergreen,
) = (pd.read_csv(csv_path) for csv_path in training_csv_paths)

In [153]:
# Load the supervised invasion table; the features computed from the other
# exports are appended to the frame derived from it.
invasion_csv_path = "Compartment_Inva_ExportTable.csv"
invasion = pd.read_csv(invasion_csv_path)

In [154]:
# Keep only the columns needed for ML.
# Columns are selected by name (these are the fields at positions 2, 5, 10, 20
# and 21 of the export) so a reordered export cannot silently select other fields.
ML_COLUMNS = [
    'Distance_Riparian_Buffer',
    'Distance_Farmland',
    'HID',
    'overall_in',
    'individual',
]

# .copy() makes df an independent frame: later cells write into it, and writing
# into a slice of `invasion` triggers pandas' SettingWithCopyWarning.
df = invasion[ML_COLUMNS].copy()
df.head()

Unnamed: 0,Distance_Riparian_Buffer,Distance_Farmland,HID,overall_in,individual
0,136.620988,0.0,252.0,25-50%,"MIVI-3,LOJA-2,ELUM-1,ROMU-1,LISE-1,CEOR-2,BETH..."
1,-1.0,0.0,253.0,75-100%,"MIVI-4,CEOR-4,LISE-1,LOJA-1,RUPH-1,ALJU-1"
2,137.086561,0.0,254.0,50-75%,"MIVI-4,CEOR-3,LISE-1,RUPH-1,ELUM-1,LOJA-1,ROMU-1"
3,137.08653,0.0,255.0,50-75%,"MIVI-4,CEOR-3,RUPH-3,LOJA-1,ALJU-1,CIVU-2,LISE-2"
4,137.1077,0.0,256.0,50-75%,"MIVI-3,LISE-2,CEOR-3,RUPH-2,PATO-1,LOJA-2,RUCR..."


In [155]:
def _first_value_for_hid(table, hid, column):
    """Return `column` from the first row of `table` whose HID equals `hid`.

    Raises IndexError when no row of `table` has that HID.
    """
    matching_labels = table.index[table['HID'] == hid]
    return table.at[matching_labels[0], column]


# compartment ID (HID) of every training row; HID is the third column of df
compartment_ids = [df.iloc[row, 2] for row in range(df.shape[0])]

# mean distance per compartment, multiplied by 3.28 (meters to feet)
dist_floodplain = [
    _first_value_for_hid(dist_from_RB, hid, 'MEAN_Distance_From_RB') * 3.28
    for hid in compartment_ids
]

# mean EAF per compartment, looked up the same way
EAF_list = [
    _first_value_for_hid(EAF, hid, 'MEAN_EAF')
    for hid in compartment_ids
]

In [156]:
# Override Dist_Floodplain for the PW buffer HIDs:
#   80 <= HID <= 246  -> 300
#   HID <= 79         -> 0
# All other compartments keep the looked-up value.
hid_values = df['HID'].to_numpy()
dist_floodplain = np.asarray(dist_floodplain, dtype=float)
dist_floodplain[(hid_values >= 80) & (hid_values <= 246)] = 300
dist_floodplain[hid_values <= 79] = 0

# A riparian buffer distance of -1 is replaced by 300 (the export marks
# compartments not within 450 ft of the PW buffer with -1).
riparian_distance = df['Distance_Riparian_Buffer']

# Build a new frame rather than writing into df element by element: this avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
df = df.assign(
    Distance_Riparian_Buffer=riparian_distance.mask(riparian_distance == -1, 300),
    EAF=EAF_list,
    Dist_Floodplain=dist_floodplain,
)
df.head()

Unnamed: 0,Distance_Riparian_Buffer,Distance_Farmland,HID,overall_in,individual,EAF,Dist_Floodplain
0,136.620988,0.0,252.0,25-50%,"MIVI-3,LOJA-2,ELUM-1,ROMU-1,LISE-1,CEOR-2,BETH...",38.745098,814.578296
1,300.0,0.0,253.0,75-100%,"MIVI-4,CEOR-4,LISE-1,LOJA-1,RUPH-1,ALJU-1",58.2,1281.677749
2,137.086561,0.0,254.0,50-75%,"MIVI-4,CEOR-3,LISE-1,RUPH-1,ELUM-1,LOJA-1,ROMU-1",42.064516,898.304107
3,137.08653,0.0,255.0,50-75%,"MIVI-4,CEOR-3,RUPH-3,LOJA-1,ALJU-1,CIVU-2,LISE-2",36.842105,646.795242
4,137.1077,0.0,256.0,50-75%,"MIVI-3,LISE-2,CEOR-3,RUPH-2,PATO-1,LOJA-2,RUCR...",35.789474,578.104646


In [157]:
# Add the converted-forest overlay percentages for each conversion period.
#
# Each conversion table is matched to the training compartments on HID.
# A compartment with no row in a table did not overlap any converted forest of
# that period, so its percentage is 0.
compartment_ids = df['HID']

conversion_tables = {
    'conversion_1937_1957': conversion_1937_1957,
    'conversion_1957_1974': conversion_1957_1974,
    'conversion_1974_1996': conversion_1974_1996,
}

conversion_pct = {}
for column, table in conversion_tables.items():
    # PERCENTAGE indexed by HID so it can be used as a lookup
    pct_by_hid = table.set_index('HID')['PERCENTAGE']
    if not pct_by_hid.index.is_unique:
        # one row per compartment is required; a repeated HID would make the match ambiguous
        raise ValueError(
            f"{column}: HID contains duplicate values; expected one row per compartment"
        )

    has_overlay = compartment_ids.isin(pct_by_hid.index)
    conversion_pct[column] = (
        compartment_ids.map(pct_by_hid)   # percentage for compartments present in the table
        .where(has_overlay, 0.0)          # 0 percent for compartments absent from the table
        .astype(float)
        .to_numpy()
    )

# one new column per conversion period, in the order listed above
df = df.assign(**conversion_pct)

In [158]:
# Add the evergreen-forest overlay percentage.
#
# The evergreen table is matched to the training compartments on HID.
# Compartments with no row in the table get 0 percent.
compartment_ids = df['HID']

evergreen_pct_by_hid = evergreen.set_index('HID')['PERCENTAGE']
if not evergreen_pct_by_hid.index.is_unique:
    # one row per compartment is required; a repeated HID would make the match ambiguous
    raise ValueError("evergreen: HID contains duplicate values; expected one row per compartment")

eg = (
    compartment_ids.map(evergreen_pct_by_hid)                          # percentage where the HID is listed
    .where(compartment_ids.isin(evergreen_pct_by_hid.index), 0.0)      # 0 percent where it is not
    .astype(float)
    .to_numpy()
)
df = df.assign(evergreen_forest=eg)

In [159]:
# Lookup from percentage-range label to numeric cover class value.
percentage = {'5-25%': 2, '25-50%': 3, '50-75%': 4, '75-100%': 5}

# Translate every 'overall_in' label; a label that is not a key of the lookup
# yields None.
numeric_values = [percentage.get(cover_range) for cover_range in df['overall_in']]

# store the numeric cover class next to the original label
df['overall_numeric'] = numeric_values

In [160]:
# Preview the training frame with the feature and target columns added so far.
df.head()

Unnamed: 0,Distance_Riparian_Buffer,Distance_Farmland,HID,overall_in,individual,EAF,Dist_Floodplain,conversion_1937_1957,conversion_1957_1974,conversion_1974_1996,evergreen_forest,overall_numeric
0,136.620988,0.0,252.0,25-50%,"MIVI-3,LOJA-2,ELUM-1,ROMU-1,LISE-1,CEOR-2,BETH...",38.745098,814.578296,2.646251,97.352746,0.0,93.098833,3
1,300.0,0.0,253.0,75-100%,"MIVI-4,CEOR-4,LISE-1,LOJA-1,RUPH-1,ALJU-1",58.2,1281.677749,9.34323,91.347678,0.0,6.842801,5
2,137.086561,0.0,254.0,50-75%,"MIVI-4,CEOR-3,LISE-1,RUPH-1,ELUM-1,LOJA-1,ROMU-1",42.064516,898.304107,23.013302,77.373018,0.0,61.135065,4
3,137.08653,0.0,255.0,50-75%,"MIVI-4,CEOR-3,RUPH-3,LOJA-1,ALJU-1,CIVU-2,LISE-2",36.842105,646.795242,0.0,74.681526,0.0,0.0,4
4,137.1077,0.0,256.0,50-75%,"MIVI-3,LISE-2,CEOR-3,RUPH-2,PATO-1,LOJA-2,RUCR...",35.789474,578.104646,0.0,99.999908,0.0,0.0,4


# Train previously determined optimized MLP Classifier

In [161]:
# Classifier, train/test split helper, and the local helper module.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split as tts
from intro_Data_4_3 import *  # presumably the source of compare_classes used below -- verify

# Select the training rows with one combined mask:
#   * keep only rows whose riparian-buffer distance equals 300
#   * of those, leave out rows with Dist_Floodplain > 1200 (far-from-floodplain outliers)
# NOTE(review): after this filter Distance_Riparian_Buffer is the constant 300
# for every training row, while the new polygons have Dist_RB values below 300
# -- confirm that keeping (rather than dropping) the == 300 rows is intended.
beyond_riparian_buffer = df["Distance_Riparian_Buffer"] == 300
floodplain_outlier = df["Dist_Floodplain"] > 1200
tempdf = df[beyond_riparian_buffer & ~floodplain_outlier]

# Feature matrix (label, free-text and ID columns removed) and target vector.
non_feature_columns = ['individual', 'overall_in', 'overall_numeric', 'HID']
X = tempdf.drop(columns=non_feature_columns).to_numpy()
y = tempdf['overall_numeric'].to_numpy()

# hold out 40% of the rows for testing
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.4, random_state=146)

# Fit the MLP with the previously chosen hyperparameters.
mlp = MLPClassifier(
    hidden_layer_sizes=10,
    activation='logistic',
    alpha=0.0004,
    max_iter=10000,
    random_state=146,
)
mlp.fit(X_train, y_train)

# Predictions and mean accuracy on the held-out rows.
y_pred = mlp.predict(X_test)
accuracy = mlp.score(X_test, y_test)
print("MLP Classifier Accuracy:", accuracy)

# compare_classes result for predicted vs. actual classes, shown as the cell output
matrix = compare_classes(y_pred, y_test)
matrix

MLP Classifier Accuracy: 0.7142857142857143
Test accuracy = 0.71


(Predicted  2  3  4  5
 Actual               
 2          2  0  0  0
 3          2  5  1  0
 4          0  0  2  1
 5          0  0  0  1,
 0.7142857142857143)

# Now, determine Cover Class for new data (polygons not surveyed in field)

### first rearrange column order so dfs match before converting to arrays

In [162]:
# Preview the training feature columns (label, free-text and ID columns dropped) to check their order.
tempdf.drop(columns=['individual', 'overall_in', 'overall_numeric', 'HID']).head()

Unnamed: 0,Distance_Riparian_Buffer,Distance_Farmland,EAF,Dist_Floodplain,conversion_1937_1957,conversion_1957_1974,conversion_1974_1996,evergreen_forest
6,300.0,0.0,32.571429,818.178278,0.0,0.0,99.999877,0.0
8,300.0,173.037653,64.142857,675.009333,0.0,0.0,0.0,0.0
9,300.0,361.244783,45.309091,698.781851,0.0,0.0,0.0,0.0
10,300.0,522.854292,43.714286,580.959644,0.0,0.0,0.0,4.78292
11,300.0,331.102949,61.73913,538.781197,0.0,0.0,0.0,78.577919


In [163]:
# Preview the new-polygon feature columns (MIVI_ID dropped) in their current order.
Xdf.drop(columns = "MIVI_ID").head()

Unnamed: 0,Dist_RB,Dist_Floodplain,EAF,conversion_1937_1957,conversion_1957_1974,conversion_1974_1996,Dist_Farmland,evergreen_forest
0,0.0,0.0,5.666667,0.0,0.0,13.725485,137.437725,0.0
1,173.033638,0.0,4.285714,0.0,0.0,0.0,165.194898,6.312973
2,300.0,0.0,6.787879,0.0,0.0,0.0,309.282734,0.0
3,300.0,0.0,15.319149,0.0,0.0,0.0,331.353408,0.0
4,300.0,0.0,26.518519,0.0,0.0,0.0,144.466065,0.0


In [164]:
# Feature columns of the training frame, in the order the classifier was fit on.
training_columns = tempdf.drop(
    columns=['individual', 'overall_in', 'overall_numeric', 'HID']
).columns.tolist()

# Xdf uses shorter names for two of the features. Translate them by name rather
# than by list position, so a change in column order cannot mislabel a feature.
training_to_new_names = {
    'Distance_Riparian_Buffer': 'Dist_RB',
    'Distance_Farmland': 'Dist_Farmland',
}
column_names = [training_to_new_names.get(name, name) for name in training_columns]

# Preview Xdf with its columns reordered to match the training feature order.
Xdf[column_names].head()

Unnamed: 0,Dist_RB,Dist_Farmland,EAF,Dist_Floodplain,conversion_1937_1957,conversion_1957_1974,conversion_1974_1996,evergreen_forest
0,0.0,137.437725,5.666667,0.0,0.0,0.0,13.725485,0.0
1,173.033638,165.194898,4.285714,0.0,0.0,0.0,0.0,6.312973
2,300.0,309.282734,6.787879,0.0,0.0,0.0,0.0,0.0
3,300.0,331.353408,15.319149,0.0,0.0,0.0,0.0,0.0
4,300.0,144.466065,26.518519,0.0,0.0,0.0,0.0,0.0


In [165]:
# Feature matrix for the new polygons, with columns in the training feature order.
X = Xdf[column_names].to_numpy()

# Cover Class predicted by the currently fitted MLP classifier; shown as the cell output.
y_pred = mlp.predict(X)
y_pred

array([3, 4, 3, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4,
       4, 3])

# Train MLP Classifier on entire dataset (still without outliers)

In [166]:
# Feature matrix and target built from every row of the filtered training frame
# (no train/test split this time).
training_features = tempdf.drop(columns=['individual', 'overall_in', 'overall_numeric', 'HID'])
X = training_features.to_numpy()
y = tempdf['overall_numeric'].to_numpy()

# Re-create the MLP with the same optimized parameters and fit it on all rows.
mlp = MLPClassifier(
    hidden_layer_sizes=10,
    activation='logistic',
    alpha=0.0004,
    max_iter=10000,
    random_state=146,
)
mlp.fit(X, y)

In [167]:
# Feature matrix for the new polygons, with columns in the training feature order.
X = Xdf[column_names].to_numpy()

# Target values predicted by the classifier that was fit on the full training set.
y_pred_full = mlp.predict(X)

In [168]:
# Display the predictions made for the new polygons by the classifier fit on the full training set.
y_pred_full

array([5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 2, 3, 3, 3, 2, 3, 2,
       2, 3])

In [169]:
# Attach the predicted target values to the new-polygon table as column `y_pred`.
prediction_column = {'y_pred': y_pred_full}
Xdf = Xdf.assign(**prediction_column)

In [172]:
# Write the new-polygon table to CSV (row index omitted) so it can be read into ArcGIS Pro.
output_csv_path = 'target_polygons.csv'
Xdf.to_csv(output_csv_path, index=False)

In [171]:
# Display the new-polygon table.
Xdf

Unnamed: 0,MIVI_ID,Dist_RB,Dist_Floodplain,EAF,conversion_1937_1957,conversion_1957_1974,conversion_1974_1996,Dist_Farmland,evergreen_forest,y_pred
0,0,0.0,0.0,5.666667,0.0,0.0,13.725485,137.437725,0.0,5
1,1,173.033638,0.0,4.285714,0.0,0.0,0.0,165.194898,6.312973,4
2,2,300.0,0.0,6.787879,0.0,0.0,0.0,309.282734,0.0,4
3,3,300.0,0.0,15.319149,0.0,0.0,0.0,331.353408,0.0,4
4,4,300.0,0.0,26.518519,0.0,0.0,0.0,144.466065,0.0,4
5,5,300.0,0.0,-3.2,0.0,0.0,0.0,0.0,0.0,4
6,6,300.0,0.0,10.533333,0.0,0.0,0.0,194.395897,0.0,4
7,7,300.0,0.0,15.030303,0.013475,0.0,0.0,309.968475,0.0,4
8,8,300.0,0.0,5.066667,19.835422,32.534576,0.0,374.123191,0.0,4
9,9,300.0,0.0,5.333333,4.87175,49.324795,0.0,263.364902,2.933787,4
