Connected to Python 3.10.8

This notebook measures the impact of individual HIT metrics on existing fish populations in the 10 Texas ecoregions, using decision tree regressors.

In [1]:
import pandas as pd
import pyreadr

# Decision Tree Regressor
import matplotlib.pyplot as plt
import matplotlib
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from dmba import plotDecisionTree
from sklearn import tree
import re
import sklearn


Read the R data file (.RData) and convert it to a pandas DataFrame

In [3]:
# Load every object stored in the RData file; the result is indexed by
# R object name in the next cell.
rdata_path = "pre_tree_rerun.RData"
Combined_data = pyreadr.read_r(rdata_path)

In [4]:
# Take the 'ivern' table and discard the 'COMID' and 'year' columns,
# which are not used in the per-region regressions below.
df = Combined_data['ivern'].drop(columns=['COMID', 'year'])

Split the data by ecoregion into a dictionary of DataFrames (171 HIT metrics)

In [6]:
# Map each distinct value of the 'region' column to the sub-frame of rows
# that carry it.
dfs_fixed = {region: region_rows for region, region_rows in df.groupby('region')}
print(dfs_fixed)

{1:             ma1         ma2       ma3       ma4       ma5       ma6       ma7  \
0     -0.007841    0.113525  0.053372  0.660927  0.128379  0.134041  0.063539   
11    24.666238  441.854720 -0.351674 -0.665380 -0.277906 -0.393587 -0.332904   
12     0.429532    1.225859  0.188604  0.021217  0.148082  1.716948  1.427959   
13    -0.193438   -0.539418 -0.120411 -0.125013  0.822812  3.739632  1.058168   
14    -0.127781    0.627115  0.003855  0.053811 -0.020707  0.184604  0.089347   
...         ...         ...       ...       ...       ...       ...       ...   
6879   1.949575   12.418405 -0.430215 -0.109594 -0.460186 -0.431465 -0.232084   
6880   0.662278    2.243127 -0.166253 -0.089890 -0.309083 -0.500776 -0.394314   
6881   2.075829   16.790351 -0.432582 -0.253285 -0.472198 -0.389354 -0.149445   
6882   0.399466    1.215992 -0.097992 -0.118448 -0.160007 -0.013107  0.043720   
6883  -0.312176    0.341183  0.021121  0.010586 -0.031341 -0.054829 -0.032980   

           ma8       ma

In [5]:

################################################
### Define Functions Prior to any processing ###
################################################

# Define decision tree to df
def tree_to_df(reg_tree, feature_names, full_path=False):
    """Flatten a fitted decision tree into a table of split rules, one row per leaf.

    Parameters
    ----------
    reg_tree : object
        Fitted tree estimator exposing ``tree_`` with the array attributes
        ``feature``, ``threshold``, ``children_left``, ``children_right``
        and ``value``.
    feature_names : sequence of str
        Names indexed by the feature numbers stored in ``tree_.feature``.
    full_path : bool, default False
        * ``False`` (legacy layout): the tree is walked depth-first and a new
          row is started after every leaf, so only the first row contains a
          complete root-to-leaf path. Every later row contains just the rules
          added since the previous leaf (it starts with the ``>`` rule of the
          ancestor whose right branch is being entered).
        * ``True``: every row contains the complete root-to-leaf path of its
          leaf, so each row can be read on its own.

    Returns
    -------
    pandas.DataFrame
        One row per leaf in depth-first (left before right) order. The
        integer-labelled columns hold rule strings such as ``"ma1 <= 0.5"``
        (missing where a row has fewer rules than the longest one); the
        ``'Return'`` column holds ``"return <v>"`` where ``<v>`` is
        ``tree_.value[leaf][0][0]``, i.e. only the first entry of the leaf's
        value array is reported.
    """
    tree_ = reg_tree.tree_

    def is_split(node):
        # A leaf stores the same sentinel id for both children, so differing
        # child ids identify a split node. This uses only the public tree
        # arrays instead of the private sklearn.tree._tree constants.
        return tree_.children_left[node] != tree_.children_right[node]

    # `rules` always ends with the row currently being filled; `vals` gets one
    # entry per leaf, in the same order as the completed rows.
    rules = [[]]
    vals = []

    def recurse(node, path):
        if is_split(node):
            name = feature_names[tree_.feature[node]]
            threshold = str(tree_.threshold[node])
            left_rule = name + " <= " + threshold
            right_rule = name + " > " + threshold

            # Legacy layout appends rules to the open row as they are met;
            # full-path layout defers writing until the leaf is reached.
            if not full_path:
                rules[-1].append(left_rule)
            recurse(tree_.children_left[node], path + [left_rule])

            if not full_path:
                rules[-1].append(right_rule)
            recurse(tree_.children_right[node], path + [right_rule])
        else:
            if full_path:
                rules[-1].extend(path)
            label = tree_.value[node]
            vals.append("return " + str(label[0][0]))
            # Close the current row and open a fresh one for the next leaf.
            rules.append([])

    recurse(0, [])

    # The row opened after the last leaf is empty; dropna removes it.
    df_tree = pd.DataFrame(rules).dropna(how='all')
    df_tree['Return'] = pd.Series(vals)
    return df_tree


# Define object to Erase Values in Decision Tree Regressor
def replace_text(obj):
    """Strip the "value" section from a matplotlib Annotation's text.

    Objects whose exact type is not ``matplotlib.text.Annotation`` are
    returned untouched. For an Annotation, everything from a newline followed
    by ``value`` up to the last ``]]`` (with no ``$`` in between) is removed
    from its text in place. The same object is returned in both cases.
    """
    if type(obj) != matplotlib.text.Annotation:
        return obj

    stripped = re.sub("\nvalue[^$]*]]", "", obj.get_text())
    obj.set_text(stripped)
    return obj

# Create a Dictionary Class
class my_dictionary(dict):
    """A ``dict`` with an ``add(key, value)`` convenience method."""

    def __init__(self, *args, **kwargs):
        """Create the dictionary, accepting the same arguments as ``dict``.

        The previous ``self = dict()`` only rebound the local name and had no
        effect; forwarding to ``dict.__init__`` keeps ``my_dictionary()``
        working and also allows initial contents to be passed.
        """
        super().__init__(*args, **kwargs)

    def add(self, key, value):
        """Store ``value`` under ``key``, overwriting any existing entry."""
        self[key] = value


Decision Tree Regressor

In [6]:
### Depth List
# Candidate max_depth values 1..40, tried in increasing order for every metric.
Depth_list = list(range(1, 41))

# Column position separating predictors from targets once 'huc4' and 'region'
# are dropped: columns [0, 130) are iterated as metrics, columns [130, end)
# are the regression targets.
# NOTE(review): the notebook text mentions 171 HIT metrics but only the first
# 130 columns are used as metrics here - confirm the intended split.
N_METRIC_COLUMNS = 130

### Result columns, filled row by row and converted to a DataFrame afterwards
my_dict = {"Ecoregion":[],"Metric":[],"Run":[], "Importance":[]}
df_Test = pd.DataFrame()  # not used in this cell; kept in case another cell reads it
# Create the dataset
Counter = 0  # number of regions processed; not used in this cell
for i in dfs_fixed:
    Counter += 1

    huc4 = dfs_fixed[i].drop(['huc4', 'region'], axis=1)

    Metrics = huc4.iloc[:, 0:N_METRIC_COLUMNS]
    Fish = huc4.iloc[:, N_METRIC_COLUMNS:]

    # The target matrix is the same for every metric in this region, so it is
    # built once here instead of once per metric.
    Fish_arr = Fish.to_numpy()
    y = Fish_arr

    for j in Metrics:
        # One single-column predictor matrix per metric.
        Metrics_arr = huc4[[j]].to_numpy()
        X = Metrics_arr

        for index, depth in enumerate(Depth_list):

            # Fit a tree capped at the current candidate depth.
            regr = DecisionTreeRegressor(max_depth=depth)
            model = regr.fit(X, y)
            model_depth = model.tree_.max_depth

            # `index` is depth - 1, so this triggers as soon as the fitted
            # tree is shallower than the requested cap, i.e. allowing more
            # depth no longer grows the tree. Deeper candidates are skipped.
            if index >= model_depth:
                break

            # Score on the same data the tree was fitted on (training R^2).
            Score_reg = regr.score(X, y)
            my_dict["Ecoregion"].append(i)
            my_dict["Metric"].append(j)
            my_dict["Run"].append(index)  # zero-based position in Depth_list
            my_dict["Importance"].append(Score_reg)


Export the results DataFrame to Excel

In [7]:
# Turn the collected result columns into a table, save it as an Excel
# workbook, and echo it.
output_xlsx = "Metric_Runs_V2.xlsx"
df = pd.DataFrame(data=my_dict)
df.to_excel(output_xlsx)
print(df)

       Ecoregion Metric  Run  Importance
0              1    ma1    0    0.567633
1              1    ma1    1    0.571260
2              1    ma1    2    0.574964
3              1    ma1    3    0.581158
4              1    ma1    4    0.588946
...          ...    ...  ...         ...
29789         10    ra8   12    0.943550
29790         10    ra8   13    0.946715
29791         10    ra8   14    0.949211
29792         10    ra8   15    0.950762
29793         10    ra8   16    0.950832

[29794 rows x 4 columns]
