# Data Preprocessing I
*This script contains the preprocessing steps performed to create a clean and structured dataset for further processing and analysis.
The raw data is contained in several Excel sheets. These are first imported and cleaned separately, then processed into a suitable format for further analysis.*

In [1]:
import pandas as pd
import os
import re

## Prescription data sheet

In [2]:
# Load prescription data from excel sheet.
# The path is assembled with os.path.join so that it resolves on every OS
# (a raw string with backslashes only works on Windows).
file_path = os.path.join(
    'Data_Masterthesis',
    '3-Harvesting_Schedule',
    'Presc_dataset21_missingstakeholders_sol.xlsx',
)

# Read 1st sheet into df
presc_df = pd.read_excel(file_path, sheet_name=0)

# Look at DataFrame shape and columns
print(presc_df.shape)
print(presc_df.columns)
presc_df

(317883, 41)
Index(['UG', 'ug_Sp', 'area', 'presc', 'Control', 'Na sol MaxRES',
       'Na sol MaxWood', 'period', 'species', 'c', 'v (m3)', 'vthin (m3) ',
       'vharv (m3)', 'vthin (ton)', 'vharv (ton)', 'Vremovido(ton)', 'fmm1',
       'fmm2', 'fmm3', 'fmm4', 'fmm5', 'fmm6', 'fmm7', 'fmm8', 'fmm9', 'fmm10',
       'fmm11', 'npv0', 'npv5', 'npv10', 'biodiversity0', 'biodiversity5',
       'biodiversity10', 'rait0', 'rait5', 'rait10', 'rit0', 'rit5', 'rit10',
       'erosion', 'Cortiça'],
      dtype='object')


Unnamed: 0,UG,ug_Sp,area,presc,Control,Na sol MaxRES,Na sol MaxWood,period,species,c,...,biodiversity5,biodiversity10,rait0,rait5,rait10,rit0,rit5,rit10,erosion,Cortiça
0,1,1_Ec,42.601782,1,1.1000,,,1,Ec,20290.7145,...,68.8904,73.4017,132.0655,166.1470,136.3257,0.9880,0.9963,0.9880,81862.7924,0.0000
1,1,1_Ec,42.601782,1,1.1000,,,2,Ec,36915.3071,...,68.6481,72.9810,140.5859,166.1470,140.5859,0.9912,0.9963,0.9883,78103.5629,0.0000
2,1,1_Ec,42.601782,1,1.1000,,,3,Ec,31452.4225,...,68.7367,73.1087,144.8461,166.1470,140.5859,0.9919,0.9967,0.9895,77765.0774,0.0000
3,1,1_Ec,42.601782,1,1.1000,,,4,Ec,33321.7817,...,67.4269,69.3376,161.8868,170.4071,161.8868,0.9959,0.9975,0.9959,70974.4382,0.0000
4,1,1_Ec,42.601782,1,1.1000,,,5,Ec,31034.9927,...,68.7400,73.1555,149.1062,166.1470,140.5859,0.9918,0.9966,0.9891,81249.0464,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317878,1626,1626_Pb,0.761332,503700,1626.5037,,,6,Sb,30.8645,...,1.0996,1.1132,3.0453,3.0453,3.0453,4.7000,4.8000,4.8000,633.4482,151.3299
317879,1626,1626_Pb,0.761332,503700,1626.5037,,,7,Sb,42.4194,...,1.1208,1.1306,3.0453,3.0453,3.0453,5.0000,5.0000,5.0000,19.9468,232.7392
317880,1626,1626_Pb,0.761332,503700,1626.5037,,,8,Sb,51.7995,...,1.1461,1.1556,3.0453,3.0453,3.0453,5.0000,5.0000,5.0000,12.5449,232.2747
317881,1626,1626_Pb,0.761332,503700,1626.5037,,,9,Sb,59.9478,...,1.1713,1.1808,3.0453,3.0453,3.0453,5.0000,5.0000,5.0000,12.5572,231.5058


### Step 0: Subset the data
Create a smaller df:
- without the columns that are not to be used in this problem,
- that only contains data for the timeframe under study, i.e. the first 5 time periods (=50 years).

In [3]:
# Columns kept for this problem: identifiers, solution flags, volumes and
# the three fire resistance variants.
# Note that 'vthin (m3) ' carries a trailing space in the raw sheet.
columns_to_keep = [
    'UG', 'ug_Sp', 'species', 'presc', 'area', 'Control',
    'Na sol MaxRES', 'Na sol MaxWood', 'period',
    'v (m3)', 'vthin (m3) ', 'vharv (m3)',
    'vthin (ton)', 'vharv (ton)', 'Vremovido(ton)',
    'rait0', 'rait5', 'rait10',
]

# Work on an independent copy so the raw frame stays untouched
df = presc_df.loc[:, columns_to_keep].copy()
print(df.shape)
df.head(n=1)

(317883, 18)


Unnamed: 0,UG,ug_Sp,species,presc,area,Control,Na sol MaxRES,Na sol MaxWood,period,v (m3),vthin (m3),vharv (m3),vthin (ton),vharv (ton),Vremovido(ton),rait0,rait5,rait10
0,1,1_Ec,Ec,1,42.601782,1.1,,,1,1222.6712,0.0,4443.3659,0.0,2999.271982,2999.271982,132.0655,166.147,136.3257


Basic formatting (strip spaces etc.):

In [4]:
# Fix the dtypes per column group, capitalise 'presc' and strip the spaces
# from all column names.
text_columns = ['UG', 'ug_Sp', 'species', 'presc', 'Control']
flag_columns = ['Na sol MaxRES', 'Na sol MaxWood']
float_columns = [
    'v (m3)', 'vthin (m3) ', 'vharv (m3)',
    'vthin (ton)', 'vharv (ton)', 'Vremovido(ton)',
    'rait0', 'rait5', 'rait10',
]

column_dtypes = {
    **dict.fromkeys(text_columns, str),
    **dict.fromkeys(flag_columns, object),
    'period': int,
    **dict.fromkeys(float_columns, float),
}
df = df.astype(column_dtypes)

# capitalize col name presc
df = df.rename(columns={'presc': 'Presc'})

# remove spaces from col names (e.g. 'vthin (m3) ' -> 'vthin(m3)')
df.columns = df.columns.str.replace(' ', '')

df.head(n=1)

Unnamed: 0,UG,ug_Sp,species,Presc,area,Control,NasolMaxRES,NasolMaxWood,period,v(m3),vthin(m3),vharv(m3),vthin(ton),vharv(ton),Vremovido(ton),rait0,rait5,rait10
0,1,1_Ec,Ec,1,42.601782,1.1,,,1,1222.6712,0.0,4443.3659,0.0,2999.271982,2999.271982,132.0655,166.147,136.3257


In [5]:
# Keep only the timeframe under study: periods 1 to 5
print(df.shape)
df['period'] = df['period'].astype(int)
within_timeframe = df['period'] <= 5
df = df[within_timeframe]
df.shape

(317883, 18)


(169693, 18)

### Step 1: Check data consistency/plausibility
Check if

    Control == UG.Presc
    ug_Sp == UG_species
    (vthin(ton) = 0) IFF (vthin(m3) = 0)
    (vharv(ton) = 0) IFF (vharv(m3) = 0)
    vthin(ton)+vharv(ton) == Vremovido(ton)

In [6]:
## some data plausibility checks
# Each check builds a boolean mask over the rows of `df` (True = consistent)
# and prints a one-line verdict. The masks stay available for later cells.

# Remove trailing zeros so that the text of 'Presc' can be compared with the
# decimals of 'Control'.
# NOTE(review): rstrip('0') is lossy for 'Presc' ('10' and '100' both become '1').
# The solution tables are normalised the same way further down in this notebook,
# so the behaviour is kept here; confirm that no two prescriptions of the same UG
# collide after stripping.
df['Control'] = df['Control'].str.rstrip('0')
df['Presc'] = df['Presc'].str.rstrip('0')

# Check if concatenating 'UG' with a dot ('.') and 'Presc' gives the 'Control' column
check_control_column = (df['UG'] + '.' + df['Presc']) == df['Control']

if check_control_column.all():
    print("\u2713 column 'Control' is consistent. Can be dropped.")
else:
    print("\u26A0 column 'Control' has a problem.")

# Check if concatenating 'UG' with species gives the 'ug_sp' column
check_ugsp_column = (df['UG'] + '_' + df['species']) == df['ug_Sp']

if check_ugsp_column.all():
    print("\u2713 column 'ug_Sp' is consistent. Can be dropped.")
else:
    print("\u26A0 column 'ug_Sp' has a problem.", len(df) - check_ugsp_column.sum(), 'inconsistencies')

# Check if vthin (ton) and vthin (m3) are either both zero or both nonzero.
# A NaN compares as "nonzero", so NaN paired with 0 is reported as inconsistent.
bmask_thinning = ((df['vthin(ton)'] == 0) & (df['vthin(m3)'] == 0)) | ((df['vthin(ton)'] != 0) & (df['vthin(m3)'] != 0))
if bmask_thinning.all():
    print("\u2713 Thinning:")
else:
    print("\u26A0 Thinning:")
print(f"There are {len(df) - bmask_thinning.sum()} rows where 'vthin(ton)' and 'vthin(m3)' are inconsistent.")

# Check if vharv (ton) and vharv (m3) are either both zero or both nonzero
bmask_harvest = ((df['vharv(ton)'] == 0) & (df['vharv(m3)'] == 0)) | ((df['vharv(ton)'] != 0) & (df['vharv(m3)'] != 0))
if bmask_harvest.all():
    print("\u2713 Harvest:")
else:
    print("\u26A0 Harvest:")
print(f"There are {len(df) - bmask_harvest.sum()} rows where 'vharv(ton)' and 'vharv(m3)' are inconsistent.")

# check if the amount of removed timber equals the sum of harvested and thinned timber
bmask_removido = df['Vremovido(ton)'] == (df['vthin(ton)'] + df['vharv(ton)'])
if bmask_removido.all():
    print("\u2713 Removed wood:")
else:
    # label fixed: this branch used to print "Harvest:" for the removed-wood check
    print("\u26A0 Removed wood:")
print(f"There are {len(df) - bmask_removido.sum()} rows where the sum of 'vthin (ton)' and 'vharv (ton)' does not equal 'Vremovido(ton)'.")

✓ column 'Control' is consistent. Can be dropped.
⚠ column 'ug_Sp' has a problem. 108525 inconsistencies
⚠ Thinning:
There are 5565 rows where 'vthin(ton)' and 'vthin(m3)' are inconsistent.
✓ Harvest:
There are 0 rows where 'vharv(ton)' and 'vharv(m3)' are inconsistent.
✓ Removed wood:
There are 0 rows where the sum of 'vthin (ton)' and 'vharv (ton)' does not equal 'Vremovido(ton)'.


&#9888; We need to address the data inconsistencies found above.
### Step 2: Address data inconsistencies

#### a. Resolve inconsistencies between `vthin(ton)` and `vthin(m3)` 
Have a look at the problematic rows:

In [7]:
# Rows that failed the thinning consistency check
thinning_problem_rows = df[~bmask_thinning]
print(thinning_problem_rows.shape)
thinning_problem_rows.head(7)

(5565, 18)


Unnamed: 0,UG,ug_Sp,species,Presc,area,Control,NasolMaxRES,NasolMaxWood,period,v(m3),vthin(m3),vharv(m3),vthin(ton),vharv(ton),Vremovido(ton),rait0,rait5,rait10
492,1,1_Ec,Ec,9001,42.6,1.9001,,,1,7263.3,,0.0,0.0,0.0,0.0,140.58,166.14,136.32
493,1,1_Ec,Ec,9001,42.6,1.9001,,,2,12975.96,,0.0,0.0,0.0,0.0,51.12,170.4,153.36
494,1,1_Ec,Ec,9001,42.6,1.9001,,,3,16916.46,,0.0,0.0,0.0,0.0,42.6,178.92,157.62
495,1,1_Ec,Ec,9001,42.6,1.9001,,,4,19736.58,,0.0,0.0,0.0,0.0,42.6,178.92,161.88
496,1,1_Ec,Ec,9001,42.6,1.9001,,,5,21849.54,,0.0,0.0,0.0,0.0,42.6,183.18,166.14
532,2,2_Ec,Ec,9002,8.26,2.9002,,,1,454.3,,0.0,0.0,0.0,0.0,33.04,38.822,33.04
533,2,2_Ec,Ec,9002,8.26,2.9002,,,2,893.732,,0.0,0.0,0.0,0.0,14.042,39.648,34.692


Problem seems to be caused by NaN values in `vthin(m3)`. Replace them by 0 and check if the problem persists:

In [8]:
# replace NaN by 0
# The filled column is assigned back to `df`. Calling fillna(inplace=True) on the
# selected column is chained assignment and leaves `df` unchanged when pandas
# Copy-on-Write is active.
df['vthin(m3)'] = df['vthin(m3)'].fillna(0)

# Re-run the check: vthin (ton) and vthin (m3) must be either both zero or both nonzero
bmask_thinning = ((df['vthin(ton)'] == 0) & (df['vthin(m3)'] == 0)) | ((df['vthin(ton)'] != 0) & (df['vthin(m3)'] != 0))
if bmask_thinning.all():
    print("\u2713 Thinning:")
else:
    print("\u26A0 Thinning:")
print(f"There are {len(df) - bmask_thinning.sum()} rows where 'vthin(ton)' and 'vthin(m3)' are inconsistent.")

✓ Thinning:
There are 0 rows where 'vthin(ton)' and 'vthin(m3)' are inconsistent.


#### b. Address inconsistencies between `ug_Sp` and `species` 
Let's look at the problematic rows:

In [9]:
# Rows where 'ug_Sp' does not equal UG + '_' + species
ugsp_mismatch = ~check_ugsp_column
df[ugsp_mismatch]

Unnamed: 0,UG,ug_Sp,species,Presc,area,Control,NasolMaxRES,NasolMaxWood,period,v(m3),vthin(m3),vharv(m3),vthin(ton),vharv(ton),Vremovido(ton),rait0,rait5,rait10
30,1,1_Ec,Ct,1312,42.601782,1.1312,,,1,0.0000,0.0000,0.0000,0.000000,0.000000,0.000000,40.2712,51.1221,63.9027
32,1,1_Ec,Ct,1312,42.601782,1.1312,,,2,941.2790,0.0000,0.0000,0.000000,0.000000,0.000000,146.2009,170.4071,213.0089
33,1,1_Ec,Ct,1312,42.601782,1.1312,,,3,4879.1639,65.1807,0.0000,50.840946,0.000000,50.840946,151.4155,170.4071,213.0089
34,1,1_Ec,Ct,1312,42.601782,1.1312,,,4,6782.4034,2023.5847,0.0000,1578.396066,0.000000,1578.396066,160.0579,170.4071,213.0089
35,1,1_Ec,Ct,1312,42.601782,1.1312,,,5,0.0000,3208.3402,5405.6176,2502.505356,4216.381728,6718.887084,156.0558,170.4071,213.0089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317865,1626,1626_Pb,Sb,5027,0.761332,1626.5027,,,4,10.0572,0.0000,0.0000,0.000000,0.000000,0.000000,1.2943,1.5227,1.4465
317866,1626,1626_Pb,Sb,5027,0.761332,1626.5027,,,5,26.0985,0.0000,0.0000,0.000000,0.000000,0.000000,3.0453,3.0453,3.0453
317875,1626,1626_Pb,Sb,5037,0.761332,1626.5037,,,3,0.0000,0.0000,0.0000,0.000000,0.000000,0.000000,0.1523,0.1523,0.1523
317876,1626,1626_Pb,Sb,5037,0.761332,1626.5037,,,4,4.6898,0.0000,0.0000,0.000000,0.000000,0.000000,0.9136,0.9897,0.9897


Look at the data for just one Presc value (e.g. Presc=1312):

In [10]:
# Inspect a single prescription ('Presc' is stored as text)
is_example_presc = df['Presc'] == '1312'
df[is_example_presc].head(6)

Unnamed: 0,UG,ug_Sp,species,Presc,area,Control,NasolMaxRES,NasolMaxWood,period,v(m3),vthin(m3),vharv(m3),vthin(ton),vharv(ton),Vremovido(ton),rait0,rait5,rait10
30,1,1_Ec,Ct,1312,42.601782,1.1312,,,1,0.0,0.0,0.0,0.0,0.0,0.0,40.2712,51.1221,63.9027
31,1,1_Ec,Ec,1312,42.601782,1.1312,,,1,0.0,0.0,4443.3659,0.0,2999.271982,2999.271982,57.0695,110.7646,115.0248
32,1,1_Ec,Ct,1312,42.601782,1.1312,,,2,941.279,0.0,0.0,0.0,0.0,0.0,146.2009,170.4071,213.0089
33,1,1_Ec,Ct,1312,42.601782,1.1312,,,3,4879.1639,65.1807,0.0,50.840946,0.0,50.840946,151.4155,170.4071,213.0089
34,1,1_Ec,Ct,1312,42.601782,1.1312,,,4,6782.4034,2023.5847,0.0,1578.396066,0.0,1578.396066,160.0579,170.4071,213.0089
35,1,1_Ec,Ct,1312,42.601782,1.1312,,,5,0.0,3208.3402,5405.6176,2502.505356,4216.381728,6718.887084,156.0558,170.4071,213.0089


In this example, the species (column `species`) changes from Ec to Ct during period 1. Presumably, whenever `ug_Sp` is not consistent with `species`, this indicates a species change during some period.

In [11]:
# A species change shows up as several rows sharing the same UG, ug_Sp,
# prescription and period.
change_keys = ['UG','ug_Sp','Presc','period']
shares_keys = df.duplicated(subset=change_keys, keep=False)
species_changes = df[shares_keys].copy()
species_changes

Unnamed: 0,UG,ug_Sp,species,Presc,area,Control,NasolMaxRES,NasolMaxWood,period,v(m3),vthin(m3),vharv(m3),vthin(ton),vharv(ton),Vremovido(ton),rait0,rait5,rait10
30,1,1_Ec,Ct,1312,42.601782,1.1312,,,1,0.0000,0.0,0.0000,0.0,0.000000,0.000000,40.2712,51.1221,63.9027
31,1,1_Ec,Ec,1312,42.601782,1.1312,,,1,0.0000,0.0,4443.3659,0.0,2999.271982,2999.271982,57.0695,110.7646,115.0248
41,1,1_Ec,Ct,1313,42.601782,1.1313,,,1,0.0000,0.0,0.0000,0.0,0.000000,0.000000,40.2712,51.1221,63.9027
42,1,1_Ec,Ec,1313,42.601782,1.1313,,,1,0.0000,0.0,4443.3659,0.0,2999.271982,2999.271982,57.0695,110.7646,115.0248
52,1,1_Ec,Ct,1314,42.601782,1.1314,,,1,0.0000,0.0,0.0000,0.0,0.000000,0.000000,40.2712,51.1221,63.9027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317852,1626,1626_Pb,Sb,5017,0.761332,1626.5017,,,2,0.0000,0.0,0.0000,0.0,0.000000,0.000000,0.1523,0.1523,0.1523
317863,1626,1626_Pb,Pb,5027,0.761332,1626.5027,,,3,0.0000,0.0,11.5722,0.0,6.943320,6.943320,0.3045,0.4568,0.4568
317864,1626,1626_Pb,Sb,5027,0.761332,1626.5027,,,3,0.2132,0.0,0.0000,0.0,0.000000,0.000000,0.6091,0.6091,0.6091
317874,1626,1626_Pb,Pb,5037,0.761332,1626.5037,,,3,0.0000,0.0,15.5312,0.0,9.318720,9.318720,0.9897,1.3704,1.2943


Look at the species changes in more detail later.

### Step 3: More data subsetting
Some columns are duplicates of some form, some are just not relevant for the problem under study. They will be dropped.

In [12]:
# Columns that are redundant or not needed for the problem under study
obsolete_columns = ['Control', 'v(m3)', 'vthin(m3)', 'vharv(m3)', 'vthin(ton)', 'vharv(ton)']
df = df.drop(columns=obsolete_columns)
df.head(n=1)

Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10
0,1,1_Ec,Ec,1,42.601782,,,1,2999.271982,132.0655,166.147,136.3257


## Annual prescription data sheet

In [13]:
# The annual prescription data is the 2nd sheet (index 1) of the same workbook
ANNUAL_SHEET_INDEX = 1
presc_annual_df = pd.read_excel(file_path, sheet_name=ANNUAL_SHEET_INDEX)

print(presc_annual_df.shape)
presc_annual_df.head(n=1)

(178243, 69)


Unnamed: 0,Periodo,UG,ug_sp,ug_presc,Si,fmm,areafmm,Area,Presc,Na sol MaxRES,...,Cbark,Cbranch,Cleaf,Croot,Nthin,CharvTree,CharvCork,CharvResin,CTree,cashflow
0,1,1,1_Ec,1.1,4,4,42.601782,42.601782,1,,...,0.0,0.0,0.0,0.0,0,1772.660162,0.0,0.0,0.0,131588.385201


### Step 0: Subset the data

In [14]:
# subset for correct time and look at df head
# NOTE(review): `presc_anual_df` (single "n") is a different name than
# `presc_annual_df`; it is kept so existing references keep working.
# .copy() turns the filtered rows into an independent frame, so the column
# assignment below no longer triggers SettingWithCopyWarning.
presc_anual_df = presc_annual_df[presc_annual_df['Periodo']<=5].copy()

# keep only the last two characters of 'ug_sp' (e.g. '1_Ec' -> 'Ec')
presc_anual_df['ug_sp'] = presc_anual_df['ug_sp'].str[-2:]
presc_anual_df.head(6)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  presc_anual_df['ug_sp'] = presc_anual_df['ug_sp'].str[-2:]


Unnamed: 0,Periodo,UG,ug_sp,ug_presc,Si,fmm,areafmm,Area,Presc,Na sol MaxRES,...,Cbark,Cbranch,Cleaf,Croot,Nthin,CharvTree,CharvCork,CharvResin,CTree,cashflow
0,1,1,Ec,1.1,4,4,42.601782,42.601782,1,,...,0.0,0.0,0.0,0.0,0,1772.660162,0.0,0.0,0.0,131588.385201
1,2,1,Ec,1.1,4,4,42.601782,42.601782,1,,...,0.0,0.0,0.0,0.0,0,3100.131699,0.0,0.0,0.0,232827.260682
2,3,1,Ec,1.1,4,4,42.601782,42.601782,1,,...,0.0,0.0,0.0,0.0,0,2673.155336,0.0,0.0,0.0,180591.085284
3,4,1,Ec,1.1,4,4,42.601782,42.601782,1,,...,0.0,0.0,0.0,0.0,0,2835.042108,0.0,0.0,0.0,206908.336318
4,5,1,Ec,1.1,4,4,42.601782,42.601782,1,,...,0.0,0.0,0.0,0.0,0,2648.872319,0.0,0.0,0.0,196053.402191
10,1,1,Ec,1.2,4,4,42.601782,42.601782,2,,...,0.0,0.0,0.0,0.0,0,2029.655414,0.0,0.0,0.0,150307.608334


In [15]:
# Show every column from here on (global pandas display option)
pd.set_option('display.max_columns', None)

# Rows of the full annual sheet that share UG, period and prescription
annual_is_repeated = presc_annual_df.duplicated(subset=['UG','Periodo','Presc'], keep=False)
annual_repeated = presc_annual_df[annual_is_repeated].sort_values(by=['UG','Periodo','Year'])
print(annual_repeated.head(5))

     Periodo  UG ug_sp  ug_presc  Si  fmm    areafmm       Area  Presc  \
186        4   1  1_Qr    1.2600   4    6  42.601782  42.601782   2600   
193        4   1  1_Qr    1.2601   4    6  42.601782  42.601782   2601   
201        4   1  1_Qr    1.2602   4    6  42.601782  42.601782   2602   
209        4   1  1_Qr    1.2603   4    6  42.601782  42.601782   2603   
215        4   1  1_Qr    1.2604   4    6  42.601782  42.601782   2604   

    Na sol MaxRES Na sol MaxWood  Year  Ttotal   t  Rot      hdom    Nst  \
186           NaN            NaN  2050      31  23    1  2.682371  47927   
193           NaN            NaN  2050      31  23    1  2.682371  47909   
201           NaN            NaN  2050      31  23    1  2.682371  47909   
209           NaN            NaN  2050      31  23    1  2.682371  47909   
215           NaN            NaN  2050      31  23    1  2.682371  47909   

         N  Ndead  FW         G  Gu       dg  dug  cdw  pcob  cesp  \
186  47927    724 NaN  2.933

Not clear if this data will be used. Deal with it later.

## Solution data sheet
MaxRes and MaxWood solution are coming from two separate excel sheets. The stakeholder solution needs to be created from a different file.

In [16]:
# Read solution sheets into dfs.
# Both sheets live in the same workbook, so it is parsed once: passing a list to
# `sheet_name` returns a dict {sheet index: DataFrame}.
solution_workbook = 'Data_Masterthesis/3-Harvesting_Schedule/Presc_dataset21_missingstakeholders_sol.xlsx'
solution_sheets = pd.read_excel(solution_workbook, sheet_name=[2, 3])

# sheet index 3 -> MaxRes solution; everything is handled as text
sol_mres = solution_sheets[3].astype(str)
sol_mres.name = "MaxRes"

# sheet index 2 -> MaxWood solution
sol_mwood = solution_sheets[2].astype(str)
sol_mwood.name = "MaxWood"

for sol in [sol_mres, sol_mwood]:
    print(sol.name, sol.shape)
    print(sol.head(1))
    print('-----------------------------------------------------------------')

MaxRes (1512, 7)
  Decision Variable name  Control Presc UG  Sp ShPer  Sol
0      Presc1601_Pa1_Ec_5  1.1601  1601  1  Ec     5  Yes
-----------------------------------------------------------------
MaxWood (1512, 7)
  Decision Variable name  Control Presc UG  Sp ShPer  Sol
0      Presc3502_Pa1_Ec_0  1.3502  3502  1  Ec     0  Yes
-----------------------------------------------------------------


#### Step 0: Check data consistency of MaxWood and MaxRes solution
Check if:
1. `Decision Variable name` == `"Presc" + Presc + "_Pa" + UG + "_" + Sp + "_" + ShPer`
2. `Control` == `UG`.`Presc`
3. `Sol` == 'Yes' in every row

In [17]:
## Check data consistency and clean up unnecessary stuff
# For each solution table: verify that the derived columns carry no extra
# information, then reduce the table to ['UG', 'Presc', 'Sp', 'ShPer'].

solutions = [(sol_mres, 'sol_mres'), (sol_mwood, 'sol_mwood')]
cleaned_solutions = {}
for sol, name in solutions:
    print('-----------------------------------------------------------------')
    print(name)

    # change dtypes (also yields a new frame, so the input table is not modified)
    sol = sol.astype(str)

    # Check if concatenating Presc, UG, Sp and ShPer gives the 'Decision Variable name ' column.
    # This runs before the trailing zeros are stripped from 'Presc'.
    expected_dvar = 'Presc' + sol['Presc'] + "_Pa" + sol['UG'] + "_" + sol['Sp'] + "_" + sol['ShPer']
    check_Dvar_column = expected_dvar == sol['Decision Variable name ']

    if check_Dvar_column.all():
        print("\u2713 column 'Decision Variable name ' is consistent. Can be dropped.")
    else:
        print("\u26A0 column 'Decision Variable name ' has a problem.")

    # Remove trailing zeros
    # NOTE(review): lossy for 'Presc' ('10' -> '1'); mirrors the treatment of the
    # prescription table so that both sides of the later merge agree.
    sol['Control'] = sol['Control'].str.rstrip('0')
    sol['Presc'] = sol['Presc'].str.rstrip('0')

    # Check if concatenating 'UG' with a dot ('.') and 'Presc' gives the 'Control' column
    check_control_column = (sol['UG'] + '.' + sol['Presc']) == sol['Control']

    if check_control_column.all():
        print("\u2713 column 'Control' is consistent. Can be dropped.")
    else:
        print("\u26A0 column 'Control' has a problem.")

    # Check if column 'Sol' contains values other than 'Yes'.
    # Testing the value itself (not just the number of distinct values) also
    # catches a column that uniformly holds something else.
    if (sol['Sol'] == 'Yes').all():
        print("\u2713 column 'Sol' contains no additional information. Can be dropped.")
    else:
        # Display the values that were found
        print('\u26A0', sol['Sol'].unique())

    # change the order and drop unnecessary columns; copy so later column
    # assignments operate on an independent frame
    sol = sol[['UG', 'Presc', 'Sp', 'ShPer']].copy()
    print(sol.shape)
    print(sol.head(1))

    cleaned_solutions[name] = sol

# Update the original DataFrames
sol_mres = cleaned_solutions['sol_mres']
sol_mwood = cleaned_solutions['sol_mwood']

-----------------------------------------------------------------
sol_mres
✓ column 'Decision Variable name ' is consistent. Can be dropped.
✓ column 'Control' is consistent. Can be dropped.
✓ column 'Sol' contains no additional information. Can be dropped.
(1512, 4)
  UG Presc  Sp ShPer
0  1  1601  Ec     5
-----------------------------------------------------------------
sol_mwood
✓ column 'Decision Variable name ' is consistent. Can be dropped.
✓ column 'Control' is consistent. Can be dropped.
✓ column 'Sol' contains no additional information. Can be dropped.
(1512, 4)
  UG Presc  Sp ShPer
0  1  3502  Ec     0


### Step 1: Bring stakeholder solution to the same format as maxwood and maxres

In [18]:
# Extract stakeholder solution from logfile
# (`re` is imported in the first cell of the notebook)

log_file_path = 'Data_Masterthesis/3-Harvesting_Schedule/Stakeholders_Solution.log'

# Open and read the log file
with open(log_file_path, 'r') as file:
    log_content = file.read()

# Use regular expression to find the variables and their values:
# a token starting with "Presc", whitespace, then a number with exactly six decimals
pattern = r'(Presc\S+)\s+(\d+\.\d{6})'
matches = re.findall(pattern, log_content)

# Same (variable, value) pairs under the second name used in this notebook;
# the log is scanned only once.
variables_values = matches

# Create a DataFrame from the matches
sol_stake = pd.DataFrame(matches, columns=['Variable Name', 'Solution Value'])

# Convert the 'Solution Value' column to float
sol_stake['Solution Value'] = sol_stake['Solution Value'].astype(float)

print(sol_stake.shape)
sol_stake.head(1)

(1514, 2)


Unnamed: 0,Variable Name,Solution Value
0,Presc503_Pa101_Pb_10,1.0


In [19]:
# Split the variable name (e.g. 'Presc503_Pa101_Pb_10') into separate columns.
# Each pattern has exactly one capture group; a name that does not match raises.
name_part_patterns = {
    'UG': r'_Pa(\d+)_',
    'ug_Sp': r'_Pa(\d+_[A-Za-z]{2})_',
    'Presc': r'Presc(\d+)_Pa',
    'ShPer': r'_[A-Za-z]{2}_(\d+)$',
}
for part_column, part_pattern in name_part_patterns.items():
    sol_stake[part_column] = sol_stake['Variable Name'].apply(
        lambda text, rx=part_pattern: re.search(rx, text).group(1)
    )

# remove trailing zeros
sol_stake['Presc'] = sol_stake['Presc'].str.rstrip('0')

sol_stake.head(n=1)

Unnamed: 0,Variable Name,Solution Value,UG,ug_Sp,Presc,ShPer
0,Presc503_Pa101_Pb_10,1.0,101,101_Pb,503,10


In [20]:
# Keep everything except 'Variable Name', in the order used by the other solutions
stake_columns = ['UG','Presc','ug_Sp','ShPer', 'Solution Value']
sol_stake = sol_stake.loc[:, stake_columns]
sol_stake.head(n=1)

Unnamed: 0,UG,Presc,ug_Sp,ShPer,Solution Value
0,101,503,101_Pb,10,1.0


### Step 2: Check data consistency of stakeholder solution

In [21]:
# Check whether 'Solution Value' holds more than one distinct value
distinct_solution_values = sol_stake['Solution Value'].unique()
if len(distinct_solution_values) <= 1:
    print("\u2713 column 'Solution Value' contains no additional information. Can be dropped.")
else:
    # Display the values that occur
    print('\u26A0 there are different solution values:', distinct_solution_values)

⚠ there are different solution values: [1.       0.494295 0.567668 0.505705 0.432332]


In [22]:
# Rows whose solution value differs from 1, ordered by UG
# ('UG' is text here, so the ordering is lexicographic)
is_fractional = sol_stake['Solution Value'] != 1
weird = sol_stake[is_fractional].sort_values(by='UG')
weird

Unnamed: 0,UG,Presc,ug_Sp,ShPer,Solution Value
298,1253,36503,1253_Ec,10,0.494295
1134,1253,36,1253_Ec,5,0.505705
776,1339,197,1339_Ec,0,0.567668
1155,1339,19318,1339_Ec,0,0.432332


⚠ There are two UGs (1253 and 1339) with two prescriptions each that have Solution Value <1. This might be caused by the linear relaxation of the problem (not only 0 or 1 are allowed for solution value). Look at this later.

### Step 3: Prepare solution data for merge with prescription data

In [23]:
# Build 'ug_Sp' ("<UG>_<Sp>") in maxRes and maxWood. It is needed for the merge
# with presc, so that species changes remain identifiable.
sol_mres['ug_Sp'] = sol_mres['UG'].str.cat(sol_mres['Sp'], sep='_')
sol_mwood['ug_Sp'] = sol_mwood['UG'].str.cat(sol_mwood['Sp'], sep='_')
sol_mres.head(n=1)

Unnamed: 0,UG,Presc,Sp,ShPer,ug_Sp
0,1,1601,Ec,5,1_Ec


## Merge prescription data into each solution df

To get all information needed for the selected solutions, we need to merge each solution with the `presc`-dataframe containing the needed data.

In [24]:
# First row of the prescription df (left side of the merge)
df.head(n=1)

Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10
0,1,1_Ec,Ec,1,42.601782,,,1,2999.271982,132.0655,166.147,136.3257


In [25]:
# First row of the stakeholder solution df (right side of the merge)
sol_stake.head(n=1)

Unnamed: 0,UG,Presc,ug_Sp,ShPer,Solution Value
0,101,503,101_Pb,10,1.0


### Step 0: Prepare the dfs for merge (formatting)

In [26]:
# prepare for merge: make sure the merge key columns are text in every solution df.
# Rebinding a loop variable (`datafr = datafr.astype(str)`) never changes the
# original frames, so each frame is reassigned explicitly.
# Only the key columns are cast: 'Solution Value' must stay numeric because
# later cells compare it with 1.
merge_key_dtypes = {'UG': str, 'ug_Sp': str, 'Presc': str}
sol_mres = sol_mres.astype(merge_key_dtypes)
sol_mwood = sol_mwood.astype(merge_key_dtypes)
sol_stake = sol_stake.astype(merge_key_dtypes)

# the prescription df is converted to text as a whole (NaN becomes the string 'nan')
df = df.astype(str)

### Step 1: Merge (right) prescription df and each solution df on columns `UG`, `ug_Sp` and `Presc`

In [27]:
## Right-merge the prescription data onto every solution (keys: 'Presc', 'UG', 'ug_Sp')
print('before merge:', df.shape, len(sol_mres), len(sol_mwood), len(sol_stake))
print('-------------------------')

merge_keys = ['Presc', 'UG','ug_Sp']
solution_columns = ['UG','ug_Sp','ShPer','Presc']

# how='right' keeps one result row per matching prescription row of each selected solution
mres = df.merge(sol_mres[solution_columns], on=merge_keys, how='right')
mwood = df.merge(sol_mwood[solution_columns], on=merge_keys, how='right')
stake = df.merge(sol_stake[solution_columns + ['Solution Value']], on=merge_keys, how='right')

# label the frames; the label is read back through `.name` when printing
for frame_label, merged_frame in (('mres', mres), ('mwood', mwood), ('stake', stake)):
    merged_frame.name = frame_label

print('after merge:')
for i in [mres, mwood, stake]:
    print(i.name)
    print(i.shape)
    print('-----------')
# the loop variable still points at the last frame (stake)
i.head(1)

before merge: (169693, 12) 1512 1512 1514
-------------------------
after merge:
mres
(8212, 13)
-----------
mwood
(7670, 13)
-----------
stake
(8403, 14)
-----------


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer,Solution Value
0,101,101_Pb,Pb,503,8.17812088,Yes,Yes,1,0.0,11.4494,12.2672,11.4494,10,1.0


## Addressing special cases

#### Special case 1: The species planted changes during a period
For some `UG` and `period`, we have
- same `Presc`,
- same `area`,
- different `species`.

In [28]:
# Species changes in mres: rows sharing UG, prescription, period and area
spchange_keys = ['UG','Presc','period','area']
mres_is_spchange = mres.duplicated(subset=spchange_keys, keep=False)
mres_spchanges = mres[mres_is_spchange].copy()
print(len(mres_spchanges), 'species changes in mres')
mres_spchanges.head(4)

1304 species changes in mres


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer
0,1,1_Ec,Ec,1601,42.60178231,Yes,,1,2999.2719825,110.7646,115.0248,85.2036,5
1,1,1_Ec,Qr,1601,42.60178231,Yes,,1,0.0,51.1221,63.9027,51.1221,5
6,2,2_Ec,Ct,19301,8.263863486,Yes,,1,0.0,18.7209,28.9235,28.9235,0
7,2,2_Ec,Ec,19301,8.263863486,Yes,,1,90.365355,5.151,12.3958,10.743,0


In [29]:
# Species changes in mwood: rows sharing UG, prescription, period and area
spchange_keys = ['UG','Presc','period','area']
mwood_is_spchange = mwood.duplicated(subset=spchange_keys, keep=False)
mwood_spchanges = mwood[mwood_is_spchange].copy()
print(len(mwood_spchanges), 'species changes in mwood')
mwood_spchanges.head(4)

220 species changes in mwood


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer
0,1,1_Ec,Ec,3502,42.60178231,,Yes,1,3968.35605,136.3257,149.1062,119.285,0
1,1,1_Ec,Pb,3502,42.60178231,,Yes,1,0.0,4.2602,4.2602,4.2602,0
26,6,6_Ec,Ec,21502,8.758456017,,Yes,1,678.1015575000001,20.1444,20.1444,16.6411,10
27,6,6_Ec,Pb,21502,8.758456017,,Yes,1,0.0,8.7585,8.7585,8.7585,10


In [30]:
# Species changes in the stakeholder solution: rows sharing UG, prescription, period and area
spchange_keys = ['UG','Presc','period','area']
stake_is_spchange = stake.duplicated(subset=spchange_keys, keep=False)
stake_spchanges = stake[stake_is_spchange].copy()
print(len(stake_spchanges), 'species changes in stakeholder solution')
stake_spchanges.head(4)

1666 species changes in stakeholder solution


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer,Solution Value
13,1016,1016_Pb,Pb,500602,8.127216029,,,4,223.3359,1.6254,1.6254,1.6254,0,1.0
14,1016,1016_Pb,Qr,500602,8.127216029,,,4,0.0,32.5089,32.5089,32.5089,0,1.0
34,1066,1066_Pb,Pb,5007,7.916533087,Yes,,4,224.19624,4.7499,6.3332,4.7499,5,1.0
35,1066,1066_Pb,Sb,5007,7.916533087,Yes,,4,0.0,25.3329,26.9162,26.1246,5,1.0


#### Special case 2: Different species planted simultaneously in the same stand
There are some "mixed stands": Per same `UG` and `period`, we have:
- different `Presc`,
- different `area`,
- different `species`.

In [31]:
# Mixed stands in mres.
# 1) all rows whose (UG, period) occurs more than once
mres_multi_row = mres.duplicated(subset=['UG','period'], keep=False)
mres_dupl = mres[mres_multi_row].copy()
# 2) of those, drop the rows that also repeat area and prescription
mres_repeats_presc = mres_dupl.duplicated(subset=['UG','period','area', 'Presc'], keep=False)
mres_mixed = mres_dupl[~mres_repeats_presc].sort_values(by=['UG','period'])
print(len(mres_mixed),'mixed stands in mres')
mres_mixed.head(4)

917 mixed stands in mres


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer
5263,1015,1015_Ec,Ec,3,2.934230183,Yes,,1,47.336467500000005,9.683,11.7369,11.1501,5
5268,1015,1015_Pb,Pb,503,8.036243378,Yes,Yes,1,0.0,8.0362,9.6435,8.8399,5
5264,1015,1015_Ec,Ec,3,2.934230183,Yes,,2,102.99150000000002,9.683,11.7369,11.4435,5
5269,1015,1015_Pb,Pb,503,8.036243378,Yes,Yes,2,0.0,9.6435,11.2507,10.4471,5


In [32]:
# Mixed stands in mwood.
# 1) all rows whose (UG, period) occurs more than once
mwood_multi_row = mwood.duplicated(subset=['UG', 'period'], keep=False)
mwood_dupl = mwood[mwood_multi_row].copy()
# 2) of those, drop the rows that also repeat area and prescription
mwood_repeats_presc = mwood_dupl.duplicated(subset=['UG', 'period', 'area', 'Presc'], keep=False)
mwood_mixed = mwood_dupl[~mwood_repeats_presc].sort_values(by=['UG', 'period'])
print(len(mwood_mixed),'mixed stands in mwood')
mwood_mixed.head(4)

1042 mixed stands in mwood


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer
4952,1015,1015_Ec,Ec,2,2.934230183,,Yes,1,39.216015,9.9764,11.7369,11.1501,0
4957,1015,1015_Pb,Pb,503,8.036243378,Yes,Yes,1,0.0,8.0362,9.6435,8.8399,0
4953,1015,1015_Ec,Ec,2,2.934230183,,Yes,2,88.92915750000002,10.2698,11.7369,11.4435,0
4958,1015,1015_Pb,Pb,503,8.036243378,Yes,Yes,2,0.0,9.6435,11.2507,10.4471,0


In [33]:
# Mixed stands in the stakeholder solution.
# 1) all rows whose (UG, period) occurs more than once
stake_multi_row = stake.duplicated(subset=['UG', 'period'], keep=False)
stake_dupl = stake[stake_multi_row].copy()
# 2) of those, drop the rows that also repeat area and prescription
stake_repeats_presc = stake_dupl.duplicated(subset=['UG', 'period', 'area', 'Presc'], keep=False)
stake_mixed = stake_dupl[~stake_repeats_presc].sort_values(by=['UG', 'period'])
print(len(stake_mixed), 'mixed stands in stakeholder solution')
stake_mixed.head(4)

951 mixed stands in stakeholder solution


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer,Solution Value
5,1015,1015_Pb,Pb,503,8.036243378,Yes,Yes,1,0.0,8.0362,9.6435,8.8399,5,1.0
6147,1015,1015_Ec,Ec,3,2.934230183,Yes,,1,47.336467500000005,9.683,11.7369,11.1501,10,1.0
6,1015,1015_Pb,Pb,503,8.036243378,Yes,Yes,2,0.0,9.6435,11.2507,10.4471,5,1.0
6148,1015,1015_Ec,Ec,3,2.934230183,Yes,,2,102.99150000000002,9.683,11.7369,11.4435,10,1.0


#### Validate the data
Verify that the rows with mixed species and species changes together match the total rows for stands with multiple species in any period.

In [34]:
# Check per solution that mixed-stand rows plus species-change rows account for
# every row of a (UG, period) that occurs more than once.
partition_checks = [
    ('mres', mres_mixed, mres_spchanges, mres_dupl),
    ('mwood', mwood_mixed, mwood_spchanges, mwood_dupl),
    ('stake', stake_mixed, stake_spchanges, stake_dupl),
]
for solution_label, mixed_rows, spchange_rows, dupl_rows in partition_checks:
    print(solution_label)
    n_mixed, n_spchange, n_dupl = len(mixed_rows), len(spchange_rows), len(dupl_rows)
    # check mark and '=' when the sums agree, warning sign and '!=' otherwise
    marker, relation = ('\u2714', '=') if n_mixed + n_spchange == n_dupl else ('\u26A0', '!=')
    print(marker, n_mixed, 'mixed stands', '+', n_spchange, 'species changes', relation, n_dupl, 'stands with mixed species')

mres
✔ 917 mixed stands + 1304 species changes = 2221 stands with mixed species
mwood
✔ 1042 mixed stands + 220 species changes = 1262 stands with mixed species
stake
✔ 951 mixed stands + 1666 species changes = 2617 stands with mixed species


#### Special case 3: Rows with solution value <1
We still need to address the weird rows from the stakeholder solution data:

In [35]:
# Look again at the stakeholder solution rows whose 'Solution Value' is not 1
# (`weird` was built from sol_stake in an earlier cell, before the merge).
weird

Unnamed: 0,UG,Presc,ug_Sp,ShPer,Solution Value
298,1253,36503,1253_Ec,10,0.494295
1134,1253,36,1253_Ec,5,0.505705
776,1339,197,1339_Ec,0,0.567668
1155,1339,19318,1339_Ec,0,0.432332


In UGs 1253 and 1339, the species indicated by `ug_Sp` stays the same, even though two different prescriptions are selected for each of these UGs. Why?

*Could this be a consequence of the usage of linear relaxation when obtaining the stakeholder solution?*

In [36]:
# Full merged data for the UGs whose solution value differs from 1
stake_fractional = stake[stake['Solution Value'] != 1]
print(len(stake_fractional),'rows')
stake_fractional.sort_values(by=['UG','period'])

23 rows


Unnamed: 0,UG,ug_Sp,species,Presc,area,NasolMaxRES,NasolMaxWood,period,Vremovido(ton),rait0,rait5,rait10,ShPer,Solution Value
1585,1253,1253_Ec,Ec,36503,35.10247365,,,1,881.171595,24.5717,28.082,28.082,10,0.494295
1586,1253,1253_Ec,Pb,36503,35.10247365,,,1,0.0,35.1025,45.6332,38.6127,10,0.494295
6448,1253,1253_Ec,Ec,36,35.10247365,Yes,,1,881.171595,91.2664,126.3689,119.3484,5,0.505705
1587,1253,1253_Ec,Pb,36503,35.10247365,,,2,0.0,49.1435,63.1845,59.6742,10,0.494295
6449,1253,1253_Ec,Ec,36,35.10247365,Yes,,2,2213.035425,101.7972,122.8587,101.7972,5,0.505705
1588,1253,1253_Ec,Pb,36503,35.10247365,,,3,0.0,56.164,73.7152,66.6947,10,0.494295
6450,1253,1253_Ec,Ec,36,35.10247365,Yes,,3,1997.4184875,87.7562,122.8587,101.7972,5,0.505705
1589,1253,1253_Ec,Pb,36503,35.10247365,,,4,518.11254,66.6947,91.2664,87.7562,10,0.494295
6451,1253,1253_Ec,Ec,36,35.10247365,Yes,,4,1710.719055,84.2459,129.8792,105.3074,5,0.505705
1590,1253,1253_Ec,Pb,36503,35.10247365,,,5,0.0,91.2664,101.7972,98.2869,10,0.494295


## Select the correct fire resistance value (rait) based on shrub cleaning periodicity
Due to varying shrub cleaning periodicities (0, 5, or 10), corresponding fire resistance values are available in the `rait0`, `rait5`, and `rait10` columns.

### Step 1: Select the fire resistance value based on shrub cleaning periodicity.

In [37]:
# Select the correct fire resistance value depending on shrub cleaning periodicity

# Define a function to select the correct 'rait' based on 'ShPer' value
def fill_rait(row):
    if row['ShPer'] == 0:
        return row['rait0']
    elif row['ShPer'] == 5:
        return row['rait5']
    elif row['ShPer'] == 10:
        return row['rait10']
    else:
        return None  # or any default value you prefer for other cases

# Run fill_rait over each solution DataFrame to build its 'Rait' column
for solution_df in (mres, mwood, stake):
    print(solution_df.name)

    # Cast 'ShPer' to int before fill_rait compares it against 0, 5 and 10
    solution_df['ShPer'] = solution_df['ShPer'].astype(int)

    # Row-wise selection of the matching rait value
    solution_df['Rait'] = solution_df.apply(fill_rait, axis=1)

    # Show the first row as a quick check of the result
    print(solution_df.head(1))
    print('---------------------------------------------------')

mres
  UG ug_Sp species Presc         area NasolMaxRES NasolMaxWood period  \
0  1  1_Ec      Ec  1601  42.60178231         Yes          nan      1   

  Vremovido(ton)     rait0     rait5   rait10  ShPer      Rait  
0   2999.2719825  110.7646  115.0248  85.2036      5  115.0248  
---------------------------------------------------
mwood
  UG ug_Sp species Presc         area NasolMaxRES NasolMaxWood period  \
0  1  1_Ec      Ec  3502  42.60178231         nan          Yes      1   

       Vremovido(ton)     rait0     rait5   rait10  ShPer      Rait  
0  3968.3560500000003  136.3257  149.1062  119.285      0  136.3257  
---------------------------------------------------
stake
    UG   ug_Sp species Presc        area NasolMaxRES NasolMaxWood period  \
0  101  101_Pb      Pb   503  8.17812088         Yes          Yes      1   

  Vremovido(ton)    rait0    rait5   rait10  ShPer  Solution Value     Rait  
0            0.0  11.4494  12.2672  11.4494     10             1.0  11.4494  
------

### Step 2: Validate if the correct rait value was selected

In [38]:
# Check that 'Rait' was assigned correctly: every row should carry the rait value
# that belongs to its 'ShPer' (0 -> rait0, 5 -> rait5, 10 -> rait10)

for i in [mres, mwood, stake]:

    # Check if value in 'Rait' column matches 'rait0' when 'ShPer' is 0
    is_rait0 = (i['Rait'] == i['rait0']) & (i['ShPer'] == 0)
    print(is_rait0.sum(), "rows have Rait = rait0 based on ShPer==0")

    # Check if value in 'Rait' column matches 'rait5' when 'ShPer' is 5
    is_rait5 = (i['Rait'] == i['rait5']) & (i['ShPer'] == 5)
    print(is_rait5.sum(), "rows have Rait = rait5 based on ShPer==5")

    # Check if value in 'Rait' column matches 'rait10' when 'ShPer' is 10
    is_rait10 = (i['Rait'] == i['rait10']) & (i['ShPer'] == 10)
    print(is_rait10.sum(), "rows have Rait = rait10 based on ShPer==10")

    # Total number of rows whose 'Rait' agrees with their 'ShPer'; computed once and reused
    n_matched = is_rait10.sum() + is_rait5.sum() + is_rait0.sum()

    if n_matched == len(i):
        # '\u2713' is the check mark
        print('\u2713', n_matched, 'out of', len(i), 'were matched correctly')
    else:
        # '\u26A0' is the warning sign; without the backslash the literal text 'u26A0' was printed
        print('\u26A0', n_matched, 'out of', len(i), 'were matched correctly')
    print('--------------------------------------------------')

535 rows have Rait = rait0 based on ShPer==0
7365 rows have Rait = rait5 based on ShPer==5
312 rows have Rait = rait10 based on ShPer==10
✓ 8212 out of 8212 were matched correctly
--------------------------------------------------
3323 rows have Rait = rait0 based on ShPer==0
1926 rows have Rait = rait5 based on ShPer==5
2421 rows have Rait = rait10 based on ShPer==10
✓ 7670 out of 7670 were matched correctly
--------------------------------------------------
4301 rows have Rait = rait0 based on ShPer==0
3440 rows have Rait = rait5 based on ShPer==5
662 rows have Rait = rait10 based on ShPer==10
✓ 8403 out of 8403 were matched correctly
--------------------------------------------------


### Step 3: Subset the data, keeping only the necessary columns

In [39]:
# Columns that are kept in every solution DataFrame
columns_to_keep = ['UG', 'ug_Sp', 'species', 'Presc', 'area', 'period', 'Vremovido(ton)', 'Rait']

# Reduce all three solution DataFrames to those columns
mres, mwood, stake = (solution[columns_to_keep] for solution in (mres, mwood, stake))

# Print the first row of each reduced DataFrame
for label, solution in (("mres", mres), ("mwood", mwood), ("stake", stake)):
    print(f"{label}:\n", solution.head(1))

mres:
   UG ug_Sp species Presc         area period Vremovido(ton)      Rait
0  1  1_Ec      Ec  1601  42.60178231      1   2999.2719825  115.0248
mwood:
   UG ug_Sp species Presc         area period      Vremovido(ton)      Rait
0  1  1_Ec      Ec  3502  42.60178231      1  3968.3560500000003  136.3257
stake:
     UG   ug_Sp species Presc        area period Vremovido(ton)     Rait
0  101  101_Pb      Pb   503  8.17812088      1            0.0  11.4494


## Save cleaned data to csv files

In [40]:
# Write each cleaned solution DataFrame to its own CSV file (index column omitted)
csv_targets = (
    ('MaxRes_Data.csv', mres),
    ('MaxWood_Data.csv', mwood),
    ('Stakeholder_Sol_Data.csv', stake),
)
for csv_name, solution in csv_targets:
    solution.to_csv(csv_name, index=False)
print('saved solution data to csv files')

saved solution data to csv files


## Create lists of UG with a) timber to transport and b) fire resistance too low

For each solution, and each period (1 to 5), we need:
- a list of UG that need timbertruck road access (5m)
- a list of UG that need firetruck road access (10m)


### 5m road access list
Create a list with all stands that have timber to remove (Vremovido>0), so they need 5m road access:

In [41]:
# preparation of next steps (dict of dataframes and names for loop etc)
import os

# Name -> DataFrame lookup, iterated by the processing loops further down
df_dict = dict(mres=mres, mwood=mwood, stake=stake)

# Switch pandas to copy-on-write mode
pd.options.mode.copy_on_write = True

In [42]:
# Quick look at the first row of mres before the access lists are built
print(mres.head(1))

  UG ug_Sp species Presc         area period Vremovido(ton)      Rait
0  1  1_Ec      Ec  1601  42.60178231      1   2999.2719825  115.0248


In [43]:
# Create lists of UG that need timber transport access and save them to file

# Output folder; created once up front instead of once per (DataFrame, period)
subfolder = 'Processed_Data'
os.makedirs(subfolder, exist_ok=True)

# One summary record per (DataFrame, period) combination
process_info_list = []

# Iterate through the dictionary and process each DataFrame
for name, df in df_dict.items():

    # Print info of initial df
    print(f"Initial shape of {name}:", df.shape)

    # Convert 'Vremovido(ton)' column to float and 'period' to int
    df['Vremovido(ton)'] = df['Vremovido(ton)'].astype(float)
    df['period'] = df['period'].astype(int)

    # Rows with timber to remove; does not depend on the period, so build it once
    has_timber = df['Vremovido(ton)'] > 0

    # Process data for each period from 1 to 5
    for period in range(1, 6):
        # Rows of the current period with timber to remove;
        # 'Vremovido(ton)' is kept next to 'UG' for validation
        timberaccess = df.loc[has_timber & (df['period'] == period), ['UG', 'Vremovido(ton)']]

        # Determine the save file path
        tfilename = f'{name}_period{period}_timber.csv'
        tfile_path = os.path.join(subfolder, tfilename)

        # Save the filtered DataFrame to a CSV file if not empty
        if not timberaccess.empty:
            timberaccess.to_csv(tfile_path, index=False)
            saved_file = tfile_path
        else:
            saved_file = 'No data to save'

        # Record what was produced for this DataFrame/period
        process_info_list.append({
            'DataFrame': name,
            'Period': period,
            'Filtered_Shape': timberaccess.shape,
            'Saved_File': saved_file
        })

# Convert the info list to a DataFrame
process_info = pd.DataFrame(process_info_list)

# Print the collected process information
print(process_info)

# First row of the last filtered DataFrame (last entry of df_dict, period 5) as a spot check
print(timberaccess.head(1))

Initial shape of mres: (8212, 8)
Initial shape of mwood: (7670, 8)
Initial shape of stake: (8403, 8)
   DataFrame  Period Filtered_Shape                               Saved_File
0       mres       1      (1067, 2)   Processed_Data\mres_period1_timber.csv
1       mres       2       (590, 2)   Processed_Data\mres_period2_timber.csv
2       mres       3       (766, 2)   Processed_Data\mres_period3_timber.csv
3       mres       4      (1276, 2)   Processed_Data\mres_period4_timber.csv
4       mres       5      (1246, 2)   Processed_Data\mres_period5_timber.csv
5      mwood       1      (1073, 2)  Processed_Data\mwood_period1_timber.csv
6      mwood       2       (986, 2)  Processed_Data\mwood_period2_timber.csv
7      mwood       3      (1114, 2)  Processed_Data\mwood_period3_timber.csv
8      mwood       4      (1306, 2)  Processed_Data\mwood_period4_timber.csv
9      mwood       5      (1304, 2)  Processed_Data\mwood_period5_timber.csv
10     stake       1      (1061, 2)  Processed_Data\

### 10m road access list
Select the stands (UG) whose fire resistance (`Rait`) is considered below the critical minimum (here: below the 5% quantile of the solution's `Rait` values):

In [44]:
# Create lists of UG that need fire transport access and save them to file

# Share of the lowest 'Rait' values that counts as critical
quant = 0.05

# Output folder; created once up front instead of once per (DataFrame, period)
subfolder = 'Processed_Data'
os.makedirs(subfolder, exist_ok=True)

# List to collect process information
process_info_list = []

# Iterate through the dictionary and process each DataFrame
for name, df in df_dict.items():

    # Print info of initial df
    print(f"Initial shape of {name}:", df.shape)

    # Convert 'Rait' column to float and 'period' to int
    df['Rait'] = df['Rait'].astype(float)
    df['period'] = df['period'].astype(int)

    # Critical threshold: the `quant` quantile of 'Rait' over ALL periods of this
    # DataFrame (not per period). It is the same for every period, so compute it once.
    rait_threshold = df['Rait'].quantile(quant)
    below_threshold = df['Rait'] < rait_threshold

    # Process data for each period from 1 to 5
    for period in range(1, 6):
        # Stands of the current period whose 'Rait' lies below the threshold;
        # 'Rait' is kept next to 'UG' for validation
        low_rait = df.loc[below_threshold & (df['period'] == period), ['UG', 'Rait']]

        # Determine the save file path
        filename = f'{name}_period{period}_low_rait.csv'
        file_path = os.path.join(subfolder, filename)

        # Save the filtered DataFrame to a CSV file if not empty
        if not low_rait.empty:
            low_rait.to_csv(file_path, index=False)
            saved_file = file_path
        else:
            saved_file = 'No data to save'

        # Record what was produced for this DataFrame/period
        process_info_list.append({
            'DataFrame': name,
            'Period': period,
            'Filtered_Shape': low_rait.shape,
            'Saved_File': saved_file
        })

# Convert the info list to a DataFrame
process_info = pd.DataFrame(process_info_list)

# Print the collected process information
print(process_info)

# First row of the last filtered DataFrame (last entry of df_dict, period 5) for validation
print(low_rait.head(1))

Initial shape of mres: (8212, 8)
Initial shape of mwood: (7670, 8)
Initial shape of stake: (8403, 8)
   DataFrame  Period Filtered_Shape                                 Saved_File
0       mres       1       (126, 2)   Processed_Data\mres_period1_low_rait.csv
1       mres       2        (99, 2)   Processed_Data\mres_period2_low_rait.csv
2       mres       3        (73, 2)   Processed_Data\mres_period3_low_rait.csv
3       mres       4        (64, 2)   Processed_Data\mres_period4_low_rait.csv
4       mres       5        (49, 2)   Processed_Data\mres_period5_low_rait.csv
5      mwood       1        (91, 2)  Processed_Data\mwood_period1_low_rait.csv
6      mwood       2        (71, 2)  Processed_Data\mwood_period2_low_rait.csv
7      mwood       3        (76, 2)  Processed_Data\mwood_period3_low_rait.csv
8      mwood       4        (69, 2)  Processed_Data\mwood_period4_low_rait.csv
9      mwood       5        (76, 2)  Processed_Data\mwood_period5_low_rait.csv
10     stake       1       (16