# Data prep


In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd

In [2]:
#Read in old AWI
gdf_old = gpd.read_file("O:/NaturalEngland/Ancient Woodland Inventory Update/Analysis/Old_AWI_Wilts/Old_AWI_Wilts.shp")
# Updated AWI
gdf_new = gpd.read_file("O:/NaturalEngland/Ancient Woodland Inventory Update/Analysis/New_AWI_Wilts/AWI_sources_updated_5_resolved.shp")
gdf_new_reduced = gdf_new[['NAME_OS', 'NAME_EP1', 'P3_UID', 'P3_IGNORE', 'C21_AP', 'EPOCH_2',
       'EPOCH_1', 'C20_AP', 'C20_AP_REF', 'TITHE_REF', 'C19_EM', 'C19_EM_REF',
       'OSD', 'OSD_REF', 'CM_1773', 'C18_EM', 'C18_EM_REF', 'C17_EM',
       'C17_EM_REF', 'EM', 'EM_REF', 'OTHER', 'OTHER_REF', 'NAME_TITHE',
       'NAME_OSD', 'NAME_EM', 'TITHE_REF_', 'TITHE_LAND', 'TITHE_PLOT',
       'TITHE_PL_1', 'COMMENTS', 'url', 'OSNAMES_UR', 'CHECK_NAME',
       'TO_CHECK_N', 'NOTES', 'LISTED_BUI', 'LISTED_PAR', 'P2_CLASS',
       'P3_CLASS', 'P3_Status', 'TITHE', 'GlobalID_1', 'EARLIEST_E', 'RESERVE',
       'FORESTRY', 'SURVEY_PRI', 'ACTION', 'SITE_ID', 'AreaHa', 'NAME',
       'Shape_Leng', 'OBJECTID_1', 'HE_REF', 'geometry']]
#Find intersections where old AWI is in same place as new AWI
gdf_new_reduced['intersects_old_AWI'] = gdf_new_reduced.geometry.apply(lambda x: gdf_old.geometry.intersects(x).any())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [3]:
#df = pd.read_excel('O:/NaturalEngland/Ancient Woodland Inventory Update/Analysis/AWI_data_analysis_assessment.xlsm', sheet_name = 'AWI_data_RAW')
df = gdf_new_reduced.copy()

In [4]:
# Defining functions
def check_if_assessed(FIELD):
    if FIELD == 3 or FIELD ==4 or FIELD ==5:
        return 0
    else:
        return 1

def check_if_present(FIELD):
    if FIELD == 1 or FIELD ==2 or FIELD ==6:
        return 1
    else:
        return 0
    
def check_if_zero(FIELD):
    if FIELD == 0:
        return 1
    else:
        return 0
    
def check_if_name(FIELD):
    if FIELD.strip() == '' or FIELD == 'None':
        return 0
    else:
        return 1


In [5]:
# Columns to check if if parcel has been marked as woodland
columns_to_assess = ['TITHE','C19_EM', 'OSD', 'CM_1773', 'C18_EM', 'C17_EM', 'EM','OTHER' ]

#Loop through each parcel and create new column showing if woodland present in source, and if source is zero (not woodland)
for i in columns_to_assess:
    df[f"present_{i}"] = df[i].apply(check_if_present)
    df[f"zero_{i}"] = df[i].apply(check_if_zero)
    df[f"assessed_{i}"] = df[i].apply(check_if_assessed)

df["HE_REF"] = df['HE_REF'].astype(str)
df["assessed_HE_REF"] = df['HE_REF'].apply(check_if_name)

In [6]:
# Check which parcels are named

df['NAME_TITHE'] = df['NAME_TITHE'].astype(str)

names_to_assess = ['NAME_TITHE','NAME_EP1', 'NAME_OSD', 'NAME_EM', 'HE_REF']
# Add column showing which parcels are named in source
for i in names_to_assess:
    df[i] = df[i].astype(str)
    df[f"present_{i}"] = df[i].apply(check_if_name)

# Combine to get total named sources for parcel    
df['name_present'] = df['present_NAME_EM']+df['present_NAME_OSD']+df['present_NAME_EP1'] +df['present_NAME_TITHE']

In [7]:
# Combine columns showing woodland present to see how many sources show woodland present
df['present_combined'] = df['present_TITHE']+df['present_C19_EM'] +df['present_OSD']+df['present_CM_1773'] +df['present_C18_EM']+df['present_C17_EM'] +df['present_EM']+df['present_OTHER'] +df['present_HE_REF']
df['present_primary'] = df['present_TITHE'] + df['present_OSD'] +df['present_HE_REF']+df['present_CM_1773']
df['present_secondary'] =df['present_C19_EM'] +df['present_C18_EM'] +df['present_EM']+df['present_OTHER'] + df['present_C17_EM']

# Combine columns showing if zero present to show how many sources show no woodland present
df['zero_combined'] = df['zero_TITHE']+df['zero_C19_EM'] +df['zero_OSD']+df['zero_CM_1773'] +df['zero_C18_EM']+df['zero_C17_EM'] +df['zero_EM']+df['zero_OTHER'] 
df['zero_primary'] = df['zero_TITHE']+df['zero_OSD']+df['zero_CM_1773'] 
df['zero_secondary'] =  df['zero_C19_EM'] +df['zero_C18_EM']+df['zero_C17_EM'] +df['zero_EM']+df['zero_OTHER']

df['assessed_primary'] = df['assessed_TITHE'] + df['assessed_OSD'] +df['assessed_HE_REF']+df['assessed_CM_1773']



# Decision tree


In [8]:
# Filter for primary source present only
df_primary = df[df.present_primary >=1]
df_primary_one = df[df.present_primary ==1]
df_primary_two = df[df.present_primary >=2]
df_no_primary = df[df.present_primary ==0]

## A - Ancient woodland
## Contains primary evidence and no zero's present
df_primary_no_zero = df_primary[df_primary.zero_combined ==0]
df_primary_no_zero['Category'] = 'Ancient woodland'
df_primary_no_zero['subCategory'] = 'A'
df_one_primary_no_zero = df_primary_one[df_primary_one.zero_combined ==0]

#Filter for primary source present, one zero present
df_primary_zero = df_primary[df_primary.zero_combined >=1]
df_primary_one_zero = df_primary[df_primary.zero_combined ==1]
df_one_primary_zero = df_primary_one[df_primary_one.zero_combined >=1]
df_one_primary_one_zero = df_primary_one[df_primary_one.zero_combined ==1]

## A - Not ancient woodland 
## Has primary evidence but contains more than one zero
df_primary_plural_zero = df_primary[df_primary.zero_combined >1]
df_primary_plural_zero['Category'] = 'Not ancient woodland'
df_primary_plural_zero['subCategory'] = 'A'
df_one_primary_plural_zero = df_primary_one[df_primary_one.zero_combined >1]

## B - Ancient woodland
## Has primary evidence, one zero present, but is a named woodland 
df_primary_one_zero_named = df_primary_one_zero[df_primary_one_zero.name_present >=1]
df_primary_one_zero_named['Category'] = 'Ancient woodland'
df_primary_one_zero_named['subCategory'] = 'B'
df_one_primary_one_zero_named = df_one_primary_one_zero[df_one_primary_one_zero.name_present >=1]

## A- Maybe ancient woodland
## Has primary evidence, one zero present and is not a named woodland. 
### Further research needed, what source is zero?
df_primary_one_zero_not_named = df_primary_one_zero[df_primary_one_zero.name_present==0]
df_primary_one_zero_not_named['Category'] = 'Maybe ancient woodland'
df_primary_one_zero_not_named['subCategory'] = 'A'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = 

In [9]:
# Filter for no primary evidence present, but secondary evidence present
df_secondary = df[(df.present_primary ==0)&(df.present_secondary >=1)]
# Filter for no evidence present
df_no_evidence = df[(df.present_primary ==0)&(df.present_secondary ==0)]

## D - Maybe ancient woodland
## No evidence, but contains no zeros, so may just not have been covered by maps
# Contains lots of 3/4's (no map coverage), requires further investigation to see if any maps preset
df_no_evidence_no_zero = df_no_evidence[df_no_evidence.zero_combined ==0]
df_no_evidence_no_zero['Category'] = 'Maybe ancient woodland'
df_no_evidence_no_zero['subCategory'] = 'D'

## C - Not ancient woodland
## No evidence, and contains at least one zero, so assumed not woodland. 
## 1637, ~600 1 zero, ~1000 more than one zero
df_no_evidence_zero = df_no_evidence[df_no_evidence.zero_combined >= 1]
df_no_evidence_zero['Category'] = 'Not ancient woodland'
df_no_evidence_zero['subCategory'] = 'C'

df_no_evidence_one_zero = df_no_evidence[df_no_evidence.zero_combined == 1]
df_no_evidence_plural_zero = df_no_evidence[df_no_evidence.zero_combined > 1]



In [10]:
#Filter for secondary evidence present, and no zeros present
df_secondary_no_zero = df_secondary[df_secondary.zero_combined ==0]

## C - Ancient woodland 
## No primary evidence, but contains secondary evidence, no zero's present and is named. 
df_secondary_no_zero_named = df_secondary_no_zero[df_secondary_no_zero.name_present >=1]
df_secondary_no_zero_named['Category'] = 'Ancient woodland'
df_secondary_no_zero_named['subCategory'] = 'C'

## B- Maybe ancient woodland
## No primary evidence, but contains secondary evidence, no zero's present, but is not named
## Further research needed; why is not present on primary evidence? 
df_secondary_no_zero_not_named  = df_secondary_no_zero[df_secondary_no_zero.name_present ==0]
df_secondary_no_zero_not_named['Category'] = 'Maybe ancient woodland'
df_secondary_no_zero_not_named['subCategory'] = 'B'

#Filter for secondary evidence present, and one zero present
df_secondary_one_zero = df_secondary[df_secondary.zero_combined ==1]

## C - Maybe ancient woodland
## No primary evidence, secondary evidence, but one zero present, and named
## Further research; why no primary evidence? and what source is zero? What source is name? 
df_secondary_one_zero_named  = df_secondary_one_zero[df_secondary_one_zero.name_present >=1]
df_secondary_one_zero_named['Category'] = 'Maybe ancient woodland'
df_secondary_one_zero_named['subCategory'] = 'C'

## E - Maybe ancient woodland
## No primary evidence, secondary evidence, but one zero present, not named
## Probably not ancient woodland, but check primary evidence, and zero. 
df_secondary_one_zero_not_named  = df_secondary_one_zero[df_secondary_one_zero.name_present ==0]
df_secondary_one_zero_not_named['Category'] = 'Maybe ancient woodland'
df_secondary_one_zero_not_named['subCategory'] = 'E'

## B - Not ancient woodland
## Multiple zeros present, and no primary evidence so probably not ancient woodland. 
df_secondary_plural_zero = df_secondary[df_secondary.zero_combined >1]
df_secondary_plural_zero['Category'] = 'Not ancient woodland'
df_secondary_plural_zero['subCategory'] = 'B'

df_secondary_zero = df_secondary[df_secondary.zero_combined >=1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [40]:

df_combined = pd.concat([df_primary_no_zero,
                         df_primary_zero,
df_no_evidence_no_zero,
df_no_evidence_zero,
df_secondary_no_zero,
df_secondary_one_zero,
df_secondary_plural_zero,])

In [41]:
df_combined.to_file("AWI_sources_updated_5_resolved_test.shp")


Column names longer than 10 characters will be truncated when saved to ESRI Shapefile.


Normalized/laundered field name: 'intersects_old_AWI' to 'intersects'


Normalized/laundered field name: 'present_TITHE' to 'present_TI'


Normalized/laundered field name: 'assessed_TITHE' to 'assessed_T'


Normalized/laundered field name: 'present_C19_EM' to 'present_C1'


Normalized/laundered field name: 'zero_C19_EM' to 'zero_C19_E'


Normalized/laundered field name: 'assessed_C19_EM' to 'assessed_C'


Normalized/laundered field name: 'present_OSD' to 'present_OS'


Normalized/laundered field name: 'assessed_OSD' to 'assessed_O'


Normalized/laundered field name: 'present_CM_1773' to 'present_CM'


Normalized/laundered field name: 'zero_CM_1773' to 'zero_CM_17'


Normalized/laundered field name: 'assessed_CM_1773' to 'assessed_1'


Normalized/laundered field name: 'present_C18_EM' to 'present__1'


Normalized/laundered field name: 'zero_C18_EM' to 'zero_C18_E'


Normalized/laundered field name:

## Random analysis


In [31]:
df_ancient_only = df_combined[df_combined.Category == 'Ancient woodland']
df_maybe_ancient = df_combined[df_combined.Category == 'Maybe ancient woodland']
df_not_ancient = df_combined[df_combined.Category == 'Not ancient woodland']

In [51]:
df_not_ancient_grouped = df_not_ancient.groupby('P3_Status').count().reset_index()
df_ancient_grouped = df_ancient_only.groupby('P3_Status').count().reset_index()

In [60]:
df_combined['area'] = df_combined.area/10000


In [88]:
df_grouped = df_combined.groupby(['Category', 'subCategory','intersects_old_AWI'])['area'].sum().reset_index()


In [67]:
df_intersects_not_ancient = df_combined[(df_combined['intersects_old_AWI']==True)&(df_combined['Category']=='Not ancient woodland')]
df_intersects_not_ancient.area.sum()/10000

np.float64(670.9962464495912)

In [90]:
df_grouped_not_ancient  = df_grouped[df_grouped.Category =='Not ancient woodland']

In [34]:
df_check = df_primary.groupby(['present_primary'])['present_TITHE'].sum().reset_index()
df_check

Unnamed: 0,present_primary,present_TITHE
0,1,1179
1,2,1049
2,3,1209
3,4,304


In [31]:
df_check = df_primary.groupby(['present_primary'])['present_OSD'].sum().reset_index()
df_check

Unnamed: 0,present_primary,present_OSD
0,1,539
1,2,1215
2,3,1486
3,4,304


In [32]:
df_check = df_primary.groupby(['present_primary'])['present_CM_1773'].sum().reset_index()
df_check

Unnamed: 0,present_primary,present_CM_1773
0,1,232
1,2,665
2,3,1285
3,4,304


In [39]:
df_check = df_primary.groupby(['present_primary'])['present_HE_REF'].sum().reset_index()
df_check

Unnamed: 0,present_primary,present_HE_REF
0,1,154
1,2,403
2,3,673
3,4,304


In [80]:
df_no_primary.COMMENTS.unique()

array([None, 'Verge of road', 'too small',
       'Lidar + EP1: Spoil heaps around tunnel shafts',
       'Tithe: quarries, confirmed by LiDAR',
       'Tithe: small fields & gardens', 'Tithe: small fields and gardens',
       'Tithe: too narrow to be wp', 'Too small',
       "OSD: shown as farm. EP1: 'Old Farm Clump'",
       'AP & AP_LUFT: trees missing from interior',
       'Assumed old orchard and gardens',
       'C19_EM: could be recent plantation as drawn over arable plot. Extended to the south in pencil - later addition?',
       'OSD: penning',
       'LiDAR: signs of quarrying. Tithe: wooded area < 0.25 ha',
       'Tithe: lots of small fields so not WP',
       'Tithe: pasture area < 0.25 ha', 'Mansion grounds',
       'Old chalk pit', 'Tithe: lots of small fields, also on EM',
       'Tithe: too narrow to be WP',
       "Labelled 'glebe' on EM. Tithe: garden and narrow woodland.",
       'Tithe: pasture part of plot < 0.25 ha. AP_LUFT: garden of house',
       'AP_LUFT: wo

In [None]:
df_grouped = df.groupby(['present_primary']).count().reset_index()
df_combined_small = df_combined[df_combined.area <50000]

In [73]:
fig = px.histogram(df_combined_small, x = 'area', color = 'present_primary', nbins = 60 )
fig.show()

In [None]:
fig = px.bar(df_grouped, x ='assessed_primary', y = 'P3_UID', color = 'ar' )
fig.show()

# Decision tree diagram

In [27]:
df_one_primary_one_zero
df_sankey = {'source':[0,1,2,2,1,5,6,6,8,8, 5, 11,11,0, 14,15,15,17,17,14,21,21],
            'target':[ 1,2,3,4,5,6,7,8,9,10,11,12,13,21,15,16,17,18,19,20,14,22],
            'value':[df_no_primary.shape[0],df_no_evidence.shape[0],df_no_evidence_zero.shape[0],df_no_evidence_no_zero.shape[0],df_secondary.shape[0],df_secondary_zero.shape[0],
                    df_secondary_plural_zero.shape[0],df_secondary_one_zero.shape[0],df_secondary_one_zero_not_named.shape[0],df_secondary_one_zero_named.shape[0],
                    df_secondary_no_zero.shape[0],df_secondary_no_zero_not_named.shape[0],df_secondary_no_zero_named.shape[0],df_primary.shape[0],
                    df_one_primary_zero.shape[0],df_one_primary_plural_zero.shape[0],df_one_primary_one_zero.shape[0],df_primary_one_zero_not_named.shape[0],df_one_primary_one_zero_named.shape[0],
                    df_one_primary_no_zero.shape[0],df_primary_one.shape[0],df_primary_two.shape[0]],
            'colour':['#fb9985','#fb9985','#fb9985','#AFE1AF','#AFE1AF','#fb9985','#fb9985','#AFE1AF','#fb9985','#AFE1AF','#AFE1AF','#fb9985','#AFE1AF','#AFE1AF','#fb9985','#fb9985','#AFE1AF',
                '#fb9985','#AFE1AF','#AFE1AF','#fb9985','#AFE1AF'],
            'name':['primary source present?','secondary source present?','less than one zero present?','Contains zero','No zeros present','less than one zero present?','Only one zero?',
                'Multiple zeros present','Named?','Not named','Named','Named?','Not named','Named','less than one zero present?','Only one zero?','Multiple zeros present','Named?',
                'Not named','Named','No zeros present','Two primary sources present?','Two primary sources present'],
            'node_colour':['#ded6e2','#ded6e2','#ded6e2','#d0311a','#f8a54f','#ded6e2','#ded6e2','#d0311a','#ded6e2','#f8a54f','#f8a54f','#ded6e2','#f8a54f','#2ea411','#ded6e2','#ded6e2','#d0311a','#ded6e2',
                '#f8a54f','#2ea411','#2ea411','#ded6e2','#2ea411']}

In [28]:

fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      thickness = 20,
     # line = dict(color = "black", width = 0.5),
      label =df_sankey['name'],
      color = df_sankey['node_colour']
    ),
    link = dict(
      source = df_sankey['source'], # indices correspond to labels, eg A1, A2, A1, B1, ...
      target = df_sankey['target'],
      value = df_sankey['value'],
      color = df_sankey['colour']
  ))])

fig.update_layout(title_text="AWI Decision Tree", font_size=10)
fig.show()