In [49]:
import geopandas as gpd

import pandas as pd
import json

In [50]:
df = pd.read_excel("amy cook - pusstats data request - 26.02.2024.xlsx")

In [52]:
df.region.unique()

array(['Eastern', 'Midlands & Western', 'Northern', 'Scotland',
       'South Eastern', 'South Western', 'Wales', 'Northern Ireland'],
      dtype=object)

In [53]:
gdf = gpd.read_file("https://raw.githubusercontent.com/EconomicsObservatory/courses/main/datasets/advanced/nuts1.json")

# ['Eastern', 'Midlands & Western', 'Northern', 'Scotland',
#        'South Eastern', 'South Western', 'Wales', 'Northern Ireland']

#  ['SOUTH WEST (ENGLAND)', 'WALES', 'NORTHERN IRELAND',
#        'NORTH EAST (ENGLAND)', 'NORTH WEST (ENGLAND)',
#        'YORKSHIRE AND THE HUMBER', 'EAST MIDLANDS (ENGLAND)',
#        'WEST MIDLANDS (ENGLAND)', 'EAST OF ENGLAND', 'LONDON',
#        'SOUTH EAST (ENGLAND)', 'SCOTLAND']

to_merge = {
    'SOUTH WEST (ENGLAND)': 'South Western',
    'WALES': 'Wales',
    'NORTHERN IRELAND': 'Northern Ireland',
    'NORTH EAST (ENGLAND)': 'Northern',
    'NORTH WEST (ENGLAND)': 'Northern',
    'YORKSHIRE AND THE HUMBER': 'Northern',
    'EAST MIDLANDS (ENGLAND)': 'Midlands & Western',
    'WEST MIDLANDS (ENGLAND)': 'Midlands & Western',
    'EAST OF ENGLAND': 'Eastern',
    'LONDON': 'South Eastern',
    'SOUTH EAST (ENGLAND)': 'South Eastern',
    'SCOTLAND': 'Scotland'
}

gdf['region'] = gdf['NAME_LATN'].map(to_merge)

gdf = gdf.dissolve(by='region')

gdf.to_file("pusstats_regions.geojson", driver='GeoJSON')

# The Data

In [59]:
df = pd.read_excel("amy cook - pusstats data request - 26.02.2024.xlsx")
df = df.rename(columns={ # clean up column names
    'suryear': 'year',
    'm_app': 'application_method',
    'cropgroup': 'crop_group',
    'chemgroup': 'chemical_group',
    'area treated (ha)': 'area_treated',
    'weight applied (kg)': 'weight_applied'
})

# the table is prohibitavely large, so I'll sum all the vals in the active col to a single value
df = df.groupby(by=["year", 'survey_name', 'region', 'application_method', 'crop_group', 'chemical_group']).agg({'area_treated': 'sum', 'weight_applied': 'sum'}).reset_index()

# add an 'All' category to survey_name, application_method, crop_group, and chemical_group
for col in ['survey_name', 'application_method', 'crop_group', 'chemical_group']:
    agg_df = df.groupby(by=[c for c in df.columns if c not in [col, 'area_treated', 'weight_applied']]).agg({'area_treated': 'sum', 'weight_applied': 'sum'}).reset_index()
    agg_df[col] = 'All'
    df = pd.concat([df, agg_df])


#  Add a most-recent option to the year column
df = df.sort_values(by='year').reset_index(drop=True)
most_recent_data = df.drop_duplicates(subset=[c for c in df.columns if c != 'year'], keep='last')
most_recent_data['year'] = 'Most Recent'

df = pd.concat([df, most_recent_data])




df.to_csv("chemical_group_level_data.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  most_recent_data['year'] = 'Most Recent'


In [58]:
most_recent_data

Unnamed: 0,year,survey_name,region,application_method,crop_group,chemical_group,area_treated,weight_applied
0,1990,Arable crops,Eastern,OT,Cereals,Fungicides,4608478.25,1645427.81
1,1990,All,Wales,ST,Cereals,All,25446.01,979.38
2,1990,All,Wales,ST,All,All,27648.01,1054.09
3,1990,All,Wales,OT,Strawberries,All,1222.82,1398.06
4,1990,All,Wales,OT,Pulses,All,1422.03,1110.51
...,...,...,...,...,...,...,...,...
51889,2022,Soft fruit,Northern Ireland,All,Strawberries,Herbicides,0.25,0.57
51890,2022,Soft fruit,Northern Ireland,All,Strawberries,Growth regulators,0.25,0.03
51891,2022,Soft fruit,Northern Ireland,All,Strawberries,Fungicides,49.95,17.09
51892,2022,Soft fruit,Scotland,All,Strawberries,Fungicides,19791.46,9974.63


In [55]:
len(df)

51894

In [43]:
most_recent_data

Unnamed: 0,year,survey_name,region,application_method,crop_group,chemical_group,area_treated,weight_applied
0,1990,Arable crops,Eastern,OT,Cereals,Fungicides,4608478.25,1645427.81
1,1990,All,Wales,ST,Cereals,All,25446.01,979.38
2,1990,All,Wales,ST,All,All,27648.01,1054.09
3,1990,All,Wales,OT,Strawberries,All,1222.82,1398.06
4,1990,All,Wales,OT,Pulses,All,1422.03,1110.51
...,...,...,...,...,...,...,...,...
51889,2022,Soft fruit,Northern Ireland,All,Strawberries,Herbicides,0.25,0.57
51890,2022,Soft fruit,Northern Ireland,All,Strawberries,Growth regulators,0.25,0.03
51891,2022,Soft fruit,Northern Ireland,All,Strawberries,Fungicides,49.95,17.09
51892,2022,Soft fruit,Scotland,All,Strawberries,Fungicides,19791.46,9974.63


In [33]:
# {
#     "name": "projection",
#     "value": "equalEarth",
#     "bind": {
#     "input": "select",
#     "options": [
#         "albers",
#         "albersUsa",
#         "azimuthalEqualArea",
#         "azimuthalEquidistant",
#         "conicConformal",
#         "conicEqualArea",
#         "conicEquidistant",
#         "equalEarth",
#         "equirectangular",
#         "gnomonic",
#         "mercator",
#         "naturalEarth1",
#         "orthographic",
#         "stereographic",
#         "transverseMercator"
#     ]
#     }
# }

params = []
for col_name in ["year", 'survey_name', 'region', 'application_method', 'crop_group', 'chemical_group']:
    options = df[col_name].unique().tolist()
    
    params.append({
        "name": col_name,
        "value": options[-1],
        "bind": {
            "input": "select",
            "options": options
        }
    })

with open("params.json", 'w') as f:
    f.write(json.dumps(params, indent=4))


In [36]:
df.survey_name.unique()

array(['Arable crops', 'Soft fruit', 'Edible protected crops',
       'Outdoor vegetables', 'Orchards', 'Grassland & fodder crops',
       'All'], dtype=object)

In [39]:
df.query("survey_name == 'Grassland & fodder crops' and application_method == 'All' and crop_group == 'All' and chemical_group == 'All'")

Unnamed: 0,year,survey_name,region,application_method,crop_group,chemical_group,area_treated,weight_applied
943,1993,Grassland & fodder crops,Eastern,All,All,All,171552.06,156909.03
955,1993,Grassland & fodder crops,Midlands & Western,All,All,All,460815.13,406198.7
967,1993,Grassland & fodder crops,Northern,All,All,All,319582.84,239495.17
979,1993,Grassland & fodder crops,Scotland,All,All,All,446827.43,273804.71
988,1993,Grassland & fodder crops,South Eastern,All,All,All,298319.93,190591.32
1000,1993,Grassland & fodder crops,South Western,All,All,All,424886.42,343684.3
1012,1993,Grassland & fodder crops,Wales,All,All,All,141776.37,133665.34
2082,1997,Grassland & fodder crops,Eastern,All,All,All,177053.3,168883.58
2093,1997,Grassland & fodder crops,Midlands & Western,All,All,All,546395.5,411368.41
2105,1997,Grassland & fodder crops,Northern,All,All,All,394907.3,406820.26


In [41]:
df.year.unique()

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [35]:
cols = ["year", 'survey_name', 'region', 'application_method', 'crop_group', 'chemical_group']
filters = [f"datum.{col} == {col}" for col in cols]
filters = " && ".join(filters)
filters


'datum.year == year && datum.survey_name == survey_name && datum.region == region && datum.application_method == application_method && datum.crop_group == crop_group && datum.chemical_group == chemical_group'

In [18]:
df

Unnamed: 0,year,survey_name,application_method,crop_group,region,chemical_group,active,area_treated,weight_applied
0,1990,Arable crops,OT,Cereals,Eastern,Fungicides,Maneb,293898.83,317627.48
1,1990,Arable crops,OT,Cereals,Eastern,Fungicides,Zineb,6876.60,1374.77
2,1990,Arable crops,OT,Cereals,Eastern,Fungicides,Ferbam,6876.60,1374.77
3,1990,Arable crops,OT,Cereals,Eastern,Fungicides,Sulphur,37904.84,242389.80
4,1990,Arable crops,OT,Cereals,Eastern,Fungicides,Captafol,5352.02,4347.24
...,...,...,...,...,...,...,...,...,...
65569,2022,Soft fruit,OT,Strawberries,South Western,Insecticides,Pirimicarb,1.16,0.32
65570,2022,Soft fruit,OT,Strawberries,South Western,Insecticides,Clofentezine,49.41,9.88
65571,2022,Soft fruit,OT,Strawberries,South Western,Insecticides,Spirotetramat,96.50,9.65
65572,2022,Soft fruit,OT,Strawberries,South Western,Insecticides,Cyantraniliprole,27.02,2.03


In [16]:
len(df.active.value_counts())

524

In [None]:
df.crop_group('')

In [13]:
df.survey_name.value_counts()

survey_name
Arable crops                28189
Outdoor vegetables          15836
Soft fruit                   8814
Grassland & fodder crops     5833
Edible protected crops       3722
Orchards                     3180
Name: count, dtype: int64

In [12]:
df.query("active == 'Zineb'")

Unnamed: 0,year,survey_name,application_method,crop_group,region,chemical_group,active,area_treated,weight_applied
1,1990,Arable crops,OT,Cereals,Eastern,Fungicides,Zineb,6876.60,1374.77
232,1990,Arable crops,OT,Cereals,Scotland,Fungicides,Zineb,1854.00,419.10
612,1990,Arable crops,OT,Oilseeds,Scotland,Fungicides,Zineb,492.00,49.20
724,1990,Arable crops,OT,Potatoes,Eastern,Fungicides,Zineb,2728.13,486.27
807,1990,Arable crops,OT,Potatoes,Northern,Fungicides,Zineb,588.48,98.90
...,...,...,...,...,...,...,...,...,...
58160,1994,Soft fruit,OT,Strawberries,South Eastern,Fungicides,Zineb,6.44,30.34
58536,1998,Soft fruit,OT,Other soft fruit,South Eastern,Fungicides,Zineb,9.02,12.63
58606,1998,Soft fruit,OT,Other soft fruit,South Western,Fungicides,Zineb,318.76,497.68
59317,2001,Soft fruit,OT,Other soft fruit,South Western,Fungicides,Zineb,9.55,7.35


In [10]:
df.active.unique()

array(['Maneb', 'Zineb', 'Ferbam', 'Sulphur', 'Captafol', 'Mancozeb',
       'Nuarimol', 'Iprodione', 'Triforine', 'Flutriafol', 'Prochloraz',
       'Pyrazophos', 'Tridemorph', 'Carbendazim', 'Fenpropidin',
       'Flusilazole', 'Triadimefon', 'Triadimenol', 'Fenpropimorph',
       'Propiconazole', 'Chlorothalonil', 'Thiophanate-methyl',
       'Mepiquat', 'Chlormequat', 'Choline chloride',
       '2-chloroethylphosphonic acid', 'MCPA', '2,4-D', '2,4-DB',
       'Diquat', 'Bifenox', 'Dicamba', 'Ioxynil', 'Linuron', 'Isoxaben',
       'Mecoprop', 'Paraquat', 'Benazolin', 'Cyanazine', 'Terbutryn',
       'Bromoxynil', 'Clopyralid', 'Fluroxypyr', 'Glyphosate',
       'Mecoprop-P', 'Tri-allate', 'Trietazine', 'Dichlorprop',
       'Difenzoquat', 'Isoproturon', 'Trifluralin', 'Diflufenican',
       'Chlorotoluron', 'Chlorsulfuron', 'Pendimethalin',
       'Diclofop-methyl', 'Flamprop-methyl', 'Fenoxaprop-ethyl',
       'Benzoylprop-ethyl', 'Methabenzthiazuron', 'Metsulfuron-methyl',
      

In [9]:
df.chemical_group.value_counts()

chemical_group
Fungicides           24756
Herbicides           23597
Insecticides         12640
Molluscicides         2040
Growth regulators     1665
Others                 876
Name: count, dtype: int64

In [8]:
df.crop_group.value_counts()

crop_group
Cereals                     11853
Oilseeds                     5864
Potatoes                     4511
Other soft fruit             4436
Strawberries                 4378
Pulses                       3941
Protected edible crops       3722
Orchards                     3180
Other fodder crops           3037
Brassicas                    2825
Onions & leeks               2494
Other outdoor vegetables     2353
Carrots, parsnips etc.       2333
Peas & beans                 1870
Sugar beet                   1787
Grassland                    1539
Lettuce, endive etc.         1508
Other root vegetables        1202
Maize                         994
Root crucifers                705
Cucurbits                     330
Fodder beet                   263
Other arable crops            233
Sweetcorn                     216
Name: count, dtype: int64

In [14]:
gdf

Unnamed: 0,id,LEVL_CODE,NUTS_ID,CNTR_CODE,NAME_LATN,NUTS_NAME,MOUNT_TYPE,URBN_TYPE,COAST_TYPE,FID,geometry
0,UKK,1,UKK,UK,SOUTH WEST (ENGLAND),SOUTH WEST (ENGLAND),0,0,0,UKK,"MULTIPOLYGON (((-1.66573 51.98749, -1.63700 51..."
1,UKL,1,UKL,UK,WALES,WALES,0,0,0,UKL,"MULTIPOLYGON (((-3.36339 53.35203, -3.16975 53..."
2,UKN,1,UKN,UK,NORTHERN IRELAND,NORTHERN IRELAND,0,0,0,UKN,"MULTIPOLYGON (((-5.97653 55.05660, -5.97080 54..."
3,UKC,1,UKC,UK,NORTH EAST (ENGLAND),NORTH EAST (ENGLAND),0,0,0,UKC,"POLYGON ((-1.34737 54.86069, -1.24223 54.72259..."
4,UKD,1,UKD,UK,NORTH WEST (ENGLAND),NORTH WEST (ENGLAND),0,0,0,UKD,"POLYGON ((-2.68975 55.18906, -2.65030 55.13921..."
5,UKE,1,UKE,UK,YORKSHIRE AND THE HUMBER,YORKSHIRE AND THE HUMBER,0,0,0,UKE,"POLYGON ((-0.79091 54.55948, -0.54857 54.45762..."
6,UKF,1,UKF,UK,EAST MIDLANDS (ENGLAND),EAST MIDLANDS (ENGLAND),0,0,0,UKF,"POLYGON ((0.01738 53.52537, 0.09692 53.49770, ..."
7,UKG,1,UKG,UK,WEST MIDLANDS (ENGLAND),WEST MIDLANDS (ENGLAND),0,0,0,UKG,"POLYGON ((-1.98738 53.21361, -1.86643 53.18290..."
8,UKH,1,UKH,UK,EAST OF ENGLAND,EAST OF ENGLAND,0,0,0,UKH,"POLYGON ((1.67548 52.74269, 1.73639 52.64175, ..."
9,UKI,1,UKI,UK,LONDON,LONDON,0,0,0,UKI,"POLYGON ((-0.24042 51.49011, -0.22283 51.47182..."


In [9]:
gdf.NAME_LATIN

AttributeError: 'GeoDataFrame' object has no attribute 'NAME_LATIN'

In [12]:
gdf.NAME_LATN.unique()

array(['SOUTH WEST (ENGLAND)', 'WALES', 'NORTHERN IRELAND',
       'NORTH EAST (ENGLAND)', 'NORTH WEST (ENGLAND)',
       'YORKSHIRE AND THE HUMBER', 'EAST MIDLANDS (ENGLAND)',
       'WEST MIDLANDS (ENGLAND)', 'EAST OF ENGLAND', 'LONDON',
       'SOUTH EAST (ENGLAND)', 'SCOTLAND'], dtype=object)

In [None]:
df = pd.read_csv('data/geo_data.geojson')