# Charts Pantanal EDA

## 0. Reading table

In [1]:
from runner import io
pantanal_df = io.load_table("domain", "pantanal")

## 1. Config altair

In [2]:
# Databricks notebook source
# MAGIC %md # Altair setup

# COMMAND ----------

# MAGIC %sh
# MAGIC # Installing altair
# MAGIC /databricks/python/bin/pip install altair altair_saver selenium -q
# MAGIC 
# MAGIC # Installing chromewebdriver for saving altair plots
# MAGIC apt-get update -y
# MAGIC apt-get install -y chromium-browser chromium-chromedriver

# COMMAND ----------

from datetime import datetime, timedelta
import json

import pandas as pd
import numpy as np

import altair as alt
alt.data_transformers.disable_max_rows()
alt.renderers.set_embed_options(scaleFactor=5)

# COMMAND ----------

# Altair theme setup
def tfg_palette(key="tfg_dark_light"):
    color_schemes = {
        'category-6': ['#ec8431', '#829eb1', '#c89d29', '#3580b1', '#adc839', '#ab7fb4'],
        'fire-7': ['#fbf2c7', '#f9e39c', '#f8d36e', '#f4bb6a', '#e68a4f', '#d15a40', '#ab4232'],
        'fireandice-6': ['#e68a4f', '#f4bb6a', '#f9e39c', '#dadfe2', '#a6b7c6', '#849eae'],
        'ice-7': ['#edefee', '#dadfe2', '#c4ccd2', '#a6b7c6', '#849eae', '#607785', '#47525d'],
        "tfg_dark": ['#0C66B5', '#BA1D32', '#0F8E40', '#E56F20', '#862BAF', '#626D69', '#ED5087'],
        "tfg_light": ['#3DB9E2', '#51DB63', '#E26868', '#FFB270', '#B37CEA', '#A8A1A4', '#FF87CD'],
    }
    
    color_schemes["tfg_dark_light"] = color_schemes["tfg_dark"] + color_schemes["tfg_light"]
    
    return color_schemes[key]

def science_theme_title(
    font="Helvetica Neue",
    titleFontSize=18,
    subtitleFontSize=14,
    titleFontWeight="normal",
    text=None,
    subtitle=None,
    **kwargs
):
    title_dict = {
        "align": "left",
        "anchor": "start",
        "frame": "group",
        "font": font,
        "fontSize": titleFontSize,
        "dy": -2,
        # Doesn't work directly in config
        "subtitleFont": font,
        "subtitleFontSize": subtitleFontSize,
        "fontWeight": titleFontWeight,
    }
    
    if text is not None:
        title_dict["text"] = text

    if subtitle is not None:
        title_dict["subtitle"] = subtitle
    
    if kwargs:
        title_dict.update(kwargs)

    return title_dict

def science_theme(
    width=1600/3,
    height=900/3,
    font="Helvetica Neue",
    titleFontSize=18,
    titleFontWeight="bold",
    subtitleFontSize=14,
    baseFontSize=14,
):    

    return {
        "config": {
            "view": {
                "width": width,
                "height": height,
                "fill": "white",
            },
            "title": science_theme_title(
                font=font,
                titleFontSize=titleFontSize,
                subtitleFontSize=subtitleFontSize,
            ),
            "axis": {
                "domainColor": "lightGrey",
                "gridColor": "lightGrey",
                "gridWidth": 0.25,
#                 "title": None,
                "titleFont": font,
                "titleFontSize": baseFontSize - 3,
                "labelFont": font,
                "labelFontSize": baseFontSize,
            },
            "legend": {
                "layout": {"anchor": "start"},
                "labelLimit": 0,
                "titleFontSize": baseFontSize,
                "titleFont": font,
                "labelFontSize": baseFontSize,
                "labelFont": font,
            },
            "header": {
                "title": None,
                "titleFont": font,
                "labelFont": font,
                "labelFontSize": baseFontSize,
                "labelPadding": 2,
            },
            "range": {
                "category": tfg_palette(),
            },
        }
    }

def science_theme_caption(text):
    return (
        alt.Chart(pd.DataFrame(),
                  title={"text": text, **science_theme_title(titleFontSize=12, titleFontWeight="normal")})
        .mark_text()
        .properties(height=1, view={"stroke": None})
    )

def science_theme_facet_title_background(width, fill="lightGrey", opacity=0.25, widthExtra=1):
    return (
        alt.Chart(pd.DataFrame({"a": [0]}))
        .mark_rect(fill=fill, opacity=opacity)
        .encode(y=alt.value(-25), y2=alt.value(-1), x=alt.value(0), x2=alt.value(width + widthExtra))
    )

def science_theme_facet_plot(chart, 
                             title=None, subtitle=None, caption=None,
                             width=300, height=150,
                             facet_kwargs=None, facet_resolve_scales=None,
                             focus_chart=None, focus_chart_width=300, focus_chart_height=150,
                             focus_chart_title="All",
                             focus_chart_resolve_scales=None,
                             grey_facet_background=True,
                             configure_headerColumn=True,
                             **kwargs):
    
    if facet_kwargs is not None:
        if grey_facet_background:
          chart = (chart + science_theme_facet_title_background(width=width))
        chart = chart.facet(**facet_kwargs)

        if facet_resolve_scales is not None:
          chart = chart.resolve_scale(**facet_resolve_scales)
        
        if focus_chart is not None:
            if grey_facet_background: 
              focus_chart = focus_chart + science_theme_facet_title_background(width=focus_chart_width)
            focus_chart = focus_chart.properties(
                width=focus_chart_width, 
                height=focus_chart_height,
                title=science_theme_title(text=focus_chart_title, titleFontSize=14, anchor="middle", offset=-22 if grey_facet_background else 0)
            )
            chart = focus_chart | chart
            
            if focus_chart_resolve_scales is not None:
              chart = chart.resolve_scale(**focus_chart_resolve_scales)
    
    chart = (
        (chart
         # Adding caption
         & science_theme_caption(caption)).resolve_scale(
            x="independent", y="independent", size="independent",
            color="independent", fill="independent", stroke="independent", opacity="independent",
        )
        # Adding title and subtitle
        .properties(title=science_theme_title(text=title, subtitle=subtitle))
        # Configuring plot size
        .configure_view(width=width, height=height)
    )
    
    if (facet_kwargs is not None) and grey_facet_background and configure_headerColumn:
        chart = chart.configure_headerColumn(labelPadding=-20).configure_headerFacet(labelPadding=-20)
    
    return chart
  
def science_theme_weekly_highlights(format="%b %d", tickCount=40): 
    return alt.Axis(
        format=format,
        tickCount=tickCount,
        gridOpacity=alt.condition(
            alt.Predicate(alt.FieldEqualPredicate(field='value', timeUnit="day", equal=1)),
            alt.value(1.0),
            alt.value(0.4)
        ),
    )
    
alt.themes.register("science_theme", science_theme)
alt.themes.enable("science_theme")

# COMMAND ----------

# DBTITLE 1,Test the saver works with a sample graph save
# import altair_saver
# import pandas as pd

# test_path = '/dbfs/mnt/tfg-ua/airflow/creatives/altair_test_save.png'

# source = pd.DataFrame({'x': ['A'], 'y': [1]})
# chart = alt.Chart(source).mark_bar().encode(x='x', y='y')
# try:
#   chart.save(test_path)
# except ValueError as e:
#   if 'No enabled saver found' in str(e):
#     # retry with an explicit method + driver to provide a more informative error message
#     altair_saver.save(chart, test_path, method='selenium', webdriver='chrome')
#   raise e  # in case the more explicit method works for some reason, or this is an unrecognized error



ThemeRegistry.enable('science_theme')

## 2. Exploratory Data Analysis

### 2.1 Evolution by crop

In [3]:
pantanal_df_all = pantanal_df.copy()
pantanal_df = pantanal_df_all.query('type != "pecuaria"')
pantanal_df_pecuaria = pantanal_df_all.query('type == "pecuaria"')

In [4]:
qntd_line_chart = alt.Chart(pantanal_df).mark_line().encode(
    x='year:T',
    y='sum_qntd:Q',
    color='crop',
#     strokeDash='type'
).transform_aggregate(
    sum_qntd='sum(quantidade_ton)',
    groupby=[
        "year", 
        "crop", 
        "type"
    ]
)

qntd_point_chart = alt.Chart(pantanal_df).mark_point(size=10, fill='white', tooltip=True).encode(
    x='year:T',
    y='sum_qntd:Q',
    color='crop',
#     strokeDash='type'
).transform_aggregate(
    sum_qntd='sum(quantidade_ton)',
    groupby=[
        "year", 
        "crop", 
        "type"
    ]
)

# alt.layer(qntd_chart, area_chart).resolve_scale(y='independent')

(qntd_line_chart).properties(
    width=700,
    height=500
)

In [5]:
qntd_area_chart = alt.Chart(pantanal_df).mark_area(opacity=0.6).encode(
    x=alt.X('year:T', axis=alt.Axis(title='Year')),
    y=alt.Y('sum_qntd:Q', axis=alt.Axis(title='Volume (ton)')),
    color='crop',
#     opacity='type'
).transform_aggregate(
    sum_qntd='sum(quantidade_ton)',
    groupby=[
        "year", 
        "crop", 
        "type"
    ]
)

# qntd_type_line_chart = alt.Chart(pantanal_df).mark_line().encode(
#     x='year:T',
#     y='sum_qntd:Q',
# #     color='crop',
#     color='type'
# ).transform_aggregate(
#     sum_qntd='sum(quantidade_ton)',
#     groupby=[
#         "year", 
#         "type"
#     ]
# )


pecuaria_chart = alt.Chart(pantanal_df_pecuaria).mark_line(strokeDash=[2]).encode(
    x='year:T',
    y=alt.Y('sum_cabecas:Q', axis=alt.Axis(title='Number of heads of livestock')),
    color='crop',
#     opacity='type'
).transform_aggregate(
    sum_cabecas='sum(numero_cabecas)',
    groupby=[
        "year", 
        "crop", 
        "type"
    ]
) 

text = alt.Chart({'values':[{}]}).mark_text(
    align="left", baseline="top", size=15, color='green'
).encode(
    x=alt.value(170),  # pixels from left
    y=alt.value(150),  # pixels from top
    text=alt.value("Livestock"))

pecuaria_chart = pecuaria_chart + text


# (qntd_area_chart).properties(
#     width=700,
#     height=500
# )

final_chart = alt.layer(qntd_area_chart, pecuaria_chart).resolve_scale(y='independent').properties(
    width=700,
    height=500
).properties(
    title=science_theme_title(
        text="Evolution of agricultural and livestock production in the Pantanal", 
        subtitle="1985-2020"
    )
)

final_chart

In [6]:
qntd_area_chart = alt.Chart(pantanal_df).mark_area(opacity=0.7).encode(
    x='year:T',
    y=alt.Y('sum_qntd:Q',stack='normalize'),
    color='crop',
#     opacity='type'
).transform_aggregate(
    sum_qntd='sum(quantidade_ton)',
    groupby=[
        "year", 
        "crop", 
        "type"
    ]
)

# qntd_type_line_chart = alt.Chart(pantanal_df).mark_line().encode(
#     x='year:T',
#     y='sum_qntd:Q',
# #     color='crop',
#     color='type'
# ).transform_aggregate(
#     sum_qntd='sum(quantidade_ton)',
#     groupby=[
#         "year", 
#         "type"
#     ]
# )

(qntd_area_chart).properties(
    width=700,
    height=500
)

### 2.2 Evolution by location

In [7]:
pantanal_df['location_UF'] = pantanal_df['location'].apply(lambda x: x[-3:-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pantanal_df['location_UF'] = pantanal_df['location'].apply(lambda x: x[-3:-1])


In [8]:
qntd_area_chart = alt.Chart(pantanal_df).mark_area(opacity=0.6).encode(
    x=alt.X('year:T', axis=alt.Axis(title='Year')),
    y=alt.Y('sum_qntd:Q', axis=alt.Axis(title='Volume produced (ton)')),
    color='location',
#     opacity='location_UF:O'
).transform_aggregate(
    sum_qntd='sum(quantidade_ton)',
    groupby=[
        "year", 
        "location",
        "location_UF"
    ]
)

# qntd_type_line_chart = alt.Chart(pantanal_df).mark_line().encode(
#     x='year:T',
#     y='sum_qntd:Q',
# #     color='crop',
#     color='type'
# ).transform_aggregate(
#     sum_qntd='sum(quantidade_ton)',
#     groupby=[
#         "year", 
#         "type"
#     ]
# )


final_chart = qntd_area_chart.properties(
    width=700,
    height=500
).properties(
    title=science_theme_title(
        text="Evolution of agricultural production by municipality in Pantanal", 
        subtitle="1985-2020"
    )
)

final_chart

In [9]:

qntd_area_chart = alt.Chart(pantanal_df).mark_area(opacity=0.7).encode(
    x='year:T',
    y=alt.Y('sum_qntd:Q', stack='normalize'),
    color='location',
#     opacity='location_UF:N'
).transform_aggregate(
    sum_qntd='sum(quantidade_ton)',
    groupby=[
        "year", 
        "location",
#         "location_UF"
    ]
)

# qntd_type_line_chart = alt.Chart(pantanal_df).mark_line().encode(
#     x='year:T',
#     y='sum_qntd:Q',
# #     color='crop',
#     color='type'
# ).transform_aggregate(
#     sum_qntd='sum(quantidade_ton)',
#     groupby=[
#         "year", 
#         "type"
#     ]
# )

(qntd_area_chart).properties(
    width=1000,
    height=600
)