## Imports

In [1]:
import pandas as pd
import geopandas as gpd
import pyreadstat  # pip install pyreadstat

import folium
from branca.colormap import LinearColormap

import matplotlib.pyplot as plt

from IPython.core.display import HTML

## Data preparation

#### Importing the data

In [2]:
# Survey responses; value formats are applied and stored as categoricals
# (see the two pyreadstat flags below).
df, meta = pyreadstat.read_sav(
    './data/survey/ENCAVI_2015.sav',
    apply_value_formats=True,
    formats_as_category=True,
)

# Lookup from region name to GID_1 code. `read_csv(squeeze=True)` was deprecated
# in pandas 1.4 and removed in 2.0; squeezing the single data column afterwards
# yields the same Series on every pandas version.
regions_gid = pd.read_csv(
    './data/survey/region_to_GID.csv',
    sep=';',
    index_col='region_name',
).squeeze('columns')

# Per-question metadata (columns used in this notebook: scale, var_name, good_indicator).
question_format = pd.read_csv('./data/survey/question_format.csv', sep=';', index_col='question')

#### Value changes

In [3]:
# Scales
# Numeric encodings for the categorical survey answers.
yes_no_scale = dict(zip(('Sí', 'No'), (1, 0)))

# Frequency answers are scored 0-4 in increasing order; non-answers map to None.
likert_freq_scale = {
    **{label: score for score, label in enumerate(
        ['Nunca', 'Casi nunca', 'A veces', 'Casi siempre', 'Siempre'])},
    'No sabe': None,
    'No responde': None,
}

# Quality answers are scored 0-4 in increasing order; non-answers map to None.
likert_quality_scale = {
    **{label: score for score, label in enumerate(
        ['Mala', 'Regular', 'Buena', 'Muy buena', 'Excelente'])},
    'No sabe': None,
    'No responde': None,
}

In [4]:
# Keep the region plus every question listed in `question_format`.
selected_columns = ['Region'] + list(question_format.index)
df_f = df.loc[:, selected_columns].copy()

# Attach the GID_1 code of each respondent's region.
df_f['GID_1'] = df_f['Region'].map(regions_gid)

# Recode the categorical answers to numbers, one scale at a time.
# Questions whose scale is not listed here are left untouched.
scale_by_name = {
    'yes_no': yes_no_scale,
    'likert_freq': likert_freq_scale,
    'likert_quality': likert_quality_scale,
}
for scale_name, scale_values in scale_by_name.items():
    questions_on_scale = question_format.loc[question_format['scale'] == scale_name].index
    for question in questions_on_scale:
        df_f[question] = df_f[question].map(scale_values)

# Replace the raw question codes with the variable names from `question_format`.
df_f = df_f.rename(columns=question_format['var_name'].to_dict())

In [5]:
# Cast age to float so it can be used in numeric aggregations.
df_f['age'] = df_f['age'].astype(float)

#### Calculating metrics

In [6]:
# Regional mean of every numeric column, indexed by GID_1.
# `numeric_only=True` is explicit because pandas >= 2.0 no longer drops
# non-numeric columns (e.g. `Region`) silently and raises a TypeError instead.
df_reg = df_f.groupby('GID_1').mean(numeric_only=True)

## Map creation

In [7]:
chilean_map = gpd.read_file('./data/map/chilean_map.json', encoding='latin-1')

# When the survey was conducted Ñuble was part of Biobío, so the Ñuble shape
# is given Biobío's GID_1 and therefore receives Biobío's regional metrics.
gid_dict_mapping = {regions_gid['Ñuble']: regions_gid['Biobío']}

# Assign the result back instead of `chilean_map.GID_1.replace(..., inplace=True)`:
# an in-place update through attribute access is a chained assignment, which
# warns in pandas 2.x and does not modify the frame under Copy-on-Write.
chilean_map['GID_1'] = chilean_map['GID_1'].replace(gid_dict_mapping)

# Attach the regional averages to each shape (default inner join on GID_1).
chilean_map = chilean_map.merge(df_reg, on='GID_1')

In [8]:
def create_map(gdf, color_column, tooltip_columns=None):
    """Build a folium choropleth of `gdf` coloured by `color_column`.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Shapes to draw; must contain `color_column` and every tooltip column.
    color_column : str
        Column whose values drive the fill colour. It must appear in
        `question_format.var_name`, because the colour ramp depends on that
        row's `good_indicator` flag.
    tooltip_columns : list of str, optional
        Columns shown in the hover tooltip. Defaults to
        ``['NAME_1', color_column]``.

    Returns
    -------
    folium.Map
        Map with a single GeoJson layer added.

    Raises
    ------
    IndexError
        If `color_column` is not listed in `question_format.var_name`.
    """
    tooltip_columns = tooltip_columns if tooltip_columns else ['NAME_1', color_column]

    # Yellow -> green when a high value is good, yellow -> red when it is bad.
    is_good_indicator = question_format[question_format.var_name == color_column].good_indicator.iloc[0]
    high_color = 'green' if is_good_indicator else 'red'

    # The colour range comes from the frame being drawn (`gdf`), not from the
    # notebook-level `chilean_map`, so the function works for any frame passed in.
    color_scale = LinearColormap(
        ['yellow', high_color],
        vmin=gdf[color_column].min(),
        vmax=gdf[color_column].max(),
    )

    # Neutral fill for shapes without a value; missing values are expected to
    # arrive as None after the GeoJSON round-trip and cannot be colour-mapped.
    missing_color = '#808080'

    def style_feature(feature):
        value = feature['properties'][color_column]
        return {
            'fillColor': color_scale(value) if value is not None else missing_color,
            'fillOpacity': 0.65,
            'color': 'black',
            'weight': 1.5,
        }

    m = folium.Map(
        location=[-39.5, -60],
        zoom_start=4,
    )

    folium.GeoJson(
        data=gdf.to_json(),
        style_function=style_feature,
        highlight_function=lambda x: {"weight": 1, 'fillOpacity': 1},
        tooltip=folium.features.GeoJsonTooltip(fields=tooltip_columns, labels=False, sticky=False),
    ).add_to(m)

    return m

#### Plotting two maps side-by-side

In [9]:
# HTML layout used to show two maps side by side. Read inside a context
# manager so the file handle is closed right away instead of being leaked.
with open('./data/display/map_disp_template.html') as template_file:
    html_template = template_file.read()

def plot_maps(map_1, map_2, title_1, title_2, correlation=None):
    """Save two maps to disk and display them side by side in the notebook.

    Parameters
    ----------
    map_1, map_2 : objects with a ``save(path)`` method (e.g. ``folium.Map``)
        Maps to render. Each is written to a fixed path under
        ``./data/generated/``, overwriting the previous file.
    title_1, title_2 : str
        Titles substituted into the HTML template for each map.
    correlation : float, optional
        Correlation between the two mapped variables. When given (including
        0.0) it is shown with two decimals; when None no caption is shown.
    """
    map_file_1 = './data/generated/map_1.html'
    map_file_2 = './data/generated/map_2.html'

    map_1.save(map_file_1)
    map_2.save(map_file_2)

    # Compare against None: a correlation of exactly 0.0 is falsy but is still
    # a valid value that should be reported.
    if correlation is not None:
        correlation_txt = "(correlation between variables {0:.2f})".format(correlation)
    else:
        correlation_txt = ""

    disp_html = html_template.format(
        map_file_1=map_file_1,
        map_file_2=map_file_2,
        title_1=title_1,
        title_2=title_2,
        correlation_txt=correlation_txt,
    )
    display(HTML(disp_html))

### Suggesting variables

In [10]:
def get_most_correlated(df, original_variable, correlation='positive'):
    """Return the column of `df` most correlated with `original_variable`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to analyse. Non-numeric columns are ignored.
    original_variable : str
        Name of a numeric column of `df` to compare every other column against.
    correlation : {'positive', 'negative', 'absolute'}, default 'positive'
        'positive' picks the highest correlation, 'negative' the lowest
        (most negative), and 'absolute' the largest magnitude of either sign.

    Returns
    -------
    Column label of the selected variable.

    Raises
    ------
    KeyError
        If `original_variable` is not a numeric column of `df`.
    ValueError
        If `correlation` is not one of the supported selectors.
    """
    # Restrict to numeric/bool columns first: since pandas 2.0 `DataFrame.corr`
    # no longer skips non-numeric columns and raises on string data instead.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_values = numeric_df.corr()[original_variable].drop(original_variable)

    if correlation == 'positive':
        return corr_values.idxmax()
    elif correlation == 'negative':
        return corr_values.idxmin()
    elif correlation == 'absolute':
        return corr_values.abs().idxmax()
    else:
        raise ValueError('Invalid correlation selector.')

## Visualizing

In [11]:
# Pair the chosen indicator with the survey variable most negatively correlated with it.
variable_1 = 'has_hypertension'
variable_2 = get_most_correlated(df_f, original_variable=variable_1, correlation='negative')

# One choropleth per variable, built in the same order as the variables.
map_1, map_2 = (create_map(chilean_map, variable) for variable in (variable_1, variable_2))

# Respondent-level correlation between the two variables
# (off-diagonal entry of the 2x2 correlation matrix).
pairwise_corr = df_f[[variable_1, variable_2]].corr()
correlation = pairwise_corr.loc[variable_2, variable_1]

plot_maps(map_1, map_2, title_1=variable_1, title_2=variable_2, correlation=correlation)