# Generating a dataset with a "region" column

In [7]:
import streamlit as st
import pandas as pd
import numpy as np
from PIL import Image
#from models import seqiahr_model

#import dashboard_data
#import dashboard_models
# CSV export of the Brasil.IO COVID-19 case dataset.
BRASIL_IO_URL = "https://brasil.io/dataset/covid19/caso?format=csv"

# Region name -> state (UF) codes that belong to it.
REGIONS_DICT = {
    "Sudeste": ["SP", "RJ", "MG", "ES"],
    "Nordeste": ["BA", "PE", "CE", "RN", "PB", "AL", "SE", "PI", "MA"],
    "Sul": ["RS", "SC", "PR"],
    "Centro-Oeste": ["GO", "MT", "MS", "DF"],
    "Norte": ["PA", "AM", "RO", "RR", "TO", "AC", "AP"],
}

def get_data():
    """Download the case table from ``BRASIL_IO_URL``.

    Returns:
        pandas.DataFrame: the CSV contents with the ``confirmed`` column
        renamed to ``"Casos Confirmados"``.
    """
    raw_cases = pd.read_csv(BRASIL_IO_URL)
    return raw_cases.rename(columns={"confirmed": "Casos Confirmados"})

def get_data_uf(data, uf, city_options):
    """Build a date-indexed table of confirmed cases for states or cities.

    Args:
        data: DataFrame with the columns ``date``, ``state``, ``city``,
            ``place_type`` and ``"Casos Confirmados"``.
        uf: collection of state (UF) codes to keep; when empty/falsy the
            whole-country total is returned.
        city_options: collection of ``"UF - City"`` labels (the format
            produced by ``get_city_list``); when empty/falsy the state-level
            rows are used instead.

    Returns:
        pandas.DataFrame indexed by ``date``:
        * no ``uf``: one ``"Casos Confirmados"`` column holding the sum of
          all city-level rows per date;
        * ``uf`` only: one column per selected state;
        * ``uf`` and ``city_options``: one column per selected city.
    """
    if not uf:
        city_rows = data.loc[data.place_type == "city"]
        return city_rows.groupby("date")["Casos Confirmados"].sum().to_frame()

    data = data.loc[data.state.isin(uf)]
    if city_options:
        # Split only on the first " - " so that a city whose own name
        # contains " - " is not truncated.
        city_names = [label.split(" - ", 1)[1] for label in city_options]
        selected = data.loc[
            (data.city.isin(city_names)) & (data.place_type == "city")
        ]
        pivot_column = "city"
    else:
        selected = data.loc[data.place_type == "state"]
        pivot_column = "state"

    # NOTE(review): pivot_table aggregates with its default (mean), so two
    # selected cities that share a name in different states end up averaged
    # into one column.
    pivot_data = selected.pivot_table(
        values="Casos Confirmados", index="date", columns=pivot_column
    )
    # Round-trip through records to flatten the pivot into plain columns.
    flat = pd.DataFrame(pivot_data.to_records())
    return flat.set_index("date")

def get_aligned_data(df, align=100):
    """Re-base every column on the rows where it reached ``align``.

    For each column only the values ``>= align`` are kept, in their original
    order, and re-indexed from 0 so that series reaching the threshold on
    different dates can be compared position by position.

    Args:
        df: DataFrame whose columns are the series to align.
        align: threshold a value must reach to be kept (default 100).

    Returns:
        pandas.DataFrame with the same column labels as ``df`` and a 0-based
        positional index; shorter series are padded with NaN.
    """
    columns = list(df.columns)
    # One flat array per column, holding only the values at/above the threshold.
    aligned_values = [
        df.loc[df[c] >= align, [c]].values.reshape(-1,) for c in columns
    ]
    # Each array is one row here (ragged rows are NaN-padded); transpose so
    # every input column becomes an output column again.
    return pd.DataFrame(aligned_values, index=columns).T

def get_city_list(data, uf):
    """List the ``"UF - City"`` labels available for the given states.

    Args:
        data: DataFrame with ``state``, ``city`` and ``place_type`` columns.
        uf: collection of state (UF) codes to include.

    Returns:
        list[str]: sorted, de-duplicated ``"<state> - <city>"`` labels built
        from the city-level rows. Rows without a city name are skipped.
    """
    city_rows = data.loc[(data.state.isin(uf)) & (data.place_type == "city")]
    # Build the labels as a standalone Series instead of assigning a new
    # column onto the filtered slice (which triggers SettingWithCopyWarning).
    labels = city_rows["state"] + " - " + city_rows["city"]
    # A missing city yields NaN, which cannot be ordered against strings.
    labels = labels.dropna().drop_duplicates()
    return sorted(labels)

def get_data_region(source=BRASIL_IO_URL):
    """Read the case table from ``source``.

    Args:
        source: anything ``pandas.read_csv`` accepts; defaults to
            ``BRASIL_IO_URL``.

    Returns:
        pandas.DataFrame: the CSV contents with the ``confirmed`` column
        renamed to ``"Casos Confirmados"``.
    """
    column_names = {"confirmed": "Casos Confirmados"}
    raw_cases = pd.read_csv(source)
    return raw_cases.rename(columns=column_names)
    

In [9]:
# Load the case table through get_data_region()'s default source and display it.
cases = get_data_region()
cases

Unnamed: 0,date,state,city,place_type,Casos Confirmados,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
0,2020-04-09,AM,Anori,city,1,0.0,True,21010.0,1300102.0,4.75964,
1,2020-04-09,AM,Boca do Acre,city,1,0.0,True,34308.0,1300706.0,2.91477,
2,2020-04-09,AM,Careiro da Várzea,city,2,0.0,True,30225.0,1301159.0,6.61704,
3,2020-04-09,AM,Importados/Indefinidos,city,0,0.0,True,,,,
4,2020-04-09,AM,Iranduba,city,11,0.0,True,48296.0,1301852.0,22.77621,
...,...,...,...,...,...,...,...,...,...,...,...
8848,2020-02-27,SP,São Paulo,city,1,,False,12252023.0,3550308.0,0.00816,
8849,2020-02-27,SP,,state,1,0.0,False,45919049.0,35.0,0.00218,
8850,2020-02-26,SP,,state,1,0.0,False,45919049.0,35.0,0.00218,
8851,2020-02-25,SP,São Paulo,city,1,,False,12252023.0,3550308.0,0.00816,


In [10]:
# (rows, columns) of the loaded frame.
cases.shape

(8853, 11)

In [5]:
# Region code -> state (UF) codes in that region.
regions = {
    "CO": ["DF", "GO", "MS", "MT"],
    "N": ["AC", "AM", "PA", "RO", "RR", "TO", "AP"],
    "NE": ["AL", "BA", "CE", "MA", "PB", "PE", "PI", "RN", "SE"],
    "S": ["PR", "RS", "SC"],
    "SE": ["ES", "MG", "RJ", "SP"],
}
# Inverse lookup: state (UF) code -> region code.
state_to_regions_dict = {
    state: region
    for region in ("N", "NE", "CO", "S", "SE")
    for state in regions[region]
}
# Display the frame as it is before the region column is added.
cases 

Unnamed: 0,date,state,city,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
0,2020-04-09,AM,Anori,city,1,0.0,True,21010.0,1300102.0,4.75964,
1,2020-04-09,AM,Boca do Acre,city,1,0.0,True,34308.0,1300706.0,2.91477,
2,2020-04-09,AM,Careiro da Várzea,city,2,0.0,True,30225.0,1301159.0,6.61704,
3,2020-04-09,AM,Importados/Indefinidos,city,0,0.0,True,,,,
4,2020-04-09,AM,Iranduba,city,11,0.0,True,48296.0,1301852.0,22.77621,
...,...,...,...,...,...,...,...,...,...,...,...
8848,2020-02-27,SP,São Paulo,city,1,,False,12252023.0,3550308.0,0.00816,
8849,2020-02-27,SP,,state,1,0.0,False,45919049.0,35.0,0.00218,
8850,2020-02-26,SP,,state,1,0.0,False,45919049.0,35.0,0.00218,
8851,2020-02-25,SP,São Paulo,city,1,,False,12252023.0,3550308.0,0.00816,


In [6]:
# Add a "region" column right after "city", derived from each row's state.
# The guard makes the cell safe to re-run: the column is inserted only once.
if "region" not in cases.columns:
    column_ix = cases.columns.get_loc("city") + 1
    # Vectorised lookup: one dictionary access per row instead of rewriting
    # the whole tail of the column on every iteration.
    region_by_row = cases["state"].map(state_to_regions_dict)
    # Keep the strictness of a plain dict lookup: an unmapped state is an error.
    unknown_states = cases.loc[region_by_row.isna(), "state"].unique()
    if len(unknown_states) > 0:
        raise KeyError(f"states without a region mapping: {list(unknown_states)}")
    cases.insert(loc=column_ix, column="region", value=region_by_row)
cases

Unnamed: 0,date,state,city,region,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
0,2020-04-09,AM,Anori,N,city,1,0.0,True,21010.0,1300102.0,4.75964,
1,2020-04-09,AM,Boca do Acre,N,city,1,0.0,True,34308.0,1300706.0,2.91477,
2,2020-04-09,AM,Careiro da Várzea,N,city,2,0.0,True,30225.0,1301159.0,6.61704,
3,2020-04-09,AM,Importados/Indefinidos,N,city,0,0.0,True,,,,
4,2020-04-09,AM,Iranduba,N,city,11,0.0,True,48296.0,1301852.0,22.77621,
...,...,...,...,...,...,...,...,...,...,...,...,...
8848,2020-02-27,SP,São Paulo,SE,city,1,,False,12252023.0,3550308.0,0.00816,
8849,2020-02-27,SP,,SE,state,1,0.0,False,45919049.0,35.0,0.00218,
8850,2020-02-26,SP,,SE,state,1,0.0,False,45919049.0,35.0,0.00218,
8851,2020-02-25,SP,São Paulo,SE,city,1,,False,12252023.0,3550308.0,0.00816,


In [15]:
# (rows, columns) of the frame once the region column has been inserted.
cases.shape

(7059, 12)

In [7]:
# Inspect rows at positions 234 through 434 (all columns).
cases.iloc[234:435]

Unnamed: 0,date,state,city,region,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
234,2020-04-06,MT,,CO,state,76,1.0,True,3484466.0,51.0,2.18111,0.0132
235,2020-04-06,PA,Abaetetuba,N,city,1,0.0,True,157698.0,1500107.0,0.63412,
236,2020-04-06,PA,Altamira,N,city,2,0.0,True,114594.0,1500602.0,1.74529,
237,2020-04-06,PA,Ananindeua,N,city,14,0.0,True,530598.0,1500800.0,2.63853,
238,2020-04-06,PA,Barcarena,N,city,3,0.0,True,124680.0,1501303.0,2.40616,
...,...,...,...,...,...,...,...,...,...,...,...,...
430,2020-04-06,RS,Dois Irmãos,S,city,2,0.0,True,32671.0,4306403.0,6.12164,
431,2020-04-06,RS,Dom Pedrito,S,city,1,0.0,True,38461.0,4306601.0,2.60004,
432,2020-04-06,RS,Eldorado do Sul,S,city,1,0.0,True,41285.0,4306767.0,2.42219,
433,2020-04-06,RS,Erechim,S,city,3,0.0,True,105862.0,4307005.0,2.83388,


In [21]:
# True for the state-level rows (place_type == "state").
bool_mask = cases["place_type"].eq("state")
cases[bool_mask]

Unnamed: 0,date,state,city,region,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
4,2020-04-06,AC,,N,state,50,1.0,True,881935.0,12.0,5.66935,0.0200
10,2020-04-06,AL,,NE,state,31,2.0,True,3337357.0,27.0,0.92888,0.0645
25,2020-04-06,AM,,N,state,532,19.0,True,4144597.0,13.0,12.83599,0.0357
30,2020-04-06,AP,,N,state,39,3.0,True,845731.0,16.0,4.61140,0.0769
84,2020-04-06,BA,,NE,state,437,10.0,True,14873064.0,29.0,2.93820,0.0229
...,...,...,...,...,...,...,...,...,...,...,...,...
7050,2020-03-01,SP,,SE,state,2,,False,45919049.0,35.0,0.00436,
7052,2020-02-29,SP,,SE,state,2,,False,45919049.0,35.0,0.00436,
7054,2020-02-28,SP,,SE,state,2,,False,45919049.0,35.0,0.00436,
7056,2020-02-27,SP,,SE,state,1,,False,45919049.0,35.0,0.00218,


## Grouping by date, region, state, city

In [29]:
# Keep only the rows whose region code is "CO".
data = cases[cases["region"] == "CO"]
data

Unnamed: 0,date,state,city,region,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
85,2020-04-06,DF,Brasília,CO,city,485,10.0,True,3015268.0,5300108.0,16.08481,0.0206
86,2020-04-06,DF,,CO,state,485,10.0,True,3015268.0,53.0,16.08481,0.0206
109,2020-04-06,GO,Águas Lindas de Goiás,CO,city,1,0.0,True,212440.0,5200258.0,0.47072,
110,2020-04-06,GO,Anápolis,CO,city,8,0.0,True,386923.0,5201108.0,2.06759,
111,2020-04-06,GO,Aparecida de Goiânia,CO,city,1,0.0,True,578179.0,5201405.0,0.17296,
...,...,...,...,...,...,...,...,...,...,...,...,...
7003,2020-03-09,DF,,CO,state,1,0.0,False,3015268.0,53.0,0.03316,
7012,2020-03-08,DF,Brasília,CO,city,1,0.0,False,3015268.0,5300108.0,0.03316,
7013,2020-03-08,DF,,CO,state,1,0.0,False,3015268.0,53.0,0.03316,
7025,2020-03-07,DF,Brasília,CO,city,1,0.0,False,3015268.0,5300108.0,0.03316,


In [None]:
# Sentinel value of `uf` meaning "do not filter by state".
WHOLE_BRAZIL = "Brasil inteiro"


# NOTE(review): this redefines the get_data_uf from the first cell with a
# different contract (a single UF / single city instead of collections).
def get_data_uf(data, uf, city_option):
    """Return the confirmed-case total per date for Brazil, a state or a city.

    Args:
        data: DataFrame with ``date``, ``state``, ``city``, ``place_type``
            and ``"Casos Confirmados"`` columns.
        uf: a state (UF) code, or ``WHOLE_BRAZIL`` for the whole country.
        city_option: a city name within ``uf``; ``None``/empty or ``"Todos"``
            means the whole state. Ignored when ``uf`` is ``WHOLE_BRAZIL``.

    Returns:
        pandas.Series indexed by ``date`` with the summed
        ``"Casos Confirmados"``.
    """
    wants_city = uf != WHOLE_BRAZIL and city_option and city_option != "Todos"
    if wants_city:
        selected = data.loc[
            (data.state == uf)
            & (data.city == city_option)
            & (data.place_type == "city")
        ]
    else:
        # The table carries both city-level and state-level rows for the same
        # cases; summing them together would double count, so state and
        # country totals are built from the state-level rows only.
        selected = data.loc[data.place_type == "state"]
        if uf != WHOLE_BRAZIL:
            selected = selected.loc[selected.state == uf]

    return selected.groupby("date")["Casos Confirmados"].sum()

In [31]:
# Rows recorded for 2020-02-26.
cases[cases["date"] == "2020-02-26"]

Unnamed: 0,date,state,city,region,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
7057,2020-02-26,SP,São Paulo,SE,city,1,,False,12252023.0,3550308.0,0.00816,
7058,2020-02-26,SP,,SE,state,1,,False,45919049.0,35.0,0.00218,


In [30]:
# Sum the numeric columns per date; numeric_only=True keeps the text columns
# (state, city, ...) out of the aggregation.
# NOTE: city-level and state-level rows are summed together here, so the
# confirmed total counts each case twice (2 on 2020-02-26 for a single case).
cases.groupby("date").sum(numeric_only=True)

Unnamed: 0_level_0,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-26,2,0.0,0.0,58171072.0,3550343.0,0.01034,0.0
2020-02-27,2,0.0,0.0,58171072.0,3550343.0,0.01034,0.0
2020-02-28,4,0.0,0.0,58171072.0,3550343.0,0.02068,0.0
2020-02-29,4,0.0,0.0,58171072.0,3550343.0,0.02068,0.0
2020-03-01,4,0.0,0.0,58171072.0,3550343.0,0.02068,0.0
2020-03-02,4,0.0,0.0,58171072.0,3550343.0,0.02068,0.0
2020-03-03,4,0.0,0.0,58171072.0,3550343.0,0.02068,0.0
2020-03-04,6,0.0,0.0,58171072.0,3550343.0,0.03102,0.0
2020-03-05,15,0.0,0.0,79639077.0,6850815.0,0.63497,0.0
2020-03-06,17,0.0,0.0,89593893.0,9515893.0,0.78474,0.0


In [35]:
# Rows recorded for the state of SP on 2020-02-26.
cases[(cases["date"] == "2020-02-26") & (cases["state"] == "SP")]

Unnamed: 0,date,state,city,region,place_type,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
7057,2020-02-26,SP,São Paulo,SE,city,1,,False,12252023.0,3550308.0,0.00816,
7058,2020-02-26,SP,,SE,state,1,,False,45919049.0,35.0,0.00218,


In [36]:
# Per-date sum of the numeric columns for SP on 2020-02-26. numeric_only=True
# keeps the text columns out of the aggregation. The city row and the state
# row are both included, which is why the confirmed total comes out as 2.
cases.loc[(cases.date=="2020-02-26")&(cases.state=="SP"),:].groupby("date").sum(numeric_only=True)

Unnamed: 0_level_0,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-02-26,2,0.0,False,58171072.0,3550343.0,0.01034,0.0


In [None]:
# State (UF) code -> region code.
state_to_regions_dict = {"AC":"N", "AM":"N", "PA":"N", "RO":"N", "RR":"N", "TO":"N", "AP":"N",
                         "AL":"NE", "BA":"NE", "CE":"NE", "MA":"NE", "PB":"NE", "PE":"NE", "PI":"NE", "RN":"NE", "SE":"NE",
                         "DF":"CO", "GO":"CO", "MS":"CO", "MT":"CO",
                         "PR":"S", "RS":"S", "SC":"S",
                         "ES":"SE", "MG":"SE", "RJ":"SE", "SP":"SE"}
WHOLE_BRAZIL="Brasil inteiro"


def get_data_region(data, region=None, uf=None, city_option=None):
    """Return the confirmed-case total per date, optionally narrowed down.

    The filters are cumulative: ``region``, then ``uf``, then ``city_option``.

    Args:
        data: DataFrame with ``date``, ``state``, ``city``, ``place_type``
            and ``"Casos Confirmados"`` columns. A ``region`` column is used
            when present; otherwise the region is derived from ``state``
            through ``state_to_regions_dict``. ``data`` is never modified.
        region: region code (``"N"``, ``"NE"``, ``"CO"``, ``"S"``, ``"SE"``)
            or ``None`` for no region filter.
        uf: state (UF) code, or ``None`` for no state filter.
        city_option: city name, or ``None`` for no city filter.

    Returns:
        pandas.Series indexed by ``date`` with the summed
        ``"Casos Confirmados"``. City-level rows are used when a city is
        requested; otherwise only state-level rows are summed, so the same
        cases are not counted at both levels.
    """
    selected = data
    if region:
        if "region" in selected.columns:
            region_by_row = selected["region"]
        else:
            # Derive the region on the fly rather than inserting a column
            # into the caller's frame.
            region_by_row = selected["state"].map(state_to_regions_dict)
        selected = selected.loc[region_by_row == region]
    if uf:
        selected = selected.loc[selected["state"] == uf]
    # Pick the row level last, so that a city filter is not applied to a
    # frame already reduced to state-level rows.
    if city_option:
        selected = selected.loc[
            (selected["city"] == city_option) & (selected["place_type"] == "city")
        ]
    else:
        selected = selected.loc[selected["place_type"] == "state"]
    return selected.groupby("date")["Casos Confirmados"].sum()


In [39]:
# Python type of the `city` value on the second SP row of 2020-02-26
# (the state-level row, which has no city name).
type(cases.loc[(cases["date"] == "2020-02-26") & (cases["state"] == "SP"), "city"].iloc[1])

float

In [44]:
# Number of rows dated 2020-03-26 that have no city name.
int(((cases["date"] == "2020-03-26") & cases["city"].isna()).sum())

26

In [45]:
# Per-date sum of the numeric columns for RJ on 2020-02-26 (no rows match, so
# the result is empty). numeric_only=True keeps the text columns out of the
# aggregation.
cases.loc[(cases.date=="2020-02-26")&(cases.state=="RJ"),:].groupby("date").sum(numeric_only=True)

Unnamed: 0_level_0,confirmed,deaths,is_last,estimated_population_2019,city_ibge_code,confirmed_per_100k_inhabitants,death_rate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


## Testing get_data_region

In [9]:
# NOTE(review): the recorded NameError means this ran in a kernel where no
# get_data_region definition had been executed yet. Also, the most recent
# definition in this notebook takes a required `data` argument, so a call
# with no arguments only works against the earlier loader version — confirm
# which one is intended here.
cases = get_data_region()

NameError: name 'get_data_region' is not defined