# Dataframe Factory
We will combine two public datasets into a single dataset that is used to scrape neighborhoods in Brazil.

In [54]:
# Dependencies
import uuid
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from unidecode import unidecode

In [11]:
# Load the two public reference tables from the kelvins/Municipios-Brasileiros
# repository on GitHub: one CSV of states and one CSV of municipalities.

# State table (estados.csv)
states = pd.read_csv(
    "https://raw.githubusercontent.com/kelvins/Municipios-Brasileiros/main/csv/estados.csv"
)
# Municipality table (municipios.csv)
cities = pd.read_csv(
    "https://raw.githubusercontent.com/kelvins/Municipios-Brasileiros/main/csv/municipios.csv"
)

In [20]:
# Build the list of state abbreviations (UF), aligned row-by-row with `cities`.
# One vectorized lookup replaces the per-city boolean-mask scan of `states`.
uf_codes = cities["codigo_uf"]

# Keep only the first row per state code (the same row a first-match lookup would
# pick) so the lookup index is unique, as `Series.map` requires.
uf_by_code = states.drop_duplicates(subset="codigo_uf").set_index("codigo_uf")["uf"]

# Fail loudly, naming the offending codes, when a city points at an unknown state.
unknown_codes = uf_codes[~uf_codes.isin(uf_by_code.index)].unique().tolist()
if unknown_codes:
    raise LookupError(f"codigo_uf values not present in states: {unknown_codes}")

# Plain Python list, one UF per city, in the same order as `cities`.
uf_order_by_code = uf_codes.map(uf_by_code).tolist()

# Sanity check: both counts must match.
print(f"total rows in uf_codes {len(uf_codes)}")
print(f"total row in uf_order_by_code {len(uf_order_by_code)}")

total rows in uf_codes 5570
total row in uf_order_by_code 5570


In [83]:
# Transliterate every city name with unidecode so that accented characters
# become their plain ASCII counterparts.
cities_names = cities["nome"].values
formated_names = [unidecode(original_name) for original_name in cities_names]

# Sanity check: one formatted name per city.
print(f"total row in formated_names {len(formated_names)}")

total row in formated_names 5570


In [84]:
# Take the `capital` column as an array and generate one random UUID (uuid4)
# for every city row.
capital = cities["capital"].values
cities_ids = [uuid.uuid4() for _ in capital]

print(f"total row in capital {len(capital)}")
# Counting the set confirms that no identifier was generated twice.
print(f"total row in cities_ids {len(set(cities_ids))}")

total row in capital 5570
total row in cities_ids 5570


In [96]:
# Assemble the city table from the per-column sequences built in the cells above.
df_blueprint = {
    "city_id": cities_ids,
    "city_names": formated_names,
    "relative_uf": uf_order_by_code,
    "capital": capital,
}

df = pd.DataFrame(df_blueprint)

# DataFrame.to_csv does not create missing directories, so make sure the target
# folder exists before writing; otherwise a fresh checkout fails here.
output_path = Path("./csv/br-cities.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): the row index is written as an unnamed first column; consider
# index=False if no consumer relies on it (confirm before changing).
df.to_csv(output_path)

In [86]:
# Build one crawler URL per city: "<city name> <UF>" with spaces turned into
# hyphens and everything lower-cased, appended to the guiamais "bairros" path.
endpoints = []
for name, state_abbr in zip(df["city_names"], df["relative_uf"]):
    slug = f"{name} {state_abbr}".replace(" ", "-").lower()
    endpoints.append(f"https://www.guiamais.com.br/bairros/{slug}")

# Sanity check: one endpoint per city.
print(f"total row in endpoints {len(endpoints)}")

total row in endpoints 5570


In [101]:
# Attach the crawler URLs as the "endpoints" column and display the resulting table.
df = df.assign(endpoints=endpoints)
df

Unnamed: 0,city_id,city_names,relative_uf,capital,endpoints
0,432dd075-2bca-479c-8728-bb0580b5767b,Abadia de Goias,GO,0,https://www.guiamais.com.br/bairros/abadia-de-...
1,5f6fa223-a94d-48e2-aa2a-b47a1b282ef3,Abadia dos Dourados,MG,0,https://www.guiamais.com.br/bairros/abadia-dos...
2,06295a45-0a70-4cb2-920a-d5cbaceb5e06,Abadiania,GO,0,https://www.guiamais.com.br/bairros/abadiania-go
3,3a43e1ad-8a2c-4fd5-b263-6a69e1abf08f,Abaete,MG,0,https://www.guiamais.com.br/bairros/abaete-mg
4,e14e4682-7a83-4895-9dc7-829ddd44be6b,Abaetetuba,PA,0,https://www.guiamais.com.br/bairros/abaetetuba-pa
...,...,...,...,...,...
5565,5ef428b7-18b9-451a-b7c9-1582be4e6578,Xique-Xique,BA,0,https://www.guiamais.com.br/bairros/xique-xiqu...
5566,8abed599-5116-4771-bd63-aa61eec8fd18,Zabele,PB,0,https://www.guiamais.com.br/bairros/zabele-pb
5567,c8e052fe-f959-4734-a180-e284ee970b74,Zacarias,SP,0,https://www.guiamais.com.br/bairros/zacarias-sp
5568,66df7679-7d2a-4c0b-a076-7f6f71494f89,Ze Doca,MA,0,https://www.guiamais.com.br/bairros/ze-doca-ma
