In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import string
from bs4 import BeautifulSoup

### Auxiliar functions

In [2]:
def generate_soup(file_path):
    with open(file_path, 'r', encoding="utf-8") as f:
        soup = BeautifulSoup(f, 'html.parser')
    return soup

In [3]:
# Function for replace the last number of ocurrences of a string. This number is the argument "occurrence"
def rreplace(s, old, new, occurrence):
    li = s.rsplit(old, occurrence)
    return new.join(li)

### Main code

Zones are obtained

In [4]:
bd_folder = './data_filtered/zonas'
filename = 'zonas.html'

soup = generate_soup(os.path.join(bd_folder, filename))

zones = []

# get id=sublocations
data = soup.find('ul', {'id': 'sublocations', 'class' : 'navList nav-list'})
data = data.find_all('span', {'class': 'sublocationText'})
for item in data:
    zones.append(item.text.strip())

In [5]:
aytos = pd.read_csv('./data/ayuntamientos/municipios.csv', usecols = ["Municipio"]).values.flatten().tolist()

In [6]:
zones = set(zones)-set(aytos)
zones = list(zones)
print("There are {} zones".format(len(zones)))

There are 39 zones


In [7]:
aux = []

for ay in aytos:
    #check if any item in zones contains ay
    if not any(ay in s for s in zones):
        aux.append(ay)
        
zones = aux+zones

In [8]:
zones

['Cambre',
 'Sada',
 'Betanzos',
 'Bergondo',
 'Carral',
 'Abegondo',
 'Agra del Orzán - Ventorrillo, A Coruña',
 'Cuatro Caminos - Plaza de la Cubela, A Coruña',
 'Monte Alto - Zalaeta - Atocha, A Coruña',
 'Portádego - Vilaboa Norte, Culleredo',
 'Almeiras, Culleredo',
 'Centro de Culleredo, Culleredo',
 'Someso - Matogrande, A Coruña',
 'Paseo de los Puentes-Santa Margarita, Ensanche - Juan Florez',
 'Ensanche - Juan Florez, A Coruña',
 'Morás-A Zapateira, Arteixo',
 'Acea de Ama - O Burgo, Culleredo',
 'Nós, Oleiros',
 'Ensanche, Ensanche - Juan Florez',
 'Dexo-Lorbé, Oleiros',
 'Los Castros - Castrillón, A Coruña',
 'Perillo, Oleiros',
 'Falperra-Santa Lucía, Ensanche - Juan Florez',
 'Mesoiro, A Coruña',
 'Elviña - A Zapateira, A Coruña',
 'Juan Flórez-San Pablo, Ensanche - Juan Florez',
 'Mera-Serantes, Oleiros',
 'Riazor - Los Rosales, A Coruña',
 'Dorneda, Oleiros',
 'Suevos-Pastoriza, Arteixo',
 'Liáns, Oleiros',
 'A Zapateira, Culleredo',
 'Ciudad Jardín, Ensanche - Juan Flo

In [9]:
zone_col = []
ayto_col = []

for z in zones:
    for a in aytos:
        if a in z:
            if a != z:
                print(z)
                z = rreplace(z, a, '', 1)
                z = z.strip().strip(string.punctuation)
            # z = z.replace(' - ', '-')
            zone_col.append(z)
            ayto_col.append(a)
            continue

df_zones = pd.DataFrame({'zona': zone_col, 'municipio': ayto_col})

dest_folder = './data/zonas'
os.makedirs(dest_folder, exist_ok=True)
# save data to csv file
df_zones.to_csv(os.path.join(dest_folder, "zonas.csv"), index=False)

Agra del Orzán - Ventorrillo, A Coruña
Cuatro Caminos - Plaza de la Cubela, A Coruña
Monte Alto - Zalaeta - Atocha, A Coruña
Portádego - Vilaboa Norte, Culleredo
Almeiras, Culleredo
Centro de Culleredo, Culleredo
Someso - Matogrande, A Coruña
Ensanche - Juan Florez, A Coruña
Morás-A Zapateira, Arteixo
Acea de Ama - O Burgo, Culleredo
Nós, Oleiros
Dexo-Lorbé, Oleiros
Los Castros - Castrillón, A Coruña
Perillo, Oleiros
Mesoiro, A Coruña
Elviña - A Zapateira, A Coruña
Mera-Serantes, Oleiros
Riazor - Los Rosales, A Coruña
Dorneda, Oleiros
Suevos-Pastoriza, Arteixo
Liáns, Oleiros
A Zapateira, Culleredo
Os Mallos, A Coruña
Iñás, Oleiros
Sagrada Familia, A Coruña
Eirís, A Coruña
Ciudad Vieja - Centro, A Coruña
Vioño, A Coruña
Maianca, Oleiros
Parroquias rurales, Culleredo
Loureda-Lañas-Barrañán, Arteixo
Vilaboa Sur, Culleredo
Oseiro, Arteixo
