In [1]:
import pandas as pd
import json 

### Defining rules

In [2]:
# DEFINITIONS
# Output field names used for the CSV columns and the JSON record keys.
POSTAL_CODE = "codigo_postal"
SUBURB = "colonia"
STATE = "estado"
MUNICIPALITY = "municipio"
CITY = "ciudad"

# Location of the source workbook and of the generated files.
INPUT_FOLDER = "original_database/"
INPUT_FILE = "CPdescarga.xls"
OUTPUT_FOLDER = "processed_database/"
OUTPUT_CSV_FILE = "codigos_postales.csv"
OUTPUT_JSON = "codigos_postales.json"

# Long state names found in the source -> the shorter name written to the output.
best_known_names_for_states = {
    "Coahuila de Zaragoza": "Coahuila",
    "Michoacán de Ocampo": "Michoacán",
    "Veracruz de Ignacio de la Llave": "Veracruz",
}

# Workbook sheets that are skipped when the list of states is built.
sheet_names_ignore = ["Nota"]

# Source column name -> output column name. Only the columns listed here are kept.
renaming_columns = {
    "d_codigo": POSTAL_CODE,
    "d_asenta": SUBURB,
    "d_estado": STATE,
    "D_mnpio": MUNICIPALITY,
    "d_ciudad": CITY,
}

### Reading source "database"

In [3]:
# Open the source workbook; its sheets are parsed individually further down.
source_path = INPUT_FOLDER + INPUT_FILE
xls = pd.ExcelFile(source_path)

### Getting state names from the sheet names

In [4]:
# Every sheet that is not in sheet_names_ignore is treated as a state.
states = [sheet for sheet in xls.sheet_names if sheet not in sheet_names_ignore]

# Creating CSV

### Merging information from all states in one dataframe

In [5]:
# Parse each state sheet once, renaming the source columns to the output names,
# and concatenate all of them in a single call. Calling pd.concat inside the loop
# copies the accumulated frame again on every iteration and starts from an empty
# frame, which newer pandas versions warn about.
expected_columns = list(renaming_columns.values())
state_frames = [xls.parse(state).rename(columns=renaming_columns) for state in states]
if state_frames:
    df = pd.concat(state_frames, ignore_index=True)
else:
    # No usable sheets: keep an empty frame that still exposes the expected columns.
    df = pd.DataFrame(columns=expected_columns)

# Expected columns come first (created empty if no sheet provided them),
# followed by any other columns found in the sheets, in order of appearance.
extra_columns = [column for column in df.columns if column not in expected_columns]
df = df.reindex(columns=expected_columns + extra_columns)

### Filtering selected columns 

In [6]:
# Keep only the renamed columns, in the order renaming_columns lists them.
selected_columns = [*renaming_columns.values()]
df = df[selected_columns]

### Filling empty values and updating state names

In [7]:
# Replace missing values with empty strings, then swap the long state names for
# their shorter forms.
# Results are assigned back instead of using inplace=True:
# `df[STATE].replace(..., inplace=True)` acts on a temporary Series, so with pandas
# Copy-on-Write the frame itself is never updated (pandas 2.x already warns about
# this chained in-place call).
df = df.fillna("")
df[STATE] = df[STATE].replace(best_known_names_for_states)

### Writing CSV File

In [8]:
# Order rows by postal code, then suburb, and write them as UTF-8 CSV
# without the index column.
csv_path = f"{OUTPUT_FOLDER}{OUTPUT_CSV_FILE}"
sort_keys = [POSTAL_CODE, SUBURB]
df.sort_values(by=sort_keys, inplace=True)
df.to_csv(csv_path, index=False, encoding='utf-8')

# Creating JSON

### Creating JSON Structure

In [10]:
# Build {"codigos_postales": {postal_code: [record, ...]}} in one pass over the frame.
# Grouping once replaces re-filtering the whole frame for every unique postal code.
# sort=False keeps the postal codes in their order of first appearance, and rows
# inside each group keep their order in df.
json_data = {"codigos_postales": {}}
record_fields = [SUBURB, STATE, MUNICIPALITY, CITY]
for postal_code, rows in df.groupby(POSTAL_CODE, sort=False):
    # int() turns the pandas/numpy key into a plain Python int for the JSON encoder.
    postal_code = int(postal_code)
    records = rows[record_fields].to_dict("records")
    # setdefault/extend appends if two keys ever collapse to the same int.
    json_data["codigos_postales"].setdefault(postal_code, []).extend(records)

In [11]:
# Write the structure as indented JSON in a UTF-8 file; ensure_ascii=False keeps
# non-ASCII characters as they are instead of escaping them.
json_path = f"{OUTPUT_FOLDER}{OUTPUT_JSON}"
with open(json_path, 'w+', encoding='utf-8') as json_file:
    serialized = json.dumps(json_data, indent=4, ensure_ascii=False)
    json_file.write(serialized)