In [38]:
import getpass
import os
from urllib.parse import quote

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from shapely.geometry import Point
from sqlalchemy import create_engine

# Notebook-wide pandas settings:
#   * mode.chained_assignment = None turns off the SettingWithCopy warning
#   * display.max_columns = None shows every column when a frame is displayed
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.max_columns', None)

In [39]:
# ====== Connection ======
# Build the SQLAlchemy engines for the local and the remote PostgreSQL databases.
#
# Connection settings are read from DS4A_* environment variables so that
# credentials do not have to be stored in the notebook. Non-secret settings
# fall back to the values this notebook has always used.


def _postgres_url(username, password, host, port, database):
    """Return a ``postgresql://`` URL suitable for ``create_engine``.

    The user name and the password are percent-encoded so that characters
    such as ``@``, ``:`` or ``/`` inside them cannot break the URL structure.
    """
    user_part = quote(username, safe='')
    password_part = quote(password, safe='')
    return f'postgresql://{user_part}:{password_part}@{host}:{port}/{database}'


# Local
local_username = os.environ.get('DS4A_LOCAL_DB_USER', 'ds4a_final')
# No default for the password: take it from the environment, otherwise ask for it.
local_password = (os.environ.get('DS4A_LOCAL_DB_PASSWORD')
                  or getpass.getpass('Local PostgreSQL password: '))
local_host = os.environ.get('DS4A_LOCAL_DB_HOST', 'localhost')
local_port = os.environ.get('DS4A_LOCAL_DB_PORT', '5432')
local_database = os.environ.get('DS4A_LOCAL_DB_NAME', 'ds4a_finalproject')

local_engine = create_engine(
    _postgres_url(local_username, local_password, local_host, local_port, local_database),
    echo=False,
)

# Remote
# The '******' fallbacks are the redacted placeholders already present in this
# notebook; set DS4A_REMOTE_DB_HOST / DS4A_REMOTE_DB_PASSWORD to real values
# before using remote_engine.
remote_username = os.environ.get('DS4A_REMOTE_DB_USER', 'final_project')
remote_password = os.environ.get('DS4A_REMOTE_DB_PASSWORD', '******')
remote_host = os.environ.get('DS4A_REMOTE_DB_HOST', '******')
remote_port = os.environ.get('DS4A_REMOTE_DB_PORT', '5432')
remote_database = os.environ.get('DS4A_REMOTE_DB_NAME', 'ds4a_final')

remote_engine = create_engine(
    _postgres_url(remote_username, remote_password, remote_host, remote_port, remote_database),
    echo=False,
)

In [3]:
# Make the parent of the notebook's start-up directory the working directory
# (the next cell takes the working directory as the project base path).
# The start-up directory is remembered the first time this cell runs, so
# executing the cell again returns to the same place instead of climbing one
# more level up each time.
try:
    NOTEBOOK_DIR
except NameError:
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [40]:
# Base directory of the project (the current working directory) and the two
# sub-directories derived from it.
# The paths are assembled from individual components so the separator always
# matches the host OS; the final empty component keeps the trailing separator
# that the previous hard-coded Windows strings ended with.
basepath = os.getcwd()
ds_path = os.path.join(basepath, 'datasets', 'final', '')
pj_path = os.path.join(basepath, 'final', '')

In [41]:
# Read all information from datasets
# Paths are built component by component (no hard-coded '\\' separators and no
# '\M' escape inside a string literal), so they resolve on any operating system.
dane_dir = os.path.join(ds_path, 'raw', 'DANE')
census_dir = os.path.join(dane_dir, '08_Atlantico')

# Code columns are read as text so values such as '001' keep their leading zeros.
code_dtypes = {'U_DPTO': str, 'U_MPIO': str}
mgn_dtypes = {
    **code_dtypes,
    'UA_CLASE': str,
    'UA1_LOCALIDAD': str,
    'U_SECT_RUR': str,
    'U_SECC_RUR': str,
    'UA2_CPOB': str,
    'U_SECT_URB': str,
    'U_SECC_URB': str,
    'U_MZA': str,
}

# CNPV2018 CSV tables (VIV, HOG, PER and MGN files)
df_viv = pd.read_csv(os.path.join(census_dir, 'CNPV2018_1VIV_A1_08.csv'), sep=',', dtype=code_dtypes)
df_hog = pd.read_csv(os.path.join(census_dir, 'CNPV2018_2HOG_A1_08.csv'), sep=',', dtype=code_dtypes)
df_per = pd.read_csv(os.path.join(census_dir, 'CNPV2018_5PER_A1_08.csv'), sep=',', dtype=code_dtypes)
df_mgn = pd.read_csv(os.path.join(census_dir, 'CNPV2018_MGN_A1_08.csv'), sep=',', dtype=mgn_dtypes)

# Urban sector shapefile (read with geopandas)
df_sec = gpd.read_file(os.path.join(dane_dir, 'MGN2017_08_ATLANTICO', 'MGN', 'MGN_URB_SECTOR.shp'))

# Table that carries `setu_ccnct`, used later to attach neighbourhood data to the sectors
df_dane_barrio = pd.read_csv(os.path.join(ds_path, 'processed', 'neighborhood_dane.csv'), sep=',', dtype={'setu_ccnct': str})

In [42]:
# Normalise every column name to lowercase
df_viv.columns = [name.lower() for name in df_viv.columns]
df_hog.columns = [name.lower() for name in df_hog.columns]
df_per.columns = [name.lower() for name in df_per.columns]
df_mgn.columns = [name.lower() for name in df_mgn.columns]
df_sec.columns = [name.lower() for name in df_sec.columns]

# Keep only the rows of municipality '001'; the sector layer stores the same
# municipality with a two-character prefix, as '08001'.
df_viv = df_viv.loc[df_viv['u_mpio'].eq('001')]
df_hog = df_hog.loc[df_hog['u_mpio'].eq('001')]
df_per = df_per.loc[df_per['u_mpio'].eq('001')]
df_mgn = df_mgn.loc[df_mgn['u_mpio'].eq('001')]
df_sec = df_sec.loc[df_sec['mpio_ccdgo'].eq('08001')]

In [43]:
# Remove the leading characters of two sector code columns:
# `mpio_ccdgo` loses its first 2 characters, `cpob_ccdgo` its first 5.
# NOTE: this cell is not re-runnable - executing it twice strips the prefixes twice.
prefix_lengths = {'mpio_ccdgo': 2, 'cpob_ccdgo': 5}
for column, n_chars in prefix_lengths.items():
    df_sec[column] = df_sec[column].map(lambda code, n=n_chars: code[n:])

In [44]:
# Attach the neighbourhood table to the sectors; only sectors whose
# `setu_ccnct` appears in both tables are kept (author's note: 152 records).
df_sec = pd.merge(
    left=df_sec,
    right=df_dane_barrio,
    how='inner',
    on='setu_ccnct',
)

In [53]:
# Build the same composite key on both tables by concatenating, in order, the
# code columns listed below, then match MGN rows to sectors on that key.
sec_key_parts = ['dpto_ccdgo', 'mpio_ccdgo', 'clas_ccdgo', 'setr_ccdgo', 'secr_ccdgo', 'cpob_ccdgo', 'setu_ccdgo']
mgn_key_parts = ['u_dpto', 'u_mpio', 'ua_clase', 'u_sect_rur', 'u_secc_rur', 'ua2_cpob', 'u_sect_urb']

for frame, parts in ((df_sec, sec_key_parts), (df_mgn, mgn_key_parts)):
    key = frame[parts[0]]
    for part in parts[1:]:
        key = key + frame[part]
    frame['key'] = key

# Author's note: 344463 records match - 2661 don't match
df_match = pd.merge(
    left=df_mgn,
    right=df_sec,
    how='inner',
    on='key',
    suffixes=('_x', '_y'),
)

In [54]:
# Reduce the match table to the `cod_encuestas` -> `barrio_id` lookup and
# store the neighbourhood id as an integer.
df_match = df_match.loc[:, ['cod_encuestas', 'barrio_id']]
df_match = df_match.assign(barrio_id=df_match['barrio_id'].astype(int))

In [56]:
# Ages
# Unit-wide, left-closed bins [k, k+1) turn the integer code k in `p_edad`
# into the k-th label: 1 -> 'P_00-04', 2 -> 'P_05-09', ..., 21 -> 'P_100+'.
# Values outside 1..21 become NaN.
# NOTE(review): this presumes `p_edad` holds an age-group code rather than an
# age in years - confirm against the data dictionary.
labels = [f'P_{start:02d}-{start + 4:02d}' for start in range(0, 100, 5)] + ['P_100+']
bins = list(range(1, len(labels) + 2))
df_per['rango_edad'] = pd.cut(df_per['p_edad'], bins=bins, labels=labels, right=False)

# Gender
# Same technique: code 1 -> 'Hombre', code 2 -> 'Mujer'.
labels = ['Hombre', 'Mujer']
bins = list(range(1, len(labels) + 2))
df_per['sexo'] = pd.cut(df_per['p_sexo'], bins=bins, labels=labels, right=False)

In [57]:
# Attach the neighbourhood id to the VIV, HOG and PER tables through
# `cod_encuestas`; rows without a match are dropped (inner join).
df_vivienda = df_match.merge(df_viv, on='cod_encuestas', how='inner')
df_hogar = df_match.merge(df_hog, on='cod_encuestas', how='inner')
df_personas = df_match.merge(df_per, on='cod_encuestas', how='inner')

# Mean `va1_estrato` per neighbourhood.
# NOTE(review): the column is averaged as-is; if it contains sentinel codes
# (e.g. for "unknown") they would distort the mean - confirm.
df_vivienda = df_vivienda.groupby('barrio_id').agg({'va1_estrato': 'mean'}).reset_index()

# Sum of `ha_tot_per` per neighbourhood.
df_hogar = df_hogar.groupby('barrio_id').agg({'ha_tot_per': 'sum'}).reset_index()

# Number of person records per neighbourhood, sex and age range.
# `sexo` and `rango_edad` are categorical (produced by pd.cut), and the pandas
# default for `observed` differs between versions. It is pinned to False so the
# result always contains every category combination, as it did with the
# historical default.
df_personas = (
    df_personas
    .groupby(['barrio_id', 'sexo', 'rango_edad'], observed=False)
    .agg({'p_nro_per': 'count'})
    .reset_index()
)

In [63]:
from functools import reduce


def _person_total(people, column_name):
    """Sum `p_nro_per` per `barrio_id` and return it in a column named `column_name`."""
    totals = people.groupby('barrio_id').agg({'p_nro_per': 'sum'}).reset_index(drop=False)
    return totals.rename(columns={'p_nro_per': column_name})


def _merge_on_barrio(left, right):
    """Join two per-neighbourhood tables on `barrio_id` (pandas' default inner join)."""
    return pd.merge(left, right, on='barrio_id')


# Per-neighbourhood tables to combine: mean estrato, household sum, and the
# person totals overall, for 'Hombre' and for 'Mujer'.
hombres = df_personas[df_personas['sexo'] == 'Hombre']
mujeres = df_personas[df_personas['sexo'] == 'Mujer']
dfs = [
    df_vivienda,
    df_hogar,
    _person_total(df_personas, 'total_personas'),
    _person_total(hombres, 'total_hombres'),
    _person_total(mujeres, 'total_mujeres'),
]

df_final = reduce(_merge_on_barrio, dfs)
# `ha_tot_per` takes part in the join above but is not stored.
df_final = df_final.drop(columns='ha_tot_per')

# Rows are appended: running this cell again inserts the same rows a second time.
df_final.to_sql(name='barrio_demografia', con=local_engine, if_exists='append', index=False)

In [66]:
# Use the current row index as `detail_id` and append the detail rows
# (neighbourhood / sex / age range counts) to the `barrio_demografia_detail` table.
# Rows are appended: running this cell again inserts the same rows a second time.
df_personas = df_personas.assign(detail_id=df_personas.index)
df_personas.to_sql(
    name='barrio_demografia_detail',
    con=local_engine,
    if_exists='append',
    index=False,
)