In [1]:
import psycopg2
from psycopg2 import Error
import os
import sys
import pandas as pd
import numpy as np
import datetime

### Auxiliary functions

In [2]:
def get_cols_dict(df):
    """Return a dict mapping each column label of ``df`` to its position.

    Args:
        df: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame).

    Returns:
        dict: ``{column_label: zero_based_position}``. If a label is repeated,
        the position of its last occurrence is kept.
    """
    return {label: position for position, label in enumerate(df.columns)}

In [3]:
def modificar_fin_vivienda(df, cols, cur=None):
    """Set ``fin_fecha`` on open ``registro`` rows whose id is absent from ``df``.

    Reads the ids of every ``registro`` row with a null ``fin_fecha`` and, for
    each id that does not appear in the id column of ``df``, sets ``fin_fecha``
    to the current timestamp.

    Args:
        df: 2-D NumPy array of rows (as produced by ``DataFrame.to_numpy()``).
        cols: Dict mapping column name to column position in ``df``. The
            ``'id'`` entry locates the id column; position 0 is used when the
            key is missing.
        cur: Optional DB-API cursor to run the statements on. Defaults to the
            notebook-level ``cursor``.

    Returns:
        int: Number of database errors hit (0 on success, 1 if the select or
        any update raised; remaining updates are skipped in that case).
    """
    if cur is None:
        cur = cursor  # notebook-level cursor created in the connection cell

    n_errors = 0
    ids = set(df[:, cols.get('id', 0)])
    # One timestamp for the whole batch so every closed row gets the same value.
    ahora = datetime.datetime.now()
    try:
        cur.execute('select id from registro where fin_fecha is null')
        ids_bd = {fila[0] for fila in cur.fetchall()}
        for vivienda_id in ids_bd - ids:
            cur.execute(
                'update registro set fin_fecha = %s where id = %s',
                (ahora, vivienda_id),
            )
    except Exception as e:
        # Best-effort: report the failure instead of aborting the notebook run.
        n_errors += 1
        print('Error updating fin_fecha:', e)
    return n_errors

In [4]:
def insertar_bd(file, src_folder):
    """Load one CSV file and pass every row to the ``insert_vivienda`` SQL function.

    After all rows have been sent, ``modificar_fin_vivienda`` is called with the
    loaded rows. Statements run on the notebook-level ``cursor``.

    Args:
        file: Name of the CSV file inside ``src_folder``.
        src_folder: Directory that contains ``file``.

    Returns:
        int: Number of rows whose insert raised an exception.
    """
    n_errors = 0
    df = pd.read_csv(os.path.join(src_folder, file))
    # psycopg2 sends None as SQL NULL. Cast to object first so that NaN in
    # numeric columns can really be replaced by None instead of staying NaN.
    df = df.astype(object).where(df.notna(), None)
    cols = get_cols_dict(df)
    df = df.to_numpy()

    query = 'SELECT * FROM insert_vivienda (%s::integer,%s::text,%s::integer,%s::integer,%s::text,%s::text,%s::text,%s::integer,%s::money,%s::varchar);'
    # CSV columns in the order the SQL function expects its arguments.
    campos = (
        'id', 'title', 'n_rooms', 'size', 'description',
        'extra_info', 'image', 'img_no', 'price', 'price_form',
    )

    for posicion, row in enumerate(df):
        try:
            cursor.execute(query, tuple(row[cols[campo]] for campo in campos))
            cursor.fetchone()  # consume the function's result row
        except Exception as e:
            # Keep going with the remaining rows, but say what went wrong.
            n_errors += 1
            print('Error inserting row', posicion, 'of', file, ':', e)
    print(n_errors, 'errors')
    modificar_fin_vivienda(df, cols)
    return n_errors

### Main code

Open connection

In [5]:
# Open the PostgreSQL connection and the cursor used by the rest of the notebook.
# Settings are read from the standard libpq environment variables when present;
# the fallbacks are the notebook's original local-development values.
try:
    connection = psycopg2.connect(
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "1111"),
        host=os.environ.get("PGHOST", "localhost"),
        port=os.environ.get("PGPORT", "5432"),
        database=os.environ.get("PGDATABASE", "corunaRealEstateMarket"),
    )
    # Every statement is committed immediately; no explicit commit() is issued.
    connection.autocommit = True
    cursor = connection.cursor()

except Exception as error:
    print("Error while connecting to PostgreSQL", error)
    # Stop here: without a connection the following cells would only fail
    # later with a confusing NameError on `cursor`.
    raise

### Insert data

In [6]:
# Folder containing the CSV files that the cells below read and load.
src_folder = './data/idealista/'

In [7]:
# Load every entry of src_folder, except the first five returned by
# os.listdir, into the database.
# NOTE(review): os.listdir() returns names in arbitrary order, so which five
# entries are skipped is not guaranteed to be stable — confirm the intent of [5:].
entries = os.listdir(src_folder)[5:]
for file in entries:
    insertar_bd(file, src_folder)

0 errors


### Assign a municipio to each listing (the municipio name is extracted from the 'title' field)

In [8]:
# Municipio names, lower-cased so they can be compared against lower-cased titles.
municipios = [
    nombre.lower()
    for nombre in pd.read_csv('./data/ayuntamientos/municipios.csv')["Municipio"].values.tolist()
]

Check that a municipio can be extracted from the title of every listing

In [9]:
# Count the listings whose title does not contain any known municipio name.
count_sin_municipio = 0
for file in os.listdir(src_folder)[5:]:
    df = pd.read_csv(os.path.join(src_folder, file), usecols=['id', 'title'])
    for title in df['title']:
        # A missing title is read as NaN (a float); count it as "no municipio"
        # instead of failing on .lower().
        title = title.lower() if isinstance(title, str) else ''
        if not any(mun in title for mun in municipios):
            count_sin_municipio += 1

print('Cantidad de viviendas sin municipio: ', count_sin_municipio)

Cantidad de viviendas sin municipio:  0


### Update municipio

In [10]:
# Store, for every listing, the first municipio name found in its title.
query = 'update viviendas set municipio = %s where id = %s'

for file in os.listdir(src_folder)[5:]:
    df = pd.read_csv(os.path.join(src_folder, file), usecols=['id', 'title'])
    # Select the columns by name: usecols does not reorder columns, so
    # positional access would depend on the column order inside the CSV.
    for vivienda_id, title in df[['id', 'title']].itertuples(index=False, name=None):
        # A missing title is read as NaN (a float); treat it as an empty title.
        title = title.lower() if isinstance(title, str) else ''
        # First matching municipio, or None (stored as NULL) when none matches.
        municipio = next((mun for mun in municipios if mun in title), None)
        cursor.execute(query, (municipio, vivienda_id))

Close connection

In [11]:
# Release the PostgreSQL connection created earlier in this notebook.
connection.close()