# Scrape Data from Rightmove

In [1]:
# Load the nb_black IPython extension for this kernel session.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import logging
import os
import sys
from urllib.parse import urlencode

import pandas as pd
from jinja2 import Template

# Project modules live in /app; it must be on sys.path before they are imported.
sys.path.append("/app")

import scraping as sc
from db_utils import get_engine, get_table_creation_query

<IPython.core.display.Javascript object>

In [3]:
# global logger
# Configure the root logger so log records from imported modules show up in the
# notebook output at INFO level and above.
logger = logging.getLogger()
formatter = logging.Formatter("%(asctime)s [%(name)s] %(levelname)-8s %(message)s")

# Re-running this cell must not stack a second StreamHandler on the root logger
# (every record would then be printed twice), so the handler is tagged with a
# name and reused when it is already installed.
handler = next(
    (h for h in logger.handlers if h.get_name() == "notebook_stream"), None
)
if handler is None:
    handler = logging.StreamHandler()
    handler.set_name("notebook_stream")
    logger.addHandler(handler)

handler.setFormatter(formatter)
logger.setLevel(logging.INFO)

<IPython.core.display.Javascript object>

## Connect to Database

In [4]:
# Database credentials are read from the environment; os.getenv yields None
# for any variable that is not set.
user, password, host = map(
    os.getenv, ("POSTGRES_USER", "POSTGRES_PASSWORD", "POSTGRES_HOST")
)

<IPython.core.display.Javascript object>

In [5]:
# Build the database engine from the credentials read from the environment;
# later cells open their connections from it via engine.connect().
engine = get_engine(user, password, host)

<IPython.core.display.Javascript object>

## Define search parameters

In [6]:
# Rightmove "property to rent" search page; the filters below form its query string.
SEARCH_BASE_URL = "https://www.rightmove.co.uk/property-to-rent/find.html"

# Search filters, listed in the order they appear in the query string.
# Edit the values here instead of hand-editing a percent-encoded URL.
search_params = {
    "locationIdentifier": "REGION^70331",
    "maxBedrooms": 1,
    "minBedrooms": 0,
    "maxPrice": 2000,
    "minPrice": 500,
    "propertyTypes": "",
    "includeLetAgreed": "false",
    "mustHave": "",
    "dontShow": "",
    "furnishTypes": "furnished",
    # NOTE(review): a lone double quote (sent as %22) looks like a copy/paste
    # leftover rather than a real keyword filter; kept as-is, please confirm.
    "keywords": '"',
}

# urlencode percent-encodes each value ("^" -> %5E, '"' -> %22), which yields
# exactly the URL that used to be written out by hand.
url = f"{SEARCH_BASE_URL}?{urlencode(search_params)}"

<IPython.core.display.Javascript object>

In [7]:
# Name of this search; later cells interpolate it into SQL as the schema name.
searchname = "tamzin"

<IPython.core.display.Javascript object>

In [8]:
# Table that receives the scraped rows (created and appended to in later cells).
tablename = "raw_data"

<IPython.core.display.Javascript object>

## Create table

In [9]:
# Column name -> SQL type for the table that stores the raw scrape results.
cols = dict(
    price="INTEGER",
    type="VARCHAR(256)",
    address="VARCHAR(256)",
    url="TEXT",
    agent_url="TEXT",
    postcode="VARCHAR(32)",
    number_bedrooms="INTEGER",
    search_date="TIMESTAMP",
)

# Columns handed to get_table_creation_query as the index columns of that table.
index_cols = ["url", "search_date", "address"]

<IPython.core.display.Javascript object>

In [11]:
# Make sure the search's schema exists, then create the raw-data table inside it.
with engine.connect() as conn:
    schema_ddl = f"CREATE SCHEMA IF NOT EXISTS {searchname}"
    conn.execute(schema_ddl)

    table_ddl = get_table_creation_query(tablename, cols, searchname, index_cols)
    conn.execute(table_ddl)

<IPython.core.display.Javascript object>

## Scrape Data

In [12]:
# Run the scraper against the search URL; the result is written to the
# database with DataFrame.to_sql in the "Save to Database" cell.
df = sc.scrape_rightmove(url)

2021-05-21 10:55:25,190 [scraping] INFO     Starting RightMove scraping
2021-05-21 10:55:25,193 [scraping] INFO     Scraping from RightMove...
2021-05-21 10:55:29,712 [scraping] INFO     Fetched results


<IPython.core.display.Javascript object>

## Save to Database

In [13]:
# Append the scraped rows to <searchname>.<tablename>; the DataFrame index is
# not written as a column.
with engine.connect() as conn:
    df.to_sql(
        name=tablename,
        con=conn,
        schema=searchname,
        if_exists="append",
        index=False,
    )

<IPython.core.display.Javascript object>

## Create table for new data

In [14]:
# Column name -> SQL type for the new_data table.
cols_newdata = dict(
    property_id="INTEGER",
    address="VARCHAR(256)",
    url="TEXT",
)

<IPython.core.display.Javascript object>

In [15]:
# Columns passed to get_table_creation_query as the index columns of new_data.
# NOTE(review): "id" is not a key of cols_newdata (which declares "property_id").
# Presumably get_table_creation_query adds an "id" column on its own, since the
# later INSERT reads rd.id from raw_data — confirm, or index "property_id" instead.
index_cols_newdata = ["id", "address", "url"]

<IPython.core.display.Javascript object>

In [16]:
# Create the new_data table in the search's schema (the schema itself is
# created by the earlier "Create table" cell).
with engine.connect() as conn:
    new_data_ddl = get_table_creation_query(
        "new_data", cols_newdata, searchname, index_cols_newdata
    )
    conn.execute(new_data_ddl)

<IPython.core.display.Javascript object>

## Populate new data table

In [17]:
# Rebuild new_data so that it holds only the rows of the most recent scrape.
#
# The previous query joined raw_data against
#   SELECT search_date, MAX(search_date) ... GROUP BY search_date
# Grouping by the same column that is aggregated makes every search_date its
# own maximum, so the join matched every row and copied the whole table.
# Comparing against a single scalar MAX(search_date) selects the latest scrape.
# Assumes all rows written by one scrape share the same search_date — TODO confirm.
#
# engine.begin() runs both statements in one transaction: if the INSERT fails
# the DELETE is rolled back and new_data is not left empty.
with engine.begin() as conn:
    conn.execute(f"DELETE FROM {searchname}.new_data")
    conn.execute(
        f"""INSERT INTO {searchname}.new_data (property_id, url, address)
        SELECT rd.id AS property_id, rd.url, rd.address
        FROM {searchname}.{tablename} rd
        WHERE rd.search_date = (
            SELECT MAX(search_date) FROM {searchname}.{tablename}
        )"""
    )

<IPython.core.display.Javascript object>