<a href="https://colab.research.google.com/github/ipeirotis/datasets/blob/master/Real_Estate_Data_Parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade the database packages (PyMySQL, SQLAlchemy, sql_magic) and
# the Google Cloud Secret Manager client; -q keeps pip's output quiet.
!sudo pip3 install -U -q PyMySQL sqlalchemy sql_magic
!pip install -q google-cloud-secret-manager
import os
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Authenticate this Colab session with the user's Google account.
# NOTE(review): presumably this is what lets the Secret Manager client used
# later in the notebook obtain credentials -- confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
from google.cloud import secretmanager


def access_secret_version(project_id, secret_id, version_id):
    """
    Fetch one version of a Secret Manager secret and return its payload as text.

    Args:
        project_id (str): Google Cloud project ID that owns the secret.
        secret_id (str): ID of the secret to read.
        version_id (str): ID of the version to read.
    Returns:
        str: The version's payload bytes decoded as UTF-8.

    No errors are handled here: anything raised by the client call
    propagates to the caller.
    """
    resource_name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}"
    secret_version = secretmanager.SecretManagerServiceClient().access_secret_version(
        request={"name": resource_name}
    )
    raw_payload = secret_version.payload.data
    return raw_payload.decode("UTF-8")


# Read the "latest" version of the MYSQL_PASSWORD secret stored in the
# nyu-datasets project.
mysql_password = access_secret_version("nyu-datasets", "MYSQL_PASSWORD", "latest")

In [None]:


from urllib.parse import quote_plus

# Build the SQLAlchemy URL for the MySQL server. The password is URL-escaped
# because characters such as '@', ':', '/' or '%' inside it would otherwise
# be parsed as part of the URL structure and break the connection.
conn_string = 'mysql+pymysql://{user}:{password}@{host}/?charset=utf8mb4'.format(
    host='db.ipeirotis.org',
    user='root',
    password=quote_plus(mysql_password),
)

# No database is named in the URL; statements qualify the schema explicitly.
engine = create_engine(conn_string)


In [None]:
# Name of the database that holds the listings table
db_name = 'real_estate'

# Engine.execute() was removed in SQLAlchemy 2.0 (and the install cell pulls
# the latest release with -U), so run the statements on an explicit
# connection. exec_driver_sql() hands the SQL string to the driver as written.
with engine.begin() as conn:
    # Uncomment to rebuild the database from scratch
    # conn.exec_driver_sql(f"DROP DATABASE IF EXISTS {db_name}")

    # Create the database
    sql = f"CREATE DATABASE IF NOT EXISTS {db_name} DEFAULT CHARACTER SET 'utf8mb4'"
    conn.exec_driver_sql(sql)

    # And let's switch to the database.
    # NOTE(review): USE only affects the connection it runs on; the other
    # cells name the schema explicitly (real_estate.listings, schema=db_name),
    # so they do not appear to rely on it -- confirm.
    sql = f"USE {db_name}"
    conn.exec_driver_sql(sql)

In [None]:
def load_data(url):
    """
    Read a tab-separated listings file and return a cleaned DataFrame.

    Args:
        url: Anything ``pd.read_csv`` accepts (path, URL or file-like object)
            that yields a TSV file containing the listing columns below.
    Returns:
        pandas.DataFrame: One row per listing that has a ``property_id``,
        restricted to the listing columns, with numeric, date and boolean
        columns converted and negative counts/amounts replaced by NaN.
    Raises:
        KeyError: If one of the expected columns is missing from the file.
        ValueError: If a numeric column holds a value that cannot be parsed.
    """
    columns = [
        "geo_id",
        "id",
        "property_id",
        "building_id",
        "property_type",
        "status",
        "sales_transaction_type",
        "current_price",
        "bedrooms",
        "bathrooms",
        "square_feet",
        "price_per_square_foot",
        "street_number",
        "street",
        "unit_number",
        "city",
        "state",
        "zipcode",
        "county",
        "close_date",
        "close_price",
        "contract_date",
        "contract_price",
        "list_date",
        "list_price",
        "date_exit_market",
        "days_on_market",
        "cumulative_days_on_market",
        "latitude",
        "longitude",
        "building_year_opened",
        "is_public_record",
    ]
    numeric_columns = [
        "current_price",
        "close_price",
        "contract_price",
        "list_price",
        "days_on_market",
        "cumulative_days_on_market",
        "latitude",
        "longitude",
        "square_feet",
        "price_per_square_foot",
        "bedrooms",
        "bathrooms",
    ]
    date_columns = ["close_date", "contract_date", "list_date", "date_exit_market"]
    # Columns where a negative value cannot be real. Latitude and longitude
    # are deliberately absent: they are legitimately negative.
    non_negative_columns = [
        "days_on_market",
        "cumulative_days_on_market",
        "current_price",
        "close_price",
        "contract_price",
        "list_price",
        "square_feet",
        "price_per_square_foot",
        "bedrooms",
        "bathrooms",
    ]

    # Read every field as text so each conversion below is explicit.
    df = pd.read_csv(url, sep='\t', dtype=str)

    # Rows without a property_id cannot be identified; drop them.
    df = df.dropna(subset=["property_id"])

    # Keep only the listing columns (this also discards extras such as
    # "Unnamed: 0" and "state.1"). The copy makes the result an independent
    # frame, so the column assignments below are unambiguous.
    df = df[columns].copy()

    df["property_id"] = pd.to_numeric(df["property_id"]).astype("int64")

    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column])

    # Nullable integer dtype: the year may be missing.
    df["building_year_opened"] = pd.to_numeric(df["building_year_opened"]).astype("Int64")

    # Unparseable dates become missing values instead of raising.
    for column in date_columns:
        df[column] = pd.to_datetime(df[column], errors="coerce").dt.date

    # Blank out negative values.
    for column in non_negative_columns:
        df[column] = df[column].mask(df[column] < 0)

    # Text flags to booleans; any other value becomes missing.
    df["is_public_record"] = df["is_public_record"].map({"True": True, "False": False})

    return df

In [None]:
# DDL for the listings table; `id` is the primary key.
# NOTE(review): MySQL FLOAT is single precision (about 7 significant digits),
# which may round latitude/longitude and large prices -- consider DOUBLE or
# DECIMAL if exact values matter.
create_table_sql = '''
CREATE TABLE real_estate.listings (
    id char(20),
    geo_id char(10),
    property_id char(25),
    building_id varchar(100),
    property_type varchar(100),
    status varchar(60),
    sales_transaction_type  varchar(60),
    current_price float,
    bedrooms float,
    bathrooms float,
    square_feet float,
    price_per_square_foot float,
    street_number varchar(50),
    street varchar(100),
    unit_number varchar(50),
    city varchar(50),
    state varchar(50),
    zipcode varchar(10),
    county varchar(50),
    close_date date,
    close_price float,
    contract_date date,
    contract_price float,
    list_date date,
    list_price float,
    date_exit_market date,
    days_on_market float,
    cumulative_days_on_market float,
    latitude float,
    longitude float,
    building_year_opened int,
    is_public_record boolean,
  PRIMARY KEY (id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
'''

# Engine.execute() was removed in SQLAlchemy 2.0; run the DDL on an explicit
# connection instead. There is no IF NOT EXISTS, so this fails if the table
# already exists.
with engine.begin() as conn:
    conn.exec_driver_sql(create_table_sql)

In [None]:
# Grant SELECT on every table of real_estate to the 'student' account from
# any host.
# NOTE(review): the percent sign is doubled presumably because the driver
# applies %-style parameter formatting to the statement, which collapses
# '%%' to the single '%' host wildcard -- confirm before changing it.
sql = '''
GRANT SELECT ON real_estate.* TO 'student'@'%%'
'''

# Engine.execute() was removed in SQLAlchemy 2.0; exec_driver_sql() passes the
# string to the driver the same way the old call did.
with engine.begin() as conn:
    conn.exec_driver_sql(sql)

In [None]:
# Download the LA listings archive into the working directory as
# la_listings.tsv.gz, the local file name listed in `urls` in the load cell.
!curl https://storage.googleapis.com/datasets_nyu/la_listings.tsv.gz -o la_listings.tsv.gz

In [None]:
%%time

# Files to load. Only the locally downloaded LA file is enabled; the other
# cities can be loaded by uncommenting their URLs.
urls = [
  # 'https://storage.googleapis.com/datasets_nyu/nyc_listings.tsv.gz',
  # 'https://storage.googleapis.com/datasets_nyu/sf_listings.tsv.gz',
  # 'https://storage.googleapis.com/datasets_nyu/boston_listings.tsv.gz',
  # 'https://storage.googleapis.com/datasets_nyu/seattle_listings.tsv.gz',
  'la_listings.tsv.gz'
]

# Rows handed to each to_sql call; batching is what lets tqdm show progress.
batchsize = 10000

for url in urls:
    print(url)
    df = load_data(url)
    print("Entries", len(df))

    # Stepping by batchsize yields exactly the batches that contain rows: no
    # trailing empty batch when len(df) is a multiple of batchsize, and no
    # batch at all for an empty frame.
    for start in tqdm(range(0, len(df), batchsize)):
        with engine.connect() as con:
            # iloc slices by row position regardless of the index labels.
            df.iloc[start:start + batchsize].to_sql(
                name='listings',
                schema=db_name,
                con=con,
                if_exists='append',
                index=False,
                chunksize=1000,
            )


In [None]:
# Collapse spelling variants of property_type onto one canonical label each.
# Only exact matches are rewritten; every other value is left as it is.
property_type_aliases = {
    "Multi-Family": "Multi Family",
    "Multi family": "Multi Family",
    "Multi Family,Townhouse": "Multi Family",
    "Co-Op": "Co-op",
    "Mobile": "Mobile/Manufactured",
    "Rentals": "Rental",
    "Commercial": "Non-Residential",
    "Condo|Co-op": "Condop",
}
df["property_type"] = df["property_type"].replace(property_type_aliases)

In [None]:
# Record listings reported with 1.5 bedrooms as one-bedroom.
df.loc[df["bedrooms"] == 1.5, "bedrooms"] = 1.0

In [None]:
# Show how many rows carry each property_type value.
df["property_type"].value_counts()

In [None]:
# For each object-dtype column whose name does not contain "date", print the
# column name and the length of its longest string.
for column_name, column_dtype in df.dtypes.items():
    if column_dtype != 'object' or 'date' in column_name:
        continue
    longest = df[column_name].str.len().max()
    print(column_name)
    print( longest)

In [None]:
# Display the dtype of every column in the most recently loaded frame.
df.dtypes