# 1. Data Retrieval

**TODO:** Copy from 

The retrieved data is stored in two files: a CSV for manual inspection and a pickle (pkl) that preserves the dataframe as-is.

Loads necessary libraries for the EDA:

In [1]:
import os
import subprocess
from urllib.parse import quote

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

Global configs:

In [2]:
# Ask git for the top-level directory of the repository this notebook lives
# in; the command output ends with a newline, which is stripped below.
root_dir_ = subprocess.check_output(
    ["git", "rev-parse", "--show-toplevel"], text=True
)
ROOT_DIR = root_dir_.strip()

# All data files live in the "data" folder directly under the repo root.
DATA_DIR = os.path.join(ROOT_DIR, "data")

# Output locations for the retrieved dataframe (pickle and CSV variants).
DF_PKL_PATH, DF_CSV_PATH = (
    os.path.join(DATA_DIR, file_name_) for file_name_ in ("df.pkl", "df.csv")
)

After exploring the database in DBeaver, we join the house details with the house sales on the house id:

In [3]:
# SQL executed against the database: switches to the 'eda' schema, then
# returns every column of king_county_house_details together with the sale
# "date" and price from king_county_house_sales. The LEFT JOIN on
# kchd.id = kchs.house_id keeps every house-details row, even when no
# matching sales row exists.
QUERY = """
    SET SCHEMA 'eda';

    SELECT kchd.*, kchs."date", kchs.price 
        FROM 
            king_county_house_details kchd 
        LEFT JOIN
            king_county_house_sales kchs 
        ON kchd.id = kchs.house_id
    ;
"""

Loads data from the database and stores it in pandas dataframes:

In [4]:
# Make the variables of a local .env file available through os.getenv.
load_dotenv()

# Environment variable that provides each connection setting.
db_env_vars_ = {
    "database": "DATABASE",
    "user": "USER_DB",
    "password": "PASSWORD",
    "host": "HOST",
    "port": "PORT",
}

DB_CONFIG = {
    setting: os.getenv(env_var) for setting, env_var in db_env_vars_.items()
}

# Fail early with a readable message: an unset variable would otherwise be
# formatted into the URL as the literal text "None".
missing_env_vars_ = [
    db_env_vars_[setting]
    for setting, value in DB_CONFIG.items()
    if value is None
]
if missing_env_vars_:
    raise RuntimeError(
        "Missing database environment variable(s): "
        + ", ".join(missing_env_vars_)
    )

# Percent-encode the credentials so that reserved characters such as "@",
# ":" or "/" in the user name or password are not mistaken for URL
# delimiters. DB_CONFIG itself keeps the raw, unescaped values.
DB_STRING = (
    "postgresql://{user}:{password}@{host}:{port}/{database}"
    .format(
        **{
            **DB_CONFIG,
            "user": quote(DB_CONFIG["user"], safe=""),
            "password": quote(DB_CONFIG["password"], safe=""),
        }
    )
)

In [5]:
# Create the engine from the connection string and run QUERY on a connection
# that is closed again as soon as the dataframe has been fetched.
db = create_engine(DB_STRING)

with db.connect() as conn:
    df = pd.read_sql(sql=QUERY, con=conn)

Let us check that the data were retrieved.

In [6]:
# Print the column names, non-null counts and dtypes of the retrieved frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   bedrooms       21597 non-null  float64
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  float64
 4   sqft_lot       21597 non-null  float64
 5   floors         21597 non-null  float64
 6   waterfront     19206 non-null  float64
 7   view           21534 non-null  float64
 8   condition      21597 non-null  int64  
 9   grade          21597 non-null  int64  
 10  sqft_above     21597 non-null  float64
 11  sqft_basement  21145 non-null  float64
 12  yr_built       21597 non-null  int64  
 13  yr_renovated   17749 non-null  float64
 14  zipcode        21597 non-null  int64  
 15  lat            21597 non-null  float64
 16  long           21597 non-null  float64
 17  sqft_living15  21597 non-null  float64
 18  sqft_l

In [7]:
# Show ten randomly chosen rows. The fixed seed (arbitrary value) makes the
# same rows appear on every "Restart & Run All".
df.sample(10, random_state=42)



Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price
482,6189200125,3.0,3.0,2990.0,9773.0,2.0,0.0,0.0,4,8,...,0.0,1973,0.0,98005,47.6344,-122.174,2230.0,11553.0,2015-03-25,849950.0
16689,1623049145,2.0,1.0,880.0,9750.0,1.0,0.0,0.0,5,6,...,0.0,1938,0.0,98168,47.4885,-122.298,1220.0,9406.0,2014-09-25,210000.0
13355,9828702266,3.0,2.5,1480.0,1165.0,3.0,,0.0,3,8,...,0.0,2006,0.0,98144,47.6199,-122.3,1480.0,1231.0,2014-10-03,520000.0
18614,3501600215,2.0,1.0,1000.0,4800.0,1.0,0.0,0.0,3,6,...,0.0,1952,0.0,98117,47.6926,-122.362,1000.0,4800.0,2015-04-14,380000.0
731,2723089104,3.0,2.25,1540.0,17424.0,2.0,0.0,0.0,3,7,...,0.0,1992,0.0,98045,47.4429,-121.759,1560.0,11439.0,2014-09-17,315000.0
19440,2108500110,3.0,2.25,2120.0,9804.0,2.0,0.0,0.0,3,7,...,0.0,1994,0.0,98042,47.3596,-122.16,2120.0,7200.0,2015-04-15,278000.0
20674,3943600140,4.0,2.5,1812.0,5026.0,2.0,0.0,0.0,3,8,...,0.0,2011,0.0,98055,47.4513,-122.202,2440.0,6007.0,2015-03-02,370000.0
14194,6300500183,4.0,2.0,1780.0,5043.0,1.0,0.0,0.0,3,7,...,910.0,1993,,98133,47.7045,-122.342,1350.0,5044.0,2014-12-12,305000.0
2292,6121800060,3.0,1.5,1320.0,9750.0,1.0,0.0,0.0,3,7,...,0.0,1954,0.0,98148,47.4267,-122.331,1530.0,9750.0,2014-09-10,260000.0
15916,4139400630,3.0,2.5,2770.0,9136.0,2.0,0.0,0.0,3,10,...,0.0,1991,0.0,98006,47.5605,-122.115,2890.0,8442.0,2014-05-29,860000.0


Before storing, let us remove any duplicated columns, keeping only the first occurrence of each column name:

In [8]:
# Flag every column whose name has already appeared further to the left,
# then keep only the columns that are not flagged.
is_repeated_column_ = df.columns.duplicated()
df = df.loc[:, ~is_repeated_column_]


In [9]:
# Neither to_pickle nor to_csv creates missing parent directories, so make
# sure the data directory exists (e.g. on a fresh clone of the repository).
os.makedirs(DATA_DIR, exist_ok=True)

# Store the dataframe twice: as a pickle and as a CSV without the index column.
df.to_pickle(DF_PKL_PATH)
df.to_csv(DF_CSV_PATH, index=False)