In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
 

def clean_df(csv_path="../data/King_County_House_prices_dataset.csv"):
    """Load the King County house-price CSV and return a cleaned frame.

    Cleaning steps, in order:

    1. Keep only rows whose bedrooms-to-bathrooms ratio is at most 10
       (rows where that ratio is NaN or infinite fail the comparison and
       are dropped as well).
    2. Recompute ``sqft_basement`` as ``sqft_living - sqft_above``; whatever
       the raw column held (including ``'?'`` placeholders) is discarded.
    3. Fill missing ``view`` and ``waterfront`` values with 0.
    4. Add ``last_known_change``: the renovation year when one is recorded
       (neither missing nor 0), otherwise the construction year. The source
       columns ``yr_renovated`` and ``yr_built`` are then dropped.
    5. Parse ``date`` with ``pd.to_datetime``.

    Parameters
    ----------
    csv_path : str or path-like, optional
        Location of the raw CSV. Defaults to the relative path this
        notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Rows keep their original index labels, so the
        index has gaps where rows were filtered out.
    """
    raw = pd.read_csv(csv_path)

    # .loc[...].copy() yields an independent frame, so the column
    # assignments below never write through to (or warn about) ``raw``.
    plausible_ratio = (raw['bedrooms'] / raw['bathrooms']) <= 10
    cleaned = raw.loc[plausible_ratio].copy()

    # The raw column contains '?' placeholders, so derive the value instead
    # of parsing it. Stored as float, matching the float cast the raw
    # column previously received before being overwritten.
    cleaned['sqft_basement'] = (cleaned['sqft_living'] - cleaned['sqft_above']).astype(float)

    # Assign the filled columns back instead of calling
    # ``fillna(inplace=True)`` on a column selection: with pandas
    # copy-on-write that form fills a temporary and leaves the frame as is.
    cleaned['view'] = cleaned['view'].fillna(0)
    cleaned['waterfront'] = cleaned['waterfront'].fillna(0)

    # Vectorised "renovation year if known, else build year".
    renovated = cleaned['yr_renovated']
    has_renovation = renovated.notna() & (renovated != 0)
    cleaned['last_known_change'] = renovated.where(has_renovation, cleaned['yr_built']).astype('int64')
    cleaned = cleaned.drop(columns=['yr_renovated', 'yr_built'])

    cleaned['date'] = pd.to_datetime(cleaned['date'])

    return cleaned

In [2]:
# Preview the first five rows of the cleaned frame.
clean_df().head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,zipcode,lat,long,sqft_living15,sqft_lot15,last_known_change
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,3,7,1180,0.0,98178,47.5112,-122.257,1340,5650,1955
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,400.0,98125,47.721,-122.319,1690,7639,1991
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,0.0,98028,47.7379,-122.233,2720,8062,1933
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,910.0,98136,47.5208,-122.393,1360,5000,1965
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,0.0,98074,47.6168,-122.045,1800,7503,1987


In [4]:
from pydantic import BaseModel

class DataValidation(BaseModel):
    """Row-level schema for the frame returned by ``feature_engineering``.

    One instance describes a single record; ``data_validation`` checks a
    whole frame by validating its rows as ``list[DataValidation]``.
    """
    # Sale identity and price.
    id: int
    date: datetime
    price: float 
    # Physical attributes of the house.
    bedrooms: int 
    bathrooms: float  
    sqft_living: int 
    sqft_lot: int 
    floors: float
    # Float-typed: clean_df fills missing values in these two columns with 0.
    waterfront: float
    view: float 
    condition: int 
    grade: int  
    sqft_above: int 
    # Recomputed in clean_df as sqft_living - sqft_above.
    sqft_basement: float 
    # Added in clean_df: renovation year if recorded, otherwise build year.
    last_known_change: int
    # Location.
    zipcode: int 
    lat: float 
    long: float  
    sqft_living15: int 
    sqft_lot15: int 
    # Columns added by feature_engineering.
    sqft_price: float
    delta_lat: float
    delta_long: float
    center_distance: float
    water_distance: float

def data_validation(df: pd.DataFrame, data_schema) -> pd.DataFrame:
    """Validate every row of ``df`` against ``data_schema``.

    A throw-away pydantic model with a single ``list[data_schema]`` field is
    declared, the frame is converted to one dict per row, and the model is
    instantiated with those dicts so pydantic checks each record.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows should be checked.
    data_schema
        Schema class used as the list element type (``DataValidation`` in
        this notebook).

    Returns
    -------
    pandas.DataFrame
        The very same ``df`` object that was passed in; the validated model
        instance is discarded. Any error raised while building the model
        propagates to the caller.
    """
    class DataframeValidation(BaseModel):
        df_as_dict: list[data_schema]

    records = df.to_dict(orient='records')
    # Instantiation is the validation step; the resulting object is not kept.
    DataframeValidation(df_as_dict=records)
    return df

def distance_to_water(long, lat, ref_long, ref_lat):
    """Approximate the distance between a point and a reference point.

    All four coordinates are in degrees. The longitude offset is scaled by
    ``cos(ref_lat)``, combined with the latitude offset as a straight-line
    (Pythagorean) distance in degrees, and then converted to arc length on
    a sphere of radius 6378 (presumably kilometres - confirm).

    Only arithmetic and numpy ufuncs are used, so scalars and numpy arrays
    (including broadcastable shapes) are both accepted.
    """
    lat_offset = lat - ref_lat
    # Shrink the east-west offset by the cosine of the reference latitude.
    long_offset = (long - ref_long) * np.cos(np.radians(ref_lat))
    degrees_apart = (long_offset**2 + lat_offset**2)**(1/2)
    # degrees -> arc length: circumference (2*pi*6378) divided by 360.
    return degrees_apart*2*np.pi*6378/360

def feature_engineering():
    """Build the engineered feature frame on top of the cleaned house data.

    Starting from ``clean_df()``, these columns are added:

    * ``sqft_price`` - price divided by the combined ``sqft_living`` and
      ``sqft_lot`` area, rounded to 2 decimals.
    * ``delta_lat`` / ``delta_long`` - absolute offset in degrees from the
      reference point (47.62774, -122.24194).
    * ``center_distance`` - distance from that reference point, using the
      same cosine-corrected formula and scaling as ``distance_to_water``.
    * ``water_distance`` - smallest ``distance_to_water`` value between the
      house and any row flagged ``waterfront == 1``.

    The finished frame is validated against ``DataValidation`` before it is
    returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with the five feature columns appended.

    Raises
    ------
    ValueError
        If the frame has rows but none of them is flagged as waterfront, so
        no reference point exists for ``water_distance``.
    """
    features = clean_df()

    features['sqft_price'] = (
        features['price'] / (features['sqft_living'] + features['sqft_lot'])
    ).round(2)

    features['delta_lat'] = np.absolute(47.62774 - features['lat'])
    features['delta_long'] = np.absolute(-122.24194 - features['long'])
    # NOTE(review): the cosine below uses latitude 47.6219 while the
    # reference point above sits at 47.62774 - confirm which is intended.
    features['center_distance'] = (
        (features['delta_long'] * np.cos(np.radians(47.6219)))**2
        + features['delta_lat']**2
    )**(1/2)*2*np.pi*6378/360

    waterfront = features.query('waterfront == 1')
    if features.empty:
        # No rows to score: produce the same empty column a row loop would.
        features['water_distance'] = np.empty(0, dtype=float)
    elif waterfront.empty:
        raise ValueError(
            "cannot compute water_distance: no rows with waterfront == 1"
        )
    else:
        # One broadcast call replaces a Python loop over every
        # (house, waterfront) pair: houses run down the rows, waterfront
        # reference points across the columns, giving an
        # (n_houses, n_waterfront) distance matrix.
        pairwise = distance_to_water(
            features['long'].to_numpy()[:, np.newaxis],
            features['lat'].to_numpy()[:, np.newaxis],
            waterfront['long'].to_numpy()[np.newaxis, :],
            waterfront['lat'].to_numpy()[np.newaxis, :],
        )
        # Nearest waterfront point per house.
        features['water_distance'] = pairwise.min(axis=1)

    data_validation(features, DataValidation)
    return features

# Run the full pipeline (clean, engineer features, validate) and keep the
# resulting frame as q2.
q2 = feature_engineering()

# Bare expression so the notebook renders the frame as the cell output.
q2

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,lat,long,sqft_living15,sqft_lot15,last_known_change,sqft_price,delta_lat,delta_long,center_distance,water_distance
0,7129300520,2014-10-13,221900.0,3,1.00,1180,5650,1.0,0.0,0.0,...,47.5112,-122.257,1340,5650,1955,32.49,0.11654,0.01506,13.022012,0.678977
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,47.7210,-122.319,1690,7639,1991,54.83,0.09326,0.07706,11.882906,2.910551
2,5631500400,2015-02-25,180000.0,2,1.00,770,10000,1.0,0.0,0.0,...,47.7379,-122.233,2720,8062,1933,16.71,0.11016,0.00894,12.281023,2.327626
3,2487200875,2014-12-09,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,...,47.5208,-122.393,1360,5000,1965,86.78,0.10694,0.15106,16.436889,0.467532
4,1954400510,2015-02-18,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,...,47.6168,-122.045,1800,7503,1987,52.25,0.01094,0.19694,14.826499,1.726771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,2014-05-21,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,...,47.6993,-122.346,1530,1509,2009,135.29,0.07156,0.10406,11.154088,5.346761
21593,6600060120,2015-02-23,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,...,47.5107,-122.362,1830,7200,2014,49.24,0.11704,0.12006,15.839476,1.724718
21594,1523300141,2014-06-23,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,47.5944,-122.299,1020,2007,2009,169.66,0.03334,0.05706,5.665915,0.925361
21595,291310100,2015-01-16,400000.0,3,2.50,1600,2388,2.0,0.0,0.0,...,47.5345,-122.069,1410,1287,2004,100.30,0.09324,0.17294,16.616144,2.402901


In [11]:
# Validate the frame already built as q2 rather than calling
# feature_engineering() again: that call re-reads the CSV, recomputes every
# feature and already runs this same validation internally.
data_validation(q2, DataValidation)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,lat,long,sqft_living15,sqft_lot15,last_known_change,sqft_price,delta_lat,delta_long,center_distance,water_distance
0,7129300520,2014-10-13,221900.0,3,1.00,1180,5650,1.0,0.0,0.0,...,47.5112,-122.257,1340,5650,1955,32.49,0.11654,0.01506,13.022012,0.678977
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,47.7210,-122.319,1690,7639,1991,54.83,0.09326,0.07706,11.882906,2.910551
2,5631500400,2015-02-25,180000.0,2,1.00,770,10000,1.0,0.0,0.0,...,47.7379,-122.233,2720,8062,1933,16.71,0.11016,0.00894,12.281023,2.327626
3,2487200875,2014-12-09,604000.0,4,3.00,1960,5000,1.0,0.0,0.0,...,47.5208,-122.393,1360,5000,1965,86.78,0.10694,0.15106,16.436889,0.467532
4,1954400510,2015-02-18,510000.0,3,2.00,1680,8080,1.0,0.0,0.0,...,47.6168,-122.045,1800,7503,1987,52.25,0.01094,0.19694,14.826499,1.726771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21592,263000018,2014-05-21,360000.0,3,2.50,1530,1131,3.0,0.0,0.0,...,47.6993,-122.346,1530,1509,2009,135.29,0.07156,0.10406,11.154088,5.346761
21593,6600060120,2015-02-23,400000.0,4,2.50,2310,5813,2.0,0.0,0.0,...,47.5107,-122.362,1830,7200,2014,49.24,0.11704,0.12006,15.839476,1.724718
21594,1523300141,2014-06-23,402101.0,2,0.75,1020,1350,2.0,0.0,0.0,...,47.5944,-122.299,1020,2007,2009,169.66,0.03334,0.05706,5.665915,0.925361
21595,291310100,2015-01-16,400000.0,3,2.50,1600,2388,2.0,0.0,0.0,...,47.5345,-122.069,1410,1287,2004,100.30,0.09324,0.17294,16.616144,2.402901
