In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
 

def open_king_country_hous_prices_data():
    """Load the King County house-price CSV into a DataFrame.

    Returns:
        pd.DataFrame: an independent copy of the data read from
        ``../data/King_County_House_prices_dataset.csv``. The path is
        relative, so it is resolved against the current working directory.
    """
    csv_path = "../data/King_County_House_prices_dataset.csv"
    return pd.read_csv(csv_path).copy()

def remove_outlier_with_high_bedroom_bathroom_ratio():
    """Drop listings whose bedrooms-to-bathrooms ratio exceeds 10.

    The raw data is loaded exactly once per call, and the filtered result is
    returned as an independent copy so that later column assignments do not
    write into a slice of a temporary frame.

    Rows whose ratio does not satisfy ``<= 10`` are removed; this includes
    rows where the ratio is NaN or infinite (e.g. ``bathrooms == 0``).

    Returns:
        pd.DataFrame: the rows with ``bedrooms / bathrooms <= 10``. Original
        index labels are kept (the index is not reset).
    """
    houses = open_king_country_hous_prices_data()
    bed_bath_ratio = houses['bedrooms'] / houses['bathrooms']
    return houses[bed_bath_ratio <= 10].copy()

def inputation_sqft_basement():
    """Impute ``sqft_basement`` as ``sqft_living - sqft_above`` for every row.

    The existing ``sqft_basement`` column (which may contain ``'?'``
    placeholders) is overwritten for all rows, so it is not parsed first.
    The result is stored as float, matching the dtype the column had after
    the previous replace/astype/eval sequence.

    Returns:
        pd.DataFrame: the outlier-filtered frame with a float
        ``sqft_basement`` column.
    """
    # Work on a copy so the assignment never targets a slice of another frame.
    houses = remove_outlier_with_high_bedroom_bathroom_ratio().copy()
    houses['sqft_basement'] = (
        houses['sqft_living'] - houses['sqft_above']
    ).astype(float)
    return houses

def inputation_view_values():
    """Replace missing ``view`` values with 0.

    Returns:
        pd.DataFrame: the frame from ``inputation_sqft_basement()`` with no
        missing values left in ``view``.
    """
    houses = inputation_sqft_basement()
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on a selected column is chained assignment:
    # it may act on a temporary object and leave the frame unchanged
    # (always the case when pandas Copy-on-Write is active).
    houses['view'] = houses['view'].fillna(0)
    return houses

def inputation_waterfront_values():
    """Replace missing ``waterfront`` values with 0.

    Returns:
        pd.DataFrame: the frame from ``inputation_view_values()`` with no
        missing values left in ``waterfront``.
    """
    houses = inputation_view_values()
    # Assign the filled column back to the frame. An in-place fillna on the
    # attribute-accessed column is chained assignment and may not update the
    # frame (it never does when pandas Copy-on-Write is active).
    houses['waterfront'] = houses['waterfront'].fillna(0)
    return houses

def last_change_on_building():
    """Collapse ``yr_renovated`` and ``yr_built`` into ``last_known_change``.

    ``last_known_change`` is the renovation year when one is recorded and the
    build year otherwise. A renovation year that is missing (NaN) or equal to
    0 counts as "not recorded".

    Returns:
        pd.DataFrame: the frame with an integer ``last_known_change`` column
        appended and the ``yr_renovated`` / ``yr_built`` columns removed.
    """
    houses = inputation_waterfront_values()
    renovated = houses['yr_renovated']
    has_renovation = renovated.notna() & (renovated != 0)

    # Vectorised replacement for the per-row loop: keep the renovation year
    # where one exists, otherwise fall back to the build year of that row.
    # NOTE(review): the integer cast assumes yr_built has no missing values
    # -- confirm against the raw data.
    houses['last_known_change'] = (
        renovated.where(has_renovation, houses['yr_built']).astype('int64')
    )
    return houses.drop(columns=['yr_renovated', 'yr_built'])

def date_format():
    """Convert the ``date`` column to pandas datetimes.

    Returns:
        pd.DataFrame: the frame from ``last_change_on_building()`` with
        ``date`` parsed by ``pd.to_datetime``.
    """
    houses = last_change_on_building()
    parsed_dates = pd.to_datetime(houses['date'])
    houses['date'] = parsed_dates
    return houses

# Print the per-column summary (non-null counts and dtypes) of the cleaned frame.
date_format().info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 21596 entries, 0 to 21596
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 21596 non-null  int64         
 1   date               21596 non-null  datetime64[ns]
 2   price              21596 non-null  float64       
 3   bedrooms           21596 non-null  int64         
 4   bathrooms          21596 non-null  float64       
 5   sqft_living        21596 non-null  int64         
 6   sqft_lot           21596 non-null  int64         
 7   floors             21596 non-null  float64       
 8   waterfront         21596 non-null  float64       
 9   view               21596 non-null  float64       
 10  condition          21596 non-null  int64         
 11  grade              21596 non-null  int64         
 12  sqft_above         21596 non-null  int64         
 13  sqft_basement      21596 non-null  float64       
 14  zipcode    

In [37]:
from pydantic import BaseModel

class DataValidation(BaseModel):
    """Row schema for the fully processed house-price DataFrame.

    Each field is a column name with the type a single row must satisfy.
    The model is passed to ``data_validation`` as ``data_schema``, which
    validates a list of row records against it.
    """
    id: int
    # Parsed with pd.to_datetime in date_format().
    date: datetime
    price: float
    bedrooms: int
    bathrooms: float
    sqft_living: int
    sqft_lot: int
    floors: float
    # waterfront and view have missing values filled with 0 by the
    # inputation_* steps.
    waterfront: float
    view: float
    condition: int
    grade: int
    sqft_above: int
    # Recomputed as sqft_living - sqft_above in inputation_sqft_basement().
    sqft_basement: float
    # Built in last_change_on_building() from yr_renovated / yr_built.
    last_known_change: int
    zipcode: int
    lat: float
    long: float
    sqft_living15: int
    sqft_lot15: int
    # Added by calculate_sqft_price().
    sqft_price: float
    # Added by calculata_center_distance().
    delta_lat: float
    delta_long: float
    center_distance: float
    # Added by calculate_waterfront_distance().
    water_distance: float

def data_validation(df: pd.DataFrame, data_schema) -> pd.DataFrame:
    """Validate every row of ``df`` against ``data_schema``.

    The frame is converted to a list of per-row dicts and handed to a
    throw-away pydantic model whose single field is ``list[data_schema]``.
    The validated model instance is discarded; only the check itself matters.

    Args:
        df: frame to check; its column names become the record keys.
        data_schema: pydantic model class describing one row.

    Returns:
        The same ``df`` object, unmodified.

    Raises:
        pydantic's validation error when any row does not satisfy
        ``data_schema``.
    """
    row_records = df.to_dict(orient='records')

    class DataframeValidation(BaseModel):
        # Wrapper field so the whole list of rows is validated in one call.
        df_as_dict: list[data_schema]

    DataframeValidation(df_as_dict=row_records)
    return df

def distance_calculater(long, lat, ref_long, ref_lat):
    """Flat-plane distance between a point and a reference point.

    The longitude offset is scaled by the cosine of the reference latitude,
    combined with the latitude offset by Pythagoras, and the resulting angle
    (in degrees) is multiplied by ``2 * pi * 6378 / 360``.

    Args:
        long, lat: coordinates of the point, in degrees.
        ref_long, ref_lat: coordinates of the reference point, in degrees.

    Returns:
        The distance in the unit of the 6378 radius constant (presumably
        kilometres -- TODO confirm). Scalars and NumPy-broadcastable inputs
        are both accepted because only arithmetic and NumPy ufuncs are used.
    """
    north_south = lat - ref_lat
    # East-west degrees shrink with latitude; correct with cos(ref_lat).
    east_west = (long - ref_long) * np.cos(np.radians(ref_lat))
    planar_degrees = (east_west**2 + north_south**2) ** 0.5
    return planar_degrees * 2 * np.pi * 6378 / 360

def calculate_sqft_price():
    """Add ``sqft_price``: price per combined square foot.

    The price is divided by ``sqft_living + sqft_lot`` and rounded to two
    decimals.

    Returns:
        pd.DataFrame: the frame from ``date_format()`` with the extra
        ``sqft_price`` column.
    """
    houses = date_format()
    combined_sqft = houses['sqft_living'] + houses['sqft_lot']
    houses['sqft_price'] = (houses['price'] / combined_sqft).round(2)
    return houses

def calculata_center_distance():
    """Add distance-to-center features based on a hard-coded reference point.

    Adds three columns:
      * ``delta_lat``  -- absolute latitude offset from 47.62774
      * ``delta_long`` -- absolute longitude offset from -122.24194
      * ``center_distance`` -- flat-plane distance built from those offsets,
        scaled by ``2 * pi * 6378 / 360`` (presumably kilometres -- TODO
        confirm).

    Returns:
        pd.DataFrame: the frame from ``calculate_sqft_price()`` with the
        three columns added.
    """
    houses = calculate_sqft_price()
    houses['delta_lat'] = (47.62774 - houses['lat']).abs()
    houses['delta_long'] = (-122.24194 - houses['long']).abs()

    # NOTE(review): the longitude correction uses cos(47.6219 deg) while the
    # latitude offsets above are measured from 47.62774 -- confirm which
    # reference latitude is intended.
    scaled_long = houses['delta_long'] * np.cos(np.radians(47.6219))
    planar_degrees = (scaled_long**2 + houses['delta_lat']**2) ** (1/2)
    houses['center_distance'] = planar_degrees * 2 * np.pi * 6378 / 360
    return houses

def calculate_waterfront_distance():
    """Add ``water_distance``: distance to the nearest waterfront listing.

    For every row, ``distance_calculater`` is evaluated against each row that
    has ``waterfront == 1`` (the waterfront row acts as the reference point)
    and the smallest value is kept. Waterfront rows are compared with
    themselves as well.

    The pairwise distances are computed in one NumPy broadcast instead of a
    nested Python loop. This builds a temporary array of shape
    (number of rows, number of waterfront rows).

    Returns:
        pd.DataFrame: the frame from ``calculata_center_distance()`` with the
        ``water_distance`` column added.

    Raises:
        ValueError: if the frame has rows but none of them has
            ``waterfront == 1``, so no nearest waterfront listing exists.
    """
    houses = calculata_center_distance()
    waterfront_homes = houses.query('waterfront == 1')

    if waterfront_homes.empty:
        if houses.empty:
            # Nothing to measure; keep the column present for downstream code.
            houses['water_distance'] = np.array([], dtype=float)
            return houses
        raise ValueError(
            "no rows with waterfront == 1; cannot compute water_distance"
        )

    # Column vectors for all listings, row vectors for the waterfront
    # references: broadcasting yields every listing/reference combination.
    house_long = houses['long'].to_numpy()[:, np.newaxis]
    house_lat = houses['lat'].to_numpy()[:, np.newaxis]
    ref_long = waterfront_homes['long'].to_numpy()[np.newaxis, :]
    ref_lat = waterfront_homes['lat'].to_numpy()[np.newaxis, :]

    pairwise = distance_calculater(house_long, house_lat, ref_long, ref_lat)
    houses['water_distance'] = pairwise.min(axis=1)
    return houses

def validation_df():
    """Build the fully processed frame and validate it.

    Returns:
        pd.DataFrame: the output of ``calculate_waterfront_distance()`` after
        it has been checked against ``DataValidation`` by ``data_validation``.
    """
    return data_validation(calculate_waterfront_distance(), DataValidation)

# Preview the first two validated rows. Each call to validation_df() re-runs
# the whole pipeline, starting from the CSV read.
validation_df().head(2)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,lat,long,sqft_living15,sqft_lot15,last_known_change,sqft_price,delta_lat,delta_long,center_distance,water_distance
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0.0,0.0,...,47.5112,-122.257,1340,5650,1955,32.49,0.11654,0.01506,13.022012,0.678977
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,47.721,-122.319,1690,7639,1991,54.83,0.09326,0.07706,11.882906,2.910551
