In [2]:
import pandas as pd
from datetime import datetime

# Importing 3 Dataframes

In [3]:
df_parcel = pd.read_csv('../Data/raw/EXTR_Parcel.csv', encoding = "ISO-8859-1")

In [4]:
df_res = pd.read_csv('../Data/raw/EXTR_ResBldg.csv',low_memory=False)

In [5]:
df_real = pd.read_csv('../Data/raw/EXTR_RPSale.csv', encoding = "ISO-8859-1", low_memory=False)

# Filtering for single family homes only

In [6]:
df_real['DocumentDate'] = pd.to_datetime(df_real['DocumentDate'], format="%m/%d/%Y")
df_real['year'] = pd.DatetimeIndex(df_real['DocumentDate']).year

In [7]:
#set property type to 2,3,11 which are single family units
#set to principal use to 6 which is RESIDENTIAL   
#set year to 2019 for most recent data
# set propertyclass to 8 and 7 which includes only houses
df_real = df_real.loc[(df_real.PropertyType.isin([2,3,11])) & 
                           (df_real.SalePrice > 250000) & 
                           (df_real.SalePrice < 3000000) &
                           (df_real.PrincipalUse == 6) & 
                           (df_real.year == 2019) &
                           (df_real.PropertyClass.isin([8,7]))]

# Creating a UniqueID accross all dataframes to combine later

In [8]:
df_list = [df_parcel, df_res, df_real]

In [9]:
for i in df_list:
    i.Minor = i.Minor.apply(lambda x: str(x).zfill(4))
    i.Major = i.Major.apply(lambda x: str(x).zfill(6))
    i['UniqueID'] = pd.to_numeric(i['Major'] + i['Minor']).astype(int)

# Removing duplicates

In [10]:
#dropping duplicated values of unique id in and keeping most recent sale
df_real = df_real.sort_values('DocumentDate').drop_duplicates('UniqueID', keep='last')

In [11]:
df_res = df_res.drop_duplicates(subset = 'UniqueID', keep = 'first')

# Making UniqueID column the index for all the dataframes

In [12]:
df_res.set_index(keys = 'UniqueID', drop=True, inplace=True)
df_parcel.set_index(keys = 'UniqueID', drop=True, inplace=True)
df_real.set_index(keys = 'UniqueID', drop=True, inplace=True)

# Concatenating the Dataframes 

In [13]:
test_df = pd.concat([df_real, df_parcel.reindex(df_real.index)], axis=1)

In [14]:
df_combined = pd.concat([test_df, df_res.reindex(test_df.index)], axis=1)

In [15]:
# Removes duplicated columns
df_combined = df_combined.loc[:,~df_combined.columns.duplicated()]

In [16]:
df_combined.columns.tolist()

['ExciseTaxNbr',
 'Major',
 'Minor',
 'DocumentDate',
 'SalePrice',
 'RecordingNbr',
 'Volume',
 'Page',
 'PlatNbr',
 'PlatType',
 'PlatLot',
 'PlatBlock',
 'SellerName',
 'BuyerName',
 'PropertyType',
 'PrincipalUse',
 'SaleInstrument',
 'AFForestLand',
 'AFCurrentUseLand',
 'AFNonProfitUse',
 'AFHistoricProperty',
 'SaleReason',
 'PropertyClass',
 'year',
 'PropName',
 'PlatName',
 'Range',
 'Township',
 'Section',
 'QuarterSection',
 'PropType',
 'Area',
 'SubArea',
 'SpecArea',
 'SpecSubArea',
 'DistrictName',
 'LevyCode',
 'CurrentZoning',
 'HBUAsIfVacant',
 'HBUAsImproved',
 'PresentUse',
 'SqFtLot',
 'WaterSystem',
 'SewerSystem',
 'Access',
 'Topography',
 'StreetSurface',
 'RestrictiveSzShape',
 'InadequateParking',
 'PcntUnusable',
 'Unbuildable',
 'MtRainier',
 'Olympics',
 'Cascades',
 'Territorial',
 'SeattleSkyline',
 'PugetSound',
 'LakeWashington',
 'LakeSammamish',
 'SmallLakeRiverCreek',
 'OtherView',
 'WfntLocation',
 'WfntFootage',
 'WfntBank',
 'WfntPoorQuality',
 

In [28]:
df_chosen = df_combined[['SalePrice',
 'SqFtTotLiving',
 'SqFtOpenPorch',
 'SqFtLot',
 'WfntFootage',]]

In [29]:
df_chosen.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chosen.dropna(inplace=True)


In [30]:
df_chosen.to_json('../Data/combined.json')

# Creating Dummy columns

In [39]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first')
catagory = ohe.fit_transform(df_chosen.drop(['SalePrice',
 'SqFtTotLiving',
 'SqFtOpenPorch',
 'SqFtLot',
 'WfntFootage', 
 'WfntBank',
 'WfntLocation',
 'FinBasementGrade',
 'Bedrooms','BathHalfCount'],axis=1))

KeyError: "['Bedrooms' 'BathHalfCount'] not found in axis"

In [32]:
column_names = ohe.get_feature_names([
 'WfntBank',
 'WfntLocation',
 'FinBasementGrade',
 'Bedrooms',
 'BathHalfCount'
])

In [33]:
df_cat = pd.DataFrame(index = df_chosen.index , data = catagory.todense(), columns=column_names)

In [34]:
df_cont = df_chosen[['SalePrice',
 'SqFtTotLiving',
 'SqFtDeck',
 'SqFtOpenPorch',
 'Area',
 'SqFtLot',
 'WfntFootage']]

In [35]:
df_with_catagories = pd.concat([df_cont, df_cat.reindex(df_cont.index)], axis=1)

In [36]:
df_with_catagories.to_json('../Data/combined_onehot.json')