In [1]:
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder

# Importing 3 Dataframes

In [2]:
df_parcel = pd.read_csv('../../references/EXTR_Parcel.csv', encoding = "ISO-8859-1")

In [3]:
df_res = pd.read_csv('../../references/EXTR_ResBldg.csv',low_memory=False)

In [4]:
df_real = pd.read_csv('../../references/EXTR_RPSale.csv', encoding = "ISO-8859-1", low_memory=False)

# Filtering for single family homes only

In [5]:
df_real['DocumentDate'] = pd.to_datetime(df_real['DocumentDate'], format="%m/%d/%Y")
df_real['year'] = pd.DatetimeIndex(df_real['DocumentDate']).year

In [6]:
#set property type to 2,3,11 which are single family units
#set to principal use to 6 which is RESIDENTIAL   
#set year to 2019 for most recent data
# set propertyclass to 8 and 7 which includes only houses
df_real = df_real.loc[(df_real.PropertyType.isin([2,3,11])) & 
                           (df_real.SalePrice > 250000) & 
                           (df_real.SalePrice < 3000000) &
                           (df_real.PrincipalUse == 6) & 
                           (df_real.year == 2019) &
                           (df_real.PropertyClass.isin([8,7]))]

# Creating a UniqueID accross all dataframes to combine later

In [7]:
df_list = [df_parcel, df_res, df_real]

In [8]:
for i in df_list:
    i.Minor = i.Minor.apply(lambda x: str(x).zfill(4))
    i.Major = i.Major.apply(lambda x: str(x).zfill(6))
    i['UniqueID'] = pd.to_numeric(i['Major'] + i['Minor']).astype(int)

# Removing duplicates

In [9]:
#dropping duplicated values of unique id in and keeping most recent sale
df_real = df_real.sort_values('DocumentDate').drop_duplicates('UniqueID', keep='last')

In [10]:
df_res = df_res.drop_duplicates(subset = 'UniqueID', keep = 'first')

# Making UniqueID column the index for all the dataframes

In [11]:
df_res.set_index(keys = 'UniqueID', drop=True, inplace=True)
df_parcel.set_index(keys = 'UniqueID', drop=True, inplace=True)
df_real.set_index(keys = 'UniqueID', drop=True, inplace=True)

# Concatenating the Dataframes 

In [12]:
test_df = pd.concat([df_real, df_parcel.reindex(df_real.index)], axis=1)

In [13]:
df_combined = pd.concat([test_df, df_res.reindex(test_df.index)], axis=1)

In [14]:
# Removes duplicated columns
df_combined = df_combined.loc[:,~df_combined.columns.duplicated()]

In [15]:
df_chosen = df_combined[['SalePrice',
 'SqFtTotLiving',
 'SqFtOpenPorch',
 'WfntFootage', 
 'WfntLocation',
 'FinBasementGrade']]

In [16]:
df_chosen.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_chosen.dropna(inplace=True)


In [17]:
df_chosen.to_json('../../src/data/combined.json')

# Creating Dummy columns

In [18]:
from sklearn.preprocessing import OneHotEncoder

cont = ['SalePrice',
        'SqFtTotLiving',
        'SqFtOpenPorch',
        'WfntFootage',
        'FinBasementGrade']

df_cont = df_chosen[cont]
ohe = OneHotEncoder(drop='first')
catagory = ohe.fit_transform(df_chosen.drop(cont,axis=1))

In [19]:
column_names = ohe.get_feature_names([
 'WfntLocation',])

In [20]:
df_cat = pd.DataFrame(index = df_chosen.index , data = catagory.todense(), columns=column_names)

In [21]:
df_with_catagories = pd.concat([df_cont, df_cat.reindex(df_cont.index)], axis=1)

In [22]:
df_with_catagories.to_json('../../src/data/combined_onehot.json')