### Importing Libraries and Adjusting Settings

In [1]:
# import modules for eda and plotting
import pandas as pd
import numpy as np
import scipy.stats as stats

import sqlite3

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import functions as fn

# setting plots to inline
%matplotlib inline

# show up to 250 rows when a data frame is displayed
pd.set_option('display.max_rows', 250)

### Creating Data Frames

In [2]:
# names of the raw extract files, in the order they are unpacked below
files = ['EXTR_RPSale.csv', 'EXTR_ResBldg.csv', 'EXTR_Parcel.csv', 'EXTR_LookUp.csv']

# relative path of each file inside the raw data directory
paths = ['../../data/raw/' + name for name in files]

# one data frame per file; every column is read as a string
dfs = [pd.read_csv(csv_path, dtype=str) for csv_path in paths]

# give each data frame its own name (same order as `files`)
SALE, RESB, PARC, LOOK = dfs

### Creating `ID` Column
Creating an `ID` column in the `SALE`, `RESB`, and `PARC` data frames by concatenating the `Major` and `Minor` strings.

In [3]:
# ID is the Major string followed by the Minor string, built the same way
# in each of the three frames that carry both columns
for frame in (SALE, RESB, PARC):
    frame['ID'] = frame['Major'] + frame['Minor']

### Testing `ID` String and Inspecting Data
Filtering for a specific `ID` in each of the data frames.

### SALE Data Frame

In [7]:
# filter once, then show the shape followed by the matching sale rows
sale_match = SALE.loc[SALE['ID'] == '0822119001']
print(sale_match.shape)
sale_match

(10, 25)


Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,ID
92009,2829379,82211,9001,09/30/2016,85000,20161021001126.0,,,,,...,6,15,N,N,N,N,1,8,1,822119001
107851,2896045,82211,9001,08/29/2017,28500,20171019000521.0,,,,,...,6,26,N,N,N,N,18,7,1,822119001
126403,2789810,82211,9001,03/14/2016,0,20160413000590.0,,,,,...,6,24,N,N,N,N,8,8,12 31 51,822119001
189866,3017542,82211,9001,10/21/2019,70000,20191028001102.0,,,,,...,6,26,N,N,N,N,1,8,,822119001
192146,3017076,82211,9001,10/21/2019,70000,,,,,,...,6,26,N,N,N,N,18,8,,822119001
278894,2900407,82211,9001,11/01/2017,71500,,,,,,...,6,26,N,N,N,N,18,8,1,822119001
280452,2740512,82211,9001,06/26/2015,48000,20150701000317.0,,,,,...,6,26,N,N,N,N,18,8,,822119001
309734,3017115,82211,9001,10/21/2019,70000,20191025000752.0,,,,,...,6,26,N,N,N,N,1,8,,822119001
346494,2905723,82211,9001,08/29/2017,0,2896045.0,,,,,...,6,26,N,N,N,N,16,7,20 31,822119001
346927,2754177,82211,9001,08/28/2015,43000,20150904000997.0,,,,,...,6,26,N,N,N,N,18,8,,822119001


### Narrowing Down Features to Get Context

In [23]:
# every sale record for the parcel under inspection
EXPL1 = SALE.loc[SALE['ID'] == '0822119001']

# keep only the columns that describe the transaction itself
context_cols = ['SalePrice', 'ID', 'ExciseTaxNbr',
                'RecordingNbr', 'DocumentDate', 'PropertyType']
EXPL1[context_cols]

Unnamed: 0,SalePrice,ID,ExciseTaxNbr,RecordingNbr,DocumentDate,PropertyType
92009,85000,822119001,2829379,20161021001126.0,09/30/2016,0
107851,28500,822119001,2896045,20171019000521.0,08/29/2017,1
126403,0,822119001,2789810,20160413000590.0,03/14/2016,3
189866,70000,822119001,3017542,20191028001102.0,10/21/2019,0
192146,70000,822119001,3017076,,10/21/2019,3
278894,71500,822119001,2900407,,11/01/2017,6
280452,48000,822119001,2740512,20150701000317.0,06/26/2015,1
309734,70000,822119001,3017115,20191025000752.0,10/21/2019,0
346494,0,822119001,2905723,2896045.0,08/29/2017,1
346927,43000,822119001,2754177,20150904000997.0,08/28/2015,96


## Corruption!!!
It looks like there are five corrupt rows: two with a `SalePrice` of 0 and three with a `PropertyType` of 0.

In [24]:
# filter once, then show the shape followed by the matching building rows
resb_match = RESB.loc[RESB['ID'] == '0822119001']
print(resb_match.shape)
resb_match

(21, 51)


Unnamed: 0,Major,Minor,BldgNbr,NbrLivingUnits,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,...,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost,ID
59918,82211,9001,20,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1940,0,0,25,0,4,0,822119001
59919,82211,9001,1,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1957,0,0,25,0,3,0,822119001
59920,82211,9001,14,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1940,0,0,25,0,3,0,822119001
59921,82211,9001,17,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1946,0,0,25,0,4,0,822119001
59922,82211,9001,5,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1940,0,0,25,0,3,0,822119001
59923,82211,9001,15,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1950,0,0,25,0,3,0,822119001
59924,82211,9001,11,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1940,0,0,25,0,3,0,822119001
59925,82211,9001,9,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1957,0,0,25,0,3,0,822119001
59926,82211,9001,12,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1959,0,0,25,0,3,0,822119001
59927,82211,9001,13,1,20904 677TH PL SE 98045,20904,,,677TH,PL,...,0,0,1940,0,0,25,0,4,0,822119001


In [25]:
# every building record for the parcel under inspection
EXPL2 = RESB[RESB['ID']=='0822119001']
print('len: ', len(EXPL2))
# count DISTINCT building numbers: len() of the column always equals the
# row count above, so it cannot reveal repeated building numbers
print('BldgNbr len', EXPL2['BldgNbr'].nunique(dropna=False))
EXPL2[['ID', 'Address', 'BldgNbr']]

len:  21
BldgNbr len 21


Unnamed: 0,ID,Address,BldgNbr
59918,822119001,20904 677TH PL SE 98045,20
59919,822119001,20904 677TH PL SE 98045,1
59920,822119001,20904 677TH PL SE 98045,14
59921,822119001,20904 677TH PL SE 98045,17
59922,822119001,20904 677TH PL SE 98045,5
59923,822119001,20904 677TH PL SE 98045,15
59924,822119001,20904 677TH PL SE 98045,11
59925,822119001,20904 677TH PL SE 98045,9
59926,822119001,20904 677TH PL SE 98045,12
59927,822119001,20904 677TH PL SE 98045,13


In [26]:
# filter once, then show the shape followed by the first matching parcel row
parc_match = PARC.loc[PARC['ID'] == '0822119001']
print(parc_match.shape)
parc_match.head(1)

(1, 83)


Unnamed: 0.1,Unnamed: 0,Major,Minor,PropName,PlatName,PlatLot,PlatBlock,Range,Township,Section,...,LandslideHazard,SteepSlopeHazard,Stream,Wetland,SpeciesOfConcern,SensitiveAreaTract,WaterProblems,TranspConcurrency,OtherProblems,ID
46768,139958,82211,9001,,,,,11,22,8,...,N,N,N,N,N,N,N,N,N,822119001


In [27]:
# join sales to buildings on ID (default inner join). Major and Minor exist
# in both frames, so they come back with _x / _y suffixes. A parcel with
# several sales and several buildings yields one row per (sale, building) pair.
MERG = SALE.merge(RESB, on='ID')
print(MERG.shape)
MERG.head()

(251300, 75)


Unnamed: 0,ExciseTaxNbr,Major_x,Minor_x,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
0,2743355,638580,110,07/14/2015,190000,20150715002686,,,,,...,1,0,1,1963,0,0,0,0,3,0
1,2743356,638580,110,07/14/2015,0,20150715002687,,,,,...,1,0,1,1963,0,0,0,0,3,0
2,2772937,638580,110,12/21/2015,338000,20151222002066,,,,,...,1,0,1,1963,0,0,0,0,3,0
3,2841697,894677,240,12/21/2016,818161,20161228000896,,,,,...,0,0,0,2016,0,0,0,0,3,0
4,3024468,894677,240,12/05/2019,0,20191209000162,,,,,...,0,0,0,2016,0,0,0,0,3,0


In [28]:
# filter once, then show the shape followed by the first merged rows
merg_match = MERG.loc[MERG['ID'] == '0822119001']
print(merg_match.shape)
merg_match.head()

(210, 75)


Unnamed: 0,ExciseTaxNbr,Major_x,Minor_x,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost
98012,2829379,82211,9001,09/30/2016,85000,20161021001126,,,,,...,0,0,0,1940,0,0,25,0,4,0
98013,2829379,82211,9001,09/30/2016,85000,20161021001126,,,,,...,0,0,0,1957,0,0,25,0,3,0
98014,2829379,82211,9001,09/30/2016,85000,20161021001126,,,,,...,0,0,0,1940,0,0,25,0,3,0
98015,2829379,82211,9001,09/30/2016,85000,20161021001126,,,,,...,0,0,0,1946,0,0,25,0,4,0
98016,2829379,82211,9001,09/30/2016,85000,20161021001126,,,,,...,0,0,0,1940,0,0,25,0,3,0


In [29]:
# rows that exactly repeat an earlier row across every column
MERG.loc[MERG.duplicated()]

Unnamed: 0,ExciseTaxNbr,Major_x,Minor_x,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,FpMultiStory,FpFreestanding,FpAdditional,YrBuilt,YrRenovated,PcntComplete,Obsolescence,PcntNetCondition,Condition,AddnlCost


### Light Data Cleaning
Changing `DocumentDate` strings into `datetime` objects, adding a `DocumentYear` column, and converting `SalePrice` to `float` type.

In [30]:
# changing date strings to datetime objects; the dates are written as
# MM/DD/YYYY, so state the format explicitly instead of relying on inference
MERG['DocumentDate'] = pd.to_datetime(MERG['DocumentDate'], format='%m/%d/%Y')

# adding a document year column (vectorised accessor instead of a per-row lambda)
# NOTE(review): the integer width of this column may differ from the
# apply-based version depending on the pandas version; confirm nothing
# downstream depends on it
MERG['DocumentYear'] = MERG['DocumentDate'].dt.year

# converting SalePrice string to float
MERG['SalePrice'] = MERG['SalePrice'].astype('float')

In [31]:
# shape of the rows for the inspected ID in each frame, before and after merging
for label, frame in (('SALE_08 shape: ', SALE),
                     ('RESB_08 Shape: ', RESB),
                     ('MERG_08 Shape: ', MERG)):
    print(label, frame[frame['ID'] == '0822119001'].shape)

SALE_08 shape:  (10, 25)
RESB_08 Shape:  (21, 51)
MERG_08 Shape:  (210, 76)


In [32]:
# narrow the merged frame to the sale and building columns being examined
TEST = MERG.loc[:, [
    'SalePrice', 'ID', 'ExciseTaxNbr',
    'RecordingNbr', 'DocumentDate',
    'Address', 'BldgNbr', 'PropertyType',
]]

In [33]:
# independent copy of the rows for the inspected ID, so later drops
# do not touch TEST
is_inspected_id = TEST['ID'] == '0822119001'
SETH = TEST.loc[is_inspected_id].copy()
SETH

Unnamed: 0,SalePrice,ID,ExciseTaxNbr,RecordingNbr,DocumentDate,Address,BldgNbr,PropertyType
98012,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,20,0
98013,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,1,0
98014,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,14,0
98015,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,17,0
98016,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,5,0
98017,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,15,0
98018,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,11,0
98019,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,9,0
98020,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,12,0
98021,85000.0,822119001,2829379,20161021001126.0,2016-09-30,20904 677TH PL SE 98045,13,0


In [37]:
def len_print(df, li):
    """Print the number of distinct values in each listed column.

    Args:
        df: data frame to inspect.
        li: iterable of column names found in ``df``.

    Returns:
        None. One line per column is written to stdout; missing values
        count as a distinct value.
    """
    for column in li:
        distinct_count = df[column].nunique(dropna=False)
        print(f'{column}: ', distinct_count)

In [38]:
# columns whose distinct-value counts are reported for the inspected ID
li = [
    'SalePrice', 'ID', 'ExciseTaxNbr',
    'RecordingNbr', 'DocumentDate',
    'Address', 'BldgNbr', 'PropertyType',
]
len_print(SETH, li)

SalePrice:  7
ID:  1
ExciseTaxNbr:  10
RecordingNbr:  9
DocumentDate:  7
Address:  1
BldgNbr:  21
PropertyType:  5


In [48]:
# index labels of the rows whose SalePrice is zero, removed from SETH in place
zero_price = SETH['SalePrice'] == 0
drop_list = SETH.loc[zero_price].index
SETH.drop(index=drop_list, inplace=True)

In [49]:
# display SETH as it stands after the in-place row drops
SETH

Unnamed: 0,SalePrice,ID,ExciseTaxNbr,RecordingNbr,DocumentDate,Address,BldgNbr,PropertyType
98033,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,20,1
98034,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,1,1
98035,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,14,1
98036,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,17,1
98037,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,5,1
98038,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,15,1
98039,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,11,1
98040,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,9,1
98041,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,12,1
98042,28500.0,822119001,2896045,20171019000521.0,2017-08-29,20904 677TH PL SE 98045,13,1


In [41]:
# index labels of the rows whose PropertyType is the string '0',
# removed from SETH in place
zero_type = SETH['PropertyType'] == '0'
drop_list = SETH.loc[zero_type].index
SETH.drop(index=drop_list, inplace=True)

In [50]:
# len_print writes its counts to stdout itself and returns None, so call it
# directly; wrapping it in print() would emit a stray "None" line
len_print(SETH, li)
print(SETH.shape)

# remaining rows in chronological order of the sale document
SETH.sort_values(by='DocumentDate')

SalePrice:  5
ID:  1
ExciseTaxNbr:  5
RecordingNbr:  4
DocumentDate:  5
Address:  1
BldgNbr:  21
PropertyType:  4
None
(105, 8)


Unnamed: 0,SalePrice,ID,ExciseTaxNbr,RecordingNbr,DocumentDate,Address,BldgNbr,PropertyType
98158,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,8,1
98156,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,10,1
98155,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,7,1
98154,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,18,1
98153,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,3,1
98152,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,4,1
98151,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,21,1
98150,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,16,1
98149,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,2,1
98157,48000.0,822119001,2740512,20150701000317.0,2015-06-26,20904 677TH PL SE 98045,6,1
