<h1 style="color: #00BFFF;">00 |</h1>

In [1]:
# 📚 Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import os # file managment
import warnings # warning messages management
import datetime # to play with dates

In [2]:
# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
warnings.filterwarnings('ignore') # ignore warnings

In [3]:
# 🎯 Specific functions
def explore_data(data): # sum & returns duplicates, NaN & empty spaces
    duplicate_rows = data.duplicated().sum()
    nan_values = data.isna().sum()
    empty_spaces = data.eq(' ').sum()
    import pandas as pd
    exploration = pd.DataFrame({"NaN": nan_values, "EmptySpaces": empty_spaces}) # New dataframe with the results
    print(f"There are {data.duplicated().sum()} duplicate rows. Also;")
    return exploration

def get_house_lifetime(row): # returns house lifetime, based on current year - (year renovated) or (year build)
    today = datetime.datetime.today().year
    if row['yr_renovated'] != 0:
        return today - row['yr_renovated']
    else:
        return today - row['yr_built']

<h2 style="color: #008080;">Data Extraction</h2>

In [4]:
file_path = os.path.join("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data", "datac.csv")
datac = pd.read_csv(file_path, index_col=0) # to deal with an error `Unnamed: 0` column
datac = datac.reset_index()

<h1 style="color: #00BFFF;">02 | Data cleaning</h1>

<h2 style="color: #008080;">Dealing with Duplicates</h2>

In [6]:
explore_data(datac) # sum & returns duplicates, NaN & empty spaces

There are 0 duplicate rows. Also;


Unnamed: 0,NaN,EmptySpaces
id,0,0
date,0,0
bedrooms,0,0
bathrooms,0,0
sqft_living,0,0
sqft_lot,0,0
floors,0,0
waterfront,0,0
view,0,0
condition,0,0


In [9]:
# Nothing to do here... moving on!

<h2 style="color: #008080;">Dealing with datetime formats</h2>

In [10]:
# Convert 'date' column to datetime format (it's an object after creating a new dataset)
datac['date'] = pd.to_datetime(datac['date'])

# Dates are complex. We will create 3 new columns for year, month and day.
datac['year'] = datac['date'].dt.year
datac['month'] = datac['date'].dt.month_name().str.slice(stop=3) # instead of giving them a number, we make them categoricals, and then we will encode them

<h2 style="color: #008080;">Encoding Categoricals</h2>

In [11]:
dummies = pd.get_dummies(datac['month'], prefix='month')
datac = pd.concat([datac, dummies], axis=1)

<h2 style="color: #008080;">Saving information about Seattle houses</h2>

In [5]:
# It'll have some sense later for the conclusions :)
seattle = datac[["lat","long", "zipcode"]]
seattle.to_csv("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data/seattle.csv")

<h2 style="color: #008080;">Dropping unnecessary features</h2>

In [13]:
datac = datac.drop('id', axis=1) # Well, obvious or NaN? I mean, it's NaN or never.
datac = datac.drop(['lat', 'long', 'zipcode', 'date', 'month'], axis=1)
# lat, long and zipcode we have them storage in seattle dataset.
# date has fulfilled it's purpose

<h2 style="color: #008080;">Feature Engineering</h2>

In [15]:
# Apply the function to create a new column 'house_lifetime'
datac['house_lifetime'] = datac.apply(get_house_lifetime, axis=1) # returns house lifetime, based on current year - (year renovated) or (year build)
datac = datac.drop(['yr_built', 'yr_renovated'], axis=1) # already in house lifetime

KeyError: 'yr_renovated'

<h2 style="color: #008080;">Target coordinates</h2>

We will just kindly move our target to the right.

In [16]:
datac.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'price', 'year',
       'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep', 'house_lifetime'],
      dtype='object')

In [18]:
# Kindly moving our target to the right, as best practices
datac = datac[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'year',
       'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep', 'house_lifetime', 'price']]

In [21]:
# Let's see how it looks like !
datac.sample(10)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,year,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,house_lifetime,price
9635,4,2.5,1820,16103,2.0,0,0,3,7,1820,0,2120,21277,2014,0,1,0,0,0,0,0,0,0,0,0,0,19.0,265000
4751,3,2.5,1690,1618,2.0,0,0,3,8,1150,540,1690,1618,2014,0,0,0,0,0,0,0,0,1,0,0,0,9.0,372977
12832,3,2.5,2300,7800,2.0,0,2,3,9,2300,0,2300,8187,2015,0,0,0,0,0,0,0,0,1,0,0,0,26.0,737000
14186,3,3.0,2960,5500,2.0,0,2,3,10,2440,520,2960,5876,2014,0,0,0,0,0,0,0,0,1,0,0,0,33.0,1340000
18822,3,1.0,1120,8576,1.0,0,0,3,6,1120,0,1050,8812,2014,0,0,0,0,0,0,0,0,0,0,1,0,80.0,202500
19362,3,2.0,1350,8587,1.0,0,0,3,7,1350,0,1350,8587,2014,0,0,1,0,0,0,0,0,0,0,0,0,25.0,244000
11104,3,2.5,1680,1620,2.0,0,0,3,9,1120,560,1610,1618,2015,0,0,0,0,0,0,0,1,0,0,0,0,9.0,629950
5261,3,2.5,3200,35012,1.5,0,0,3,8,2100,1100,2690,35100,2015,0,0,0,1,0,0,0,0,0,0,0,0,58.0,605000
19267,3,1.75,2000,7560,1.0,0,0,4,7,1300,700,1900,8301,2014,0,0,0,0,0,0,1,0,0,0,0,0,55.0,295000
20,4,1.75,1620,4980,1.0,0,0,4,7,860,760,1400,4980,2014,0,0,0,0,0,0,0,0,1,0,0,0,76.0,385000


<h2 style="color: #008080;">Moving on to --> 03_eda</h2>

In [19]:
cleaned = datac.copy()
cleaned.to_csv("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data/cleaned.csv")