<h1 style="color: #00BFFF;">00 | Libraries, settings and helper functions</h1>

In [3]:
# 📚 Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import os # file managment
import warnings # warning messages management
import datetime # to play with dates

In [4]:
# ⚙️ Settings
pd.options.display.max_columns = None  # no column limit when a DataFrame is displayed
warnings.simplefilter('ignore')  # silence every warning category for the rest of the session

In [5]:
# 🎯 Specific functions
def explore_data(data):
    """Summarise basic data-quality problems of a DataFrame.

    Prints the number of fully duplicated rows and returns, per column, how
    many values are missing and how many cells are exactly a single space.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the counts "NaN" and "EmptySpaces".
    """
    duplicate_rows = data.duplicated().sum()  # computed once and reused in the message below
    nan_values = data.isna().sum()
    empty_spaces = data.eq(' ').sum()  # only cells equal to exactly one space are counted
    exploration = pd.DataFrame({"NaN": nan_values, "EmptySpaces": empty_spaces})
    print(f"There are {duplicate_rows} duplicate rows. Also;")
    return exploration

def get_house_lifetime(row, reference_year=None):
    """Return the years elapsed since a house was last renovated, or built.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'yr_renovated' and 'yr_built'. A 'yr_renovated' of 0 is
        treated as "never renovated".
    reference_year : int, optional
        Year the lifetime is measured against. Defaults to the current
        calendar year; pass a fixed year to get results that do not change
        with the date the notebook is run.

    Returns
    -------
    number
        ``reference_year`` minus the renovation year when one is recorded,
        otherwise ``reference_year`` minus the construction year.
    """
    if reference_year is None:
        reference_year = datetime.datetime.today().year
    renovated = row['yr_renovated']
    # A missing renovation year is handled like 0: fall back to the build
    # year instead of propagating NaN into the feature.
    if renovated == 0 or pd.isna(renovated):
        return reference_year - row['yr_built']
    return reference_year - renovated

<h2 style="color: #008080;">Data Extraction</h2>

In [6]:
# Location of the working copy of the raw data (absolute path of the original project layout).
file_path = os.path.join(
    "C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data",
    "data_copy.csv",
)
# The first CSV column is read as the index (this avoids an `Unnamed: 0` column)
# and is then turned straight back into a regular column.
data_copy = pd.read_csv(file_path, index_col=0).reset_index()

<h1 style="color: #00BFFF;">02 | Data cleaning</h1>

<h2 style="color: #008080;">Dealing with Duplicates</h2>

In [7]:
explore_data(data=data_copy)  # prints the duplicate-row count; displays NaN and single-space counts per column

There are 0 duplicate rows. Also;


Unnamed: 0,NaN,EmptySpaces
id,0,0
date,0,0
bedrooms,0,0
bathrooms,0,0
sqft_living,0,0
sqft_lot,0,0
floors,0,0
waterfront,0,0
view,0,0
condition,0,0


In [6]:
# Nothing to do here... moving on!

<h2 style="color: #008080;">Dealing with datetime formats</h2>

In [8]:
# 'date' comes back from the CSV as plain text (object dtype), so it is parsed first.
# Two simpler features are then derived from it: the sale year and the sale month.
# The month is kept as a three-letter label (Jan, Feb, ...) so it can be one-hot encoded afterwards.
data_copy = data_copy.assign(
    date=lambda df: pd.to_datetime(df['date']),
    year=lambda df: df['date'].dt.year,  # later keywords see the already-parsed 'date'
    month=lambda df: df['date'].dt.month_name().str[:3],
)

<h2 style="color: #008080;">Encoding Categoricals</h2>

In [9]:
# One indicator column per month label (month_Apr, month_Aug, ...).
# dtype=int pins the indicators to 0/1 integers: from pandas 2.0 on, get_dummies
# returns booleans by default, which would write True/False to the exported CSV.
dummies = pd.get_dummies(data_copy['month'], prefix='month', dtype=int)
data_copy = pd.concat([data_copy, dummies], axis=1)

<h2 style="color: #008080;">Saving information about Seattle houses</h2>

In [10]:
# Keep the location columns in their own file before they are dropped from the
# modelling data; they are meant to be used later for the conclusions.
seattle = data_copy.loc[:, ["lat", "long", "zipcode"]]
seattle.to_csv(path_or_buf="C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data/seattle.csv")

<h2 style="color: #008080;">Dropping unnecessary features</h2>

In [10]:
# Columns removed in one pass:
# - id: a row identifier, not a feature
# - lat, long, zipcode: already stored in the seattle dataset
# - date, month: have fulfilled their purpose (year and the month_* indicators were derived from them)
data_copy = data_copy.drop(columns=['id', 'lat', 'long', 'zipcode', 'date', 'month'])

<h2 style="color: #008080;">Re-checking duplicates after dropping features</h2>

In [13]:
# Flag every row that repeats an earlier one, then report how many there are.
duplicates = data_copy.duplicated(keep="first")
print("Number of duplicated rows:", int(duplicates.sum()))

Number of duplicated rows: 2


In [14]:
# Two rows became duplicates only after some features were dropped (the earlier check found none).
# NOTE(review): most likely these rows differed only in the removed columns
# (id, lat, long, zipcode, or the day of the sale date) - confirm against the raw data.
# For now the repeats are removed, keeping the first occurrence, and the check is run again.
data_copy = data_copy.drop_duplicates(keep="first")
duplicates = data_copy.duplicated(keep="first")
print("Number of duplicated rows:", int(duplicates.sum()))

Number of duplicated rows: 0


<h2 style="color: #008080;">Feature Engineering</h2>

In [15]:
# Build 'house_lifetime' row by row: current year minus the renovation year,
# or minus the build year when the house was never renovated.
# The two source columns are dropped afterwards because the new feature replaces them.
data_copy = (
    data_copy
    .assign(house_lifetime=lambda df: df.apply(get_house_lifetime, axis=1))
    .drop(columns=['yr_built', 'yr_renovated'])
)

<h2 style="color: #008080;">Target coordinates</h2>

We will move the target column (`price`) to the last position, so that all the features come first and the target comes last.

In [16]:
# Display the current column order; 'price' (the target) still sits in the middle of the features.
data_copy.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'sqft_living15', 'sqft_lot15', 'price', 'year',
       'month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep', 'house_lifetime'],
      dtype='object')

In [17]:
# Move the target ('price') to the last position, as is common practice.
# The order is derived from the frame itself instead of a hand-copied column list,
# so every other column keeps its relative position and none can be left out by
# accident if the set of features changes.
data_copy = data_copy[[col for col in data_copy.columns if col != 'price'] + ['price']]

In [18]:
# Quick visual check of the cleaned frame: five randomly chosen rows.
data_copy.sample(n=5)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,sqft_living15,sqft_lot15,year,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep,house_lifetime,price
14814,3,1.5,1250,4000,1.0,0,0,3,7,1250,0,1030,4000,2014,0,0,0,0,0,1,0,0,0,0,0,0,68.0,452000
4973,3,1.0,1660,7440,1.0,0,0,3,7,1270,390,1540,7440,2015,0,0,0,0,0,0,0,1,0,0,0,0,66.0,210000
18813,3,2.5,3490,8343,2.0,1,4,4,9,2150,1340,2990,13104,2014,0,0,0,0,0,1,0,0,0,0,0,0,32.0,1680000
9377,3,1.5,1240,12400,1.0,0,0,3,7,1240,0,1640,9600,2014,0,0,0,0,0,0,0,0,0,0,0,1,65.0,415500
166,2,1.75,1340,7250,1.0,0,0,5,5,700,640,1830,9750,2015,0,0,0,1,0,0,0,0,0,0,0,0,74.0,269950


<h2 style="color: #008080;">Moving on to --> 03_eda</h2>

In [20]:
# Export an independent copy of the cleaned data for the next notebook (03_eda).
# The row index is written as the first, unnamed CSV column.
cleaned = data_copy.copy(deep=True)
cleaned.to_csv(path_or_buf="C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data/cleaned.csv")