<h1 style="color: #00BFFF;">00 |</h1>

In [4]:
# 📚 Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import os # file managment
import matplotlib.pyplot as plt # 2D visualizations
import seaborn as sns # high-resolution visualization
import warnings # warning messages management
import datetime # to play with dates

# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
warnings.filterwarnings('ignore') # ignore warnings

In [5]:
# Specific functions

def explore_data(data): # sum & returns duplicates, NaN & empty spaces
    duplicate_rows = data.duplicated().sum()
    nan_values = data.isna().sum()
    empty_spaces = data.eq(' ').sum()
    import pandas as pd
    exploration = pd.DataFrame({"NaN": nan_values, "EmptySpaces": empty_spaces}) # New dataframe with the results
    print(f"There are {data.duplicated().sum()} duplicate rows. Also;")
    return exploration

def get_house_soldtime(row): # Define the get_house_soldtime function
    today = datetime.datetime.today().year
    return today - row['year']

def get_house_lifetime(row): # Define the get_house_lifetime function
    today = datetime.datetime.today().year
    if row['yr_renovated'] != 0:
        return today - row['yr_renovated']
    else:
        return today - row['yr_built']

<h2 style="color: #008080;">Data Extraction</h2>

In [6]:
file_path = os.path.join("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data", "datac.csv")
datac = pd.read_csv(file_path, index_col=0) # to deal with an error `Unnamed: 0` column
datac = datac.reset_index()

<h1 style="color: #00BFFF;">02 | Data cleaning</h1>

<h2 style="color: #008080;">Saving information about Seattle houses</h2>

In [7]:
# It'll have some sense later for the conclusions :)
seattle = datac[["lat","long", "zipcode"]]
seattle.to_csv("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data/seattle.csv")

<h2 style="color: #008080;">Dealing with Duplicates</h2>

In [8]:
explore_data(datac)

There are 0 duplicate rows. Also;


Unnamed: 0,NaN,EmptySpaces
id,0,0
date,0,0
bedrooms,0,0
bathrooms,0,0
sqft_living,0,0
sqft_lot,0,0
floors,0,0
waterfront,0,0
view,0,0
condition,0,0


In [9]:
# Nothing to do here... moving on!

<h2 style="color: #008080;">Dealing with datetime formats</h2>

In [10]:
# Convert 'date' column to datetime format (it's an object after creating a new dataset)
datac['date'] = pd.to_datetime(datac['date'])

# Dates are complex. We will create 3 new columns for year, month and day.
datac['year'] = datac['date'].dt.year
datac['month'] = datac['date'].dt.month_name().str.slice(stop=3) #instead of giving them a number, we make them categoricals, and then we will encode them

<h2 style="color: #008080;">Feature Engineering</h2>

In [11]:
# Apply the function to create a new column 'house_soldtime' "JC rocks !" :) ~ isi
datac['house_soldtime'] = datac.apply(get_house_soldtime, axis=1)

# Print the updated DataFrame
datac.head(5)

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,year,month,house_soldtime
0,7129300520,2014-10-13,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,221900,2014,Oct,9
1,6414100192,2014-12-09,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,538000,2014,Dec,9
2,5631500400,2015-02-25,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062,180000,2015,Feb,8
3,2487200875,2014-12-09,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,604000,2014,Dec,9
4,1954400510,2015-02-18,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,510000,2015,Feb,8


<h2 style="color: #008080;">Dropping unnecessary features</h2>

In [12]:
datac = datac.drop('id', axis=1) # Well, obvious or NaN?
datac = datac.drop(['yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'date'], axis=1)
# lat, long and zipcode we have them storage in seattle dataset.
# as for yr_built and yr_renovated are in house_lifteimte function

<h2 style="color: #008080;">Encoding Categoricals</h2>

In [13]:
# Moving on to EDA >
cleaned = datac.copy()
cleaned.to_csv("C:/Users/apisi/01. IronData/01. GitHub/03. Projects/05_patern_pending/00_data/cleaned.csv")