# Data Cleaning for Tableau Dashboard

This Python notebook cleans and prepares the HDB resale data for visualisation in a Tableau dashboard. Please refer to the comments in each code cell for more details and explanations.

In [1]:
#Import required packages

import numpy as np 
import pandas as pd

In [2]:
# Load the HDB resale transactions from "resale_1719.csv" (path is relative to
# the working directory) into the DataFrame `resale`.
# NOTE(review): the "1719" suffix presumably means 2017-2019 -- confirm against the source file.

resale = pd.read_csv("resale_1719.csv")

In [3]:
# Print a summary of the data set: row count, column names, non-null counts
# and dtypes. Used here to check for missing values and to see which columns
# are still stored as text (object).

resale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60361 entries, 0 to 60360
Data columns (total 11 columns):
month                  60361 non-null object
town                   60361 non-null object
flat_type              60361 non-null object
block                  60361 non-null object
street_name            60361 non-null object
storey_range           60361 non-null object
floor_area_sqm         60361 non-null float64
flat_model             60361 non-null object
lease_commence_date    60361 non-null int64
remaining_lease        60361 non-null object
resale_price           60361 non-null float64
dtypes: float64(2), int64(1), object(8)
memory usage: 5.1+ MB


In [4]:
# Preview the first 10 rows of the data set to inspect the raw column formats
# (note that 'remaining_lease' is free text such as "61 years 04 months").

resale.head(10)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0
5,2017-01,ANG MO KIO,3 ROOM,150,ANG MO KIO AVE 5,01 TO 03,68.0,New Generation,1981,63 years,275000.0
6,2017-01,ANG MO KIO,3 ROOM,447,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1979,61 years 06 months,280000.0
7,2017-01,ANG MO KIO,3 ROOM,218,ANG MO KIO AVE 1,04 TO 06,67.0,New Generation,1976,58 years 04 months,285000.0
8,2017-01,ANG MO KIO,3 ROOM,447,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1979,61 years 06 months,285000.0
9,2017-01,ANG MO KIO,3 ROOM,571,ANG MO KIO AVE 3,01 TO 03,67.0,New Generation,1979,61 years 04 months,285000.0


In [5]:
# Extract the numeric parts of each flat's remaining lease. Every run of digits
# in 'remaining_lease' becomes its own row, indexed by (original row, match):
# match 0 is the years figure and match 1, when present, is the months figure
# (e.g. "61 years 04 months" -> "61" and "04"; "63 years" -> "63" only).
# The pattern is written as a raw string so that "\d" reaches the regex engine
# verbatim instead of being parsed as an (invalid) Python string escape.

lease = resale.remaining_lease.str.extractall(r'(\d+)')
lease.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
0,0,61
0,1,4
1,0,60
1,1,7
2,0,62


In [6]:
# Pivot the per-match rows into columns: moving the 'match' index level into
# the column axis leaves one row per flat, with match 0 (years) and match 1
# (months) side by side.

lease = lease.unstack('match')
lease.head(5)

Unnamed: 0_level_0,0,0
match,0,1
0,61,4
1,60,7
2,62,5
3,62,1
4,62,5


In [7]:
# Flats whose lease text has no months figure (e.g. "63 years") have no second
# match, which shows up as a missing value after unstacking -- treat it as 0.
# Then give the two match columns readable names.
lease = lease.fillna(0)
lease.columns = ['year', 'months']
lease.head(n=10)

Unnamed: 0,year,months
0,61,4
1,60,7
2,62,5
3,62,1
4,62,5
5,63,0
6,61,6
7,58,4
8,61,6
9,61,4


In [8]:
# Attach the 'year' and 'months' columns to the original data set by placing
# the two frames side by side, aligned on their row index.

resale_1 = pd.concat(objs=[resale, lease], axis='columns')
resale_1.head(5)

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,months
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0,61,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0,60,7
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0,62,5
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0,62,1
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0,62,5


In [9]:
# Replace the free-text 'remaining_lease' column with a numeric value in years.

# The extracted lease parts are still text, so convert them to numbers first.
lease_cols = ["year", "months"]
resale_1[lease_cols] = resale_1[lease_cols].apply(pd.to_numeric)

# Express the lease as a total number of months, then convert back to
# (fractional) years rounded to 2 decimal places.
total_months = resale_1['year'] * 12 + resale_1['months']
resale_1['remaining_lease'] = (total_months / 12).round(2)

# Confirm that 'remaining_lease', 'year' and 'months' are now numeric dtypes.
resale_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60361 entries, 0 to 60360
Data columns (total 13 columns):
month                  60361 non-null object
town                   60361 non-null object
flat_type              60361 non-null object
block                  60361 non-null object
street_name            60361 non-null object
storey_range           60361 non-null object
floor_area_sqm         60361 non-null float64
flat_model             60361 non-null object
lease_commence_date    60361 non-null int64
remaining_lease        60361 non-null float64
resale_price           60361 non-null float64
year                   60361 non-null int64
months                 60361 non-null int64
dtypes: float64(3), int64(3), object(7)
memory usage: 6.0+ MB


In [10]:
# Preview the first rows to check the recomputed numeric 'remaining_lease'
# values against the 'year' and 'months' columns.
resale_1.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,months
0,2017-01,ANG MO KIO,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61.33,232000.0,61,4
1,2017-01,ANG MO KIO,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60.58,250000.0,60,7
2,2017-01,ANG MO KIO,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62.42,262000.0,62,5
3,2017-01,ANG MO KIO,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62.08,265000.0,62,1
4,2017-01,ANG MO KIO,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62.42,265000.0,62,5


In [11]:
# Export the cleaned data set to "resale_clean.csv" in the working directory.
# NOTE: `index` is left at its default, so the DataFrame's row index is also
# written as an unnamed first column of the CSV.
# NOTE(review): pass index=False if that column is not wanted downstream --
# confirm the Tableau data source does not rely on it before changing.

resale_1.to_csv('resale_clean.csv')