## Loading of libraries

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Data Import and Cleaning
- Load dataset
- Check for missing values and datatype
- Rename columns when necessary
- Drop unnecessary columns

#### Load Data

In [2]:
# Load the test split of the housing data.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so no column ends up holding mixed inferred types
# (the situation pandas reports with a DtypeWarning on read_csv).
# NOTE(review): the affected column is presumably `postal` - confirm.
housing_test = pd.read_csv('../datasets/test.csv', low_memory=False)
print(housing_test.shape)
housing_test.head()

(16737, 76)


  housing_test = pd.read_csv('../datasets/test.csv')


Unnamed: 0,id,Tranc_YearMonth,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,vacancy,pri_sch_affiliation,pri_sch_latitude,pri_sch_longitude,sec_sch_nearest_dist,sec_sch_name,cutoff_point,affiliation,sec_sch_latitude,sec_sch_longitude
0,114982,2012-11,YISHUN,4 ROOM,173,YISHUN AVE 7,07 TO 09,84.0,Simplified,1987,...,92,0,1.433681,103.832924,156.322353,Ahmad Ibrahim Secondary School,218,0,1.436235,103.829987
1,95653,2019-08,JURONG WEST,5 ROOM,986C,JURONG WEST ST 93,04 TO 06,112.0,Premium Apartment,2008,...,45,0,1.339244,103.698896,739.371688,Jurong West Secondary School,199,0,1.335256,103.702098
2,40303,2013-10,ANG MO KIO,3 ROOM,534,ANG MO KIO AVE 10,07 TO 09,68.0,New Generation,1980,...,36,0,1.371893,103.851811,305.071191,Anderson Secondary School,245,0,1.374242,103.85143
3,109506,2017-10,WOODLANDS,4 ROOM,29,MARSILING DR,01 TO 03,97.0,New Generation,1979,...,54,0,1.434423,103.773698,433.454591,Woodlands Secondary School,188,0,1.439183,103.774499
4,100149,2016-08,BUKIT BATOK,4 ROOM,170,BT BATOK WEST AVE 8,16 TO 18,103.0,Model A,1985,...,40,0,1.349195,103.741,217.295361,Bukit Batok Secondary School,223,0,1.348351,103.740873


#### Check for missing values and datatype

In [3]:
# Summarise each column's non-null count and dtype to spot missing values and mistyped columns
housing_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 76 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         16737 non-null  int64  
 1   Tranc_YearMonth            16737 non-null  object 
 2   town                       16737 non-null  object 
 3   flat_type                  16737 non-null  object 
 4   block                      16737 non-null  object 
 5   street_name                16737 non-null  object 
 6   storey_range               16737 non-null  object 
 7   floor_area_sqm             16737 non-null  float64
 8   flat_model                 16737 non-null  object 
 9   lease_commence_date        16737 non-null  int64  
 10  Tranc_Year                 16737 non-null  int64  
 11  Tranc_Month                16737 non-null  int64  
 12  mid_storey                 16737 non-null  int64  
 13  lower                      16737 non-null  int

There are missing values in `Mall_Nearest_Distance`, `Mall_Within_500m`, `Mall_Within_1km`, `Mall_Within_2km`, `Hawker_Within_500m`, `Hawker_Within_1km` and `Hawker_Within_2km`. These are handled further below, after the columns have been renamed.

In [4]:
# Cast columns to the dtypes that match their content.
# Note: Series.map turns any value that is not a key of the mapping into NaN,
# so this cell is meant to be run once on the freshly loaded frame.

# Year-month text -> datetime
housing_test['Tranc_YearMonth'] = pd.to_datetime(housing_test['Tranc_YearMonth'])

# 'Y' / 'N' flag columns -> booleans
yes_no_to_bool = {'Y': True, 'N': False}
for flag_col in ['residential', 'commercial', 'market_hawker',
                 'multistorey_carpark', 'precinct_pavilion']:
    housing_test[flag_col] = housing_test[flag_col].map(yes_no_to_bool)

# 1 / 0 indicator columns -> booleans
one_zero_to_bool = {1: True, 0: False}
for indicator_col in ['bus_interchange', 'mrt_interchange',
                      'pri_sch_affiliation', 'affiliation']:
    housing_test[indicator_col] = housing_test[indicator_col].map(one_zero_to_bool)

housing_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 76 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         16737 non-null  int64         
 1   Tranc_YearMonth            16737 non-null  datetime64[ns]
 2   town                       16737 non-null  object        
 3   flat_type                  16737 non-null  object        
 4   block                      16737 non-null  object        
 5   street_name                16737 non-null  object        
 6   storey_range               16737 non-null  object        
 7   floor_area_sqm             16737 non-null  float64       
 8   flat_model                 16737 non-null  object        
 9   lease_commence_date        16737 non-null  int64         
 10  Tranc_Year                 16737 non-null  int64         
 11  Tranc_Month                16737 non-null  int64         
 12  mid_

Datatypes are now reflected correctly, except for `postal`, which will be dropped later and therefore does not need to be corrected.

#### Rename columns when necessary

In [5]:
# Rename columns to consistent lower-case snake_case names.
# Keys are the names as loaded from the CSV; values are the names used from here on.
column_renames = {
    # transaction / lease fields
    "Tranc_YearMonth": "tranc_year_month",
    "lease_commence_date": "lease_commence_year",
    "Tranc_Year": "tranc_year",
    "Tranc_Month": "tranc_month",
    # storey range bounds
    "lower": "lower_storey_range",
    "upper": "upper_storey_range",
    "mid": "mid_storey_range",
    # per-flat-type "*_sold" columns
    "1room_sold": "1room_res",
    "2room_sold": "2room_res",
    "3room_sold": "3room_res",
    "4room_sold": "4room_res",
    "5room_sold": "5room_res",
    "exec_sold": "exec_res",
    "multigen_sold": "multigen_res",
    "studio_apartment_sold": "studio_res",
    # coordinates
    "Latitude": "latitude",
    "Longitude": "longitude",
    # mall proximity
    "Mall_Nearest_Distance": "mall_nearest_distance",
    "Mall_Within_500m": "mall_500m",
    "Mall_Within_1km": "mall_1km",
    "Mall_Within_2km": "mall_2km",
    # hawker proximity
    "Hawker_Nearest_Distance": "hawker_nearest_distance",
    "Hawker_Within_500m": "hawker_500m",
    "Hawker_Within_1km": "hawker_1km",
    "Hawker_Within_2km": "hawker_2km",
    "hawker_food_stalls": "hawker_stalls",
    # interchange flag
    "bus_interchange": "mrt_bus_interchange",
}
housing_test.rename(columns=column_renames, inplace=True)
housing_test.head(2)

Unnamed: 0,id,tranc_year_month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_year,...,vacancy,pri_sch_affiliation,pri_sch_latitude,pri_sch_longitude,sec_sch_nearest_dist,sec_sch_name,cutoff_point,affiliation,sec_sch_latitude,sec_sch_longitude
0,114982,2012-11-01,YISHUN,4 ROOM,173,YISHUN AVE 7,07 TO 09,84.0,Simplified,1987,...,92,False,1.433681,103.832924,156.322353,Ahmad Ibrahim Secondary School,218,False,1.436235,103.829987
1,95653,2019-08-01,JURONG WEST,5 ROOM,986C,JURONG WEST ST 93,04 TO 06,112.0,Premium Apartment,2008,...,45,False,1.339244,103.698896,739.371688,Jurong West Secondary School,199,False,1.335256,103.702098


#### Handle missing values and correct datatypes

In [6]:
# Fill nulls in the mall count columns ('mall_500m', 'mall_1km', 'mall_2km') with 0
for mall_col in ('mall_500m', 'mall_1km', 'mall_2km'):
    housing_test[mall_col] = housing_test[mall_col].fillna(0)

In [7]:
# Fill nulls in the hawker count columns ('hawker_500m', 'hawker_1km', 'hawker_2km') with 0
for hawker_col in ('hawker_500m', 'hawker_1km', 'hawker_2km'):
    housing_test[hawker_col] = housing_test[hawker_col].fillna(0)

In [8]:
# Cast the mall / hawker count columns to integer dtype
count_cols = ['mall_500m', 'mall_1km', 'mall_2km',
              'hawker_500m', 'hawker_1km', 'hawker_2km']
housing_test = housing_test.astype({count_col: 'int' for count_col in count_cols})
housing_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16737 entries, 0 to 16736
Data columns (total 76 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         16737 non-null  int64         
 1   tranc_year_month           16737 non-null  datetime64[ns]
 2   town                       16737 non-null  object        
 3   flat_type                  16737 non-null  object        
 4   block                      16737 non-null  object        
 5   street_name                16737 non-null  object        
 6   storey_range               16737 non-null  object        
 7   floor_area_sqm             16737 non-null  float64       
 8   flat_model                 16737 non-null  object        
 9   lease_commence_year        16737 non-null  int64         
 10  tranc_year                 16737 non-null  int64         
 11  tranc_month                16737 non-null  int64         
 12  mid_

#### Drop unnecessary columns

In [9]:
# Remove the columns that are not carried forward into the exported dataset
columns_to_drop = [
    'id',
    'tranc_year_month',
    'block',
    'street_name',
    'address',
    'full_flat_type',
    'mid_storey',
    'residential',
    'town',
    'bus_stop_name',
    'postal',
    'mall_nearest_distance',
]
housing_test.drop(columns=columns_to_drop, inplace=True)
housing_test.head(2)

Unnamed: 0,flat_type,storey_range,floor_area_sqm,flat_model,lease_commence_year,tranc_year,tranc_month,lower_storey_range,upper_storey_range,mid_storey_range,...,vacancy,pri_sch_affiliation,pri_sch_latitude,pri_sch_longitude,sec_sch_nearest_dist,sec_sch_name,cutoff_point,affiliation,sec_sch_latitude,sec_sch_longitude
0,4 ROOM,07 TO 09,84.0,Simplified,1987,2012,11,7,9,8,...,92,False,1.433681,103.832924,156.322353,Ahmad Ibrahim Secondary School,218,False,1.436235,103.829987
1,5 ROOM,04 TO 06,112.0,Premium Apartment,2008,2019,8,4,6,5,...,45,False,1.339244,103.698896,739.371688,Jurong West Secondary School,199,False,1.335256,103.702098


## Save and Export Data

In [10]:
# Write the cleaned test set to output/housing_test.csv (without the index column).
import os

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, replacing the separate exists-check-then-create step.
os.makedirs('output', exist_ok=True)
housing_test.to_csv('output/housing_test.csv', index=False)