# Table of Contents
01. Import Libraries
02. Import Data
03. Change Column Data Type (if necessary)
04. Find Mixed-type Data
05. Data Accuracy
06. Data Consistency
07. Data Duplicates
08. Missing Values
09. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Show every row when a DataFrame is displayed (None removes the row limit)
pd.set_option('display.max_rows', None)

In [3]:
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

# 02. Import Data

In [4]:
# Define the main project folder path
# (raw string, so the Windows backslashes are not treated as escape sequences;
# this absolute path is machine-specific and must be adjusted when run elsewhere)
path = r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 6 Advanced Analytics & Dashboard Design\11-2023 HDB Flat Resale Analysis'

In [5]:
# Load the 'MRT_LRT (combined)' sheet of 'train_station.xlsx' from the 'Original Data' folder
# The sheet holds the opening and closure dates for all train stations (extracted from Wikipedia)
# https://en.wikipedia.org/wiki/List_of_Singapore_MRT_stations
# https://en.wikipedia.org/wiki/List_of_Singapore_LRT_stations

station_workbook = os.path.join(path, '02 Data', 'Original Data', 'train_station.xlsx')
train_station = pd.read_excel(station_workbook, sheet_name = 'MRT_LRT (combined)')

In [6]:
# Preview the imported data
train_station

Unnamed: 0,station_code,station_name,line,opening,closure
0,NS1,Jurong East,North-South Line,10 March 1990,
1,NS2,Bukit Batok,North-South Line,10 March 1990,
2,NS3,Bukit Gombak,North-South Line,10 March 1990,
3,NS4,Choa Chu Kang,North-South Line,10 March 1990,
4,NS5,Yew Tee,North-South Line,10 February 1996,
5,NS7,Kranji,North-South Line,10 February 1996,
6,NS8,Marsiling,North-South Line,10 February 1996,
7,NS9,Woodlands,North-South Line,10 February 1996,
8,NS10,Admiralty,North-South Line,10 February 1996,
9,NS11,Sembawang,North-South Line,10 February 1996,


In [7]:
# Check the number of rows and columns
train_station.shape

(204, 5)

# 03. Change Column Data Type (if necessary)

In [8]:
# Check the data type of each column to spot columns that need converting
train_station.dtypes

station_code    object
station_name    object
line            object
opening         object
closure         object
dtype: object

The 'opening' and 'closure' columns store date information, so they should be converted to the datetime data type.

In [9]:
# Change the data types of 'opening' and 'closure' columns into datetime
# ('opening' is converted here; 'closure' is converted in the next cell)
train_station['opening'] = pd.to_datetime(train_station['opening'])

In [10]:
# Convert 'closure' into datetime as well
train_station['closure'] = pd.to_datetime(train_station['closure'])

In [11]:
# Inspect the data after the conversion
train_station

Unnamed: 0,station_code,station_name,line,opening,closure
0,NS1,Jurong East,North-South Line,1990-03-10,NaT
1,NS2,Bukit Batok,North-South Line,1990-03-10,NaT
2,NS3,Bukit Gombak,North-South Line,1990-03-10,NaT
3,NS4,Choa Chu Kang,North-South Line,1990-03-10,NaT
4,NS5,Yew Tee,North-South Line,1996-02-10,NaT
5,NS7,Kranji,North-South Line,1996-02-10,NaT
6,NS8,Marsiling,North-South Line,1996-02-10,NaT
7,NS9,Woodlands,North-South Line,1996-02-10,NaT
8,NS10,Admiralty,North-South Line,1996-02-10,NaT
9,NS11,Sembawang,North-South Line,1996-02-10,NaT


In [12]:
# Verify the changes: 'opening' and 'closure' should now be datetime columns
train_station.dtypes

station_code            object
station_name            object
line                    object
opening         datetime64[ns]
closure         datetime64[ns]
dtype: object

# 04. Find Mixed-type Data

In [13]:
# Check for any mixed-type columns - no output means no mixed-type data
# A column is reported when any of its values has a different Python type than
# the column's first value (this includes NaT next to Timestamps in a datetime column).
# Series.map is used per column because DataFrame.applymap is deprecated.
for col in train_station.columns:
    value_types = train_station[col].map(type)
    if (value_types != value_types.iloc[0]).any():
        print(col)

closure


In [14]:
# List the distinct Python types found among the 'closure' values.
# Series.map() calls the given function (here the built-in type) on every element,
# and unique() then reduces the result to the distinct types.

closure_value_types = train_station['closure'].map(type)
closure_value_types.unique()

array([<class 'pandas._libs.tslibs.nattype.NaTType'>,
       <class 'pandas._libs.tslibs.timestamps.Timestamp'>], dtype=object)

The 'closure' column contains a mix of Timestamp objects and NaT values, which is why the check above flags it as mixed-type. This is not an issue: NaT is the pandas marker for a missing datetime, and the column's dtype is still datetime64[ns].

In [15]:
# Show the Python type of every cell, column by column
# (Series.map is applied per column because DataFrame.applymap is deprecated)
train_station.apply(lambda column: column.map(type))

Unnamed: 0,station_code,station_name,line,opening,closure
0,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
1,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
2,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
3,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
4,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
5,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
6,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
7,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
8,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>
9,<class 'str'>,<class 'str'>,<class 'str'>,<class 'pandas._libs.tslibs.timestamps.Timesta...,<class 'pandas._libs.tslibs.nattype.NaTType'>


# 05. Data Accuracy

In [16]:
# Check the descriptive statistics
# (the output covers only the datetime columns 'opening' and 'closure')
train_station.describe()

Unnamed: 0,opening,closure
count,204,1
mean,2005-12-23 01:38:49.411764736,2019-01-13 00:00:00
min,1987-11-07 00:00:00,2019-01-13 00:00:00
25%,1998-11-29 18:00:00,2019-01-13 00:00:00
50%,2008-07-07 12:00:00,2019-01-13 00:00:00
75%,2015-12-27 00:00:00,2019-01-13 00:00:00
max,2022-11-13 00:00:00,2019-01-13 00:00:00


No unusual findings.

# 06. Data Consistency

Check the data consistency for each categorical variable

### 06.1 station_code

In [17]:
# Check the frequency of each value in the 'station_code' column (dropna = False also counts missing values)
train_station['station_code'].value_counts(dropna = False)

station_code
NS1     1
DT34    1
DT24    1
DT25    1
DT26    1
DT27    1
DT28    1
DT29    1
DT30    1
DT31    1
DT32    1
DT33    1
DT35    1
DT22    1
TE1     1
TE2     1
TE3     1
TE4     1
TE5     1
TE6     1
TE7     1
TE8     1
TE9     1
TE11    1
DT23    1
DT21    1
NS2     1
DT8     1
CC28    1
CC29    1
CE1     1
CE2     1
DT1     1
DT2     1
DT3     1
DT5     1
DT6     1
DT7     1
DT9     1
DT20    1
DT10    1
DT11    1
DT12    1
DT13    1
DT14    1
DT15    1
DT16    1
DT17    1
DT18    1
DT19    1
TE12    1
TE13    1
TE14    1
PE1     1
SE5     1
SW1     1
SW2     1
SW3     1
SW4     1
SW5     1
SW6     1
SW7     1
SW8     1
PTC     1
PE2     1
TE15    1
PE3     1
PE4     1
PE5     1
PE6     1
PE7     1
PW1     1
PW3     1
PW4     1
PW5     1
PW6     1
SE4     1
SE3     1
SE2     1
SE1     1
TE16    1
TE17    1
TE18    1
TE19    1
TE20    1
TE22    1
BP1     1
BP2     1
BP3     1
BP4     1
BP5     1
BP6     1
BP7     1
BP8     1
BP9     1
BP10    1
BP11    1
BP12    1
BP13   

No unusual findings. 

### 06.2 station_name

In [18]:
# Check the frequency of each value in the 'station_name' column (dropna = False also counts missing values)
train_station['station_name'].value_counts(dropna = False)

station_name
Dhoby Ghaut           3
Marina Bay            3
Outram Park           3
Chinatown             2
Raffles Place         2
Tampines              2
Paya Lebar            2
Bugis                 2
Buona Vista           2
Expo                  2
HarbourFront          2
Jurong East           2
City Hall             2
Serangoon             2
Sengkang              2
Punggol               2
Promenade             2
MacPherson            2
Caldecott             2
Bayfront              2
Bukit Panjang         2
Stevens               2
Little India          2
Botanic Gardens       2
Orchard               2
Newton                2
Choa Chu Kang         2
Woodlands             2
Bishan                2
Somerset              1
Bright Hill           1
Woodlands North       1
Woodlands South       1
Springleaf            1
Lentor                1
Mayflower             1
Orchard Boulevard     1
Upper Thomson         1
Napier                1
Great World           1
Havelock              1
Max

No unusual findings. 

### 06.3 line

In [19]:
# Check the frequency of each value in the 'line' column (dropna = False also counts missing values)
train_station['line'].value_counts(dropna = False)

line
Downtown Line                 34
East-West Line                33
Circle Line                   28
North-South Line              27
Thomsom-East Coast Line       20
North East Line               16
Bukit Panjang LRT             14
Sengkang LRT                  14
Punggol LRT                   14
Changi Airport Branch Line     2
Circle Line Extension          2
Name: count, dtype: int64

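One unusual finding: 'Thomsom-East Coast Line' appears to be a misspelling of 'Thomson-East Coast Line' and should be corrected in the source data. No other unusual findings.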

# 07. Data Duplicates

In [20]:
# Check if there are any full duplicates in the dataframe
# (duplicated() flags rows that repeat an earlier row in every column; sum() counts them)
train_station.duplicated().sum()

0

In [21]:
# List the fully duplicated rows, if any (an empty frame means there are none)
is_duplicate = train_station.duplicated()
train_station[is_duplicate]

Unnamed: 0,station_code,station_name,line,opening,closure


No unusual findings.

# 08. Missing Values

In [22]:
# Count the missing values per column
missing_per_column = train_station.isna().sum()
missing_per_column

station_code      0
station_name      0
line              0
opening           0
closure         203
dtype: int64

There are 203 missing values in the 'closure' column. 
This is expected: those train stations are still in operation and therefore have no closure date. 

# 09. Export Data

In [23]:
# Confirm the number of rows and columns before exporting
train_station.shape

(204, 5)

In [24]:
# Save the checked data as a csv file in the 'Prepared Data' folder
# NOTE: with to_csv's default index=True the row index is written as an unnamed first column
checked_file = os.path.join(path, '02 Data', 'Prepared Data', 'train_station (checked).csv')
train_station.to_csv(checked_file)