# Table of Contents
01. Import Libraries
02. Import Data
03. Change Column Data Type (if necessary)
04. Find Mixed-type Data
05. Data Accuracy
06. Data Consistency
07. Data Duplicates
08. Missing Values
09. Export Data

# 01. Import Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Show every row when a DataFrame is displayed (None lifts the row limit)
pd.set_option('display.max_rows', None)

In [3]:
# Show every column when a DataFrame is displayed (None lifts the column limit)
pd.set_option('display.max_columns', None)

# 02. Import Data

In [4]:
# Define the main project folder path
# - Raw string (r'...') so the Windows backslashes are taken literally.
# - This is a machine-specific absolute path: edit it before running the notebook elsewhere.
# - The import and export cells build their file locations from it with os.path.join.
path = r'C:\Users\saich\Desktop\CareerFoundry\Data Immersion\Achievement 6 Advanced Analytics & Dashboard Design\11-2023 HDB Flat Resale Analysis'

In [5]:
# Load the 'Extracted CPI' sheet of 'consumer_price_index.xlsx' (kept in the 'Original Data' folder)
cpi_workbook = os.path.join(path, '02 Data', 'Original Data', 'consumer_price_index.xlsx')
cpi = pd.read_excel(cpi_workbook, sheet_name = 'Extracted CPI')

In [6]:
# Preview the first five rows to confirm the sheet was read as expected
cpi.head()

Unnamed: 0,month,cpi_all_items
0,1990 Jan,62.0
1,1990 Feb,62.1
2,1990 Mar,61.9
3,1990 Apr,62.3
4,1990 May,62.4


In [7]:
# Check the dimensions of the dataframe as (rows, columns)
cpi.shape

(408, 2)

# 03. Change Column Data Type (if necessary)

In [8]:
# Check the data type of each column
# (a date-like column showing up as 'object' is still stored as text and needs converting)
cpi.dtypes

month             object
cpi_all_items    float64
dtype: object

The 'month' column stores date information, so it should be converted to the datetime data type.

In [9]:
# Convert 'month' from text (e.g. '1990 Jan') to datetime.
# The format is stated explicitly ('%Y %b' = 4-digit year, space, abbreviated month name)
# so that parsing is strict and consistent for every row, instead of leaving pandas to
# guess the format (guessing is slower and emits a warning when no format can be inferred).
# A value that does not match the format raises an error rather than being parsed loosely.
# NOTE: '%b' matches month abbreviations of the current locale - assumes an English locale.
cpi['month'] = pd.to_datetime(cpi['month'], format = '%Y %b')

  cpi['month'] = pd.to_datetime(cpi['month'])


In [10]:
# Verify the changes - 'month' should now be reported as a datetime column
cpi.dtypes

month            datetime64[ns]
cpi_all_items           float64
dtype: object

In [11]:
# Preview the first five rows again to inspect the converted 'month' values
cpi.head()

Unnamed: 0,month,cpi_all_items
0,1990-01-01,62.0
1,1990-02-01,62.1
2,1990-03-01,61.9
3,1990-04-01,62.3
4,1990-05-01,62.4


# 04. Find Mixed-type Data

In [12]:
# Check for any mixed-type columns - no output means no mixed-type data
# For each column, map every value to its Python type and count the distinct types:
# more than one distinct type means the column mixes types, so its name is printed.
# Series.map is used instead of DataFrame.applymap (deprecated in newer pandas versions),
# and counting distinct types also works on an empty dataframe (nothing is printed).
for col in cpi.columns:
    if cpi[col].map(type).nunique() > 1:
        print(col)

No unusual findings. 

# 05. Data Accuracy

In [13]:
# Summary statistics of each column, used to spot implausible values (e.g. out-of-range dates or CPI figures)
cpi.describe()

Unnamed: 0,month,cpi_all_items
count,408,408.0
mean,2006-12-16 04:42:21.176470528,84.369118
min,1990-01-01 00:00:00,61.9
25%,1998-06-23 12:00:00,73.475
50%,2006-12-16 12:00:00,77.8
75%,2015-06-08 12:00:00,98.9
max,2023-12-01 00:00:00,115.3
std,,13.957868


No unusual findings.

# 06. Data Consistency

Not applicable, as there are no categorical variables in this dataset.

# 07. Data Duplicates

In [14]:
# Check if there are any full duplicates in the dataframe
# (duplicated() flags rows identical to an earlier row across all columns; sum() counts the flags)
cpi.duplicated().sum()

0

In [15]:
# Display the fully duplicated rows themselves (an empty result means there are none)
is_duplicate = cpi.duplicated()
cpi[is_duplicate]

Unnamed: 0,month,cpi_all_items


No unusual findings.

# 08. Missing Values

In [16]:
# Count the missing values per column (0 everywhere means nothing is missing)
cpi.isna().sum()

month            0
cpi_all_items    0
dtype: int64

No unusual findings.

# 09. Export Data

In [17]:
# Re-check the dimensions (rows, columns) right before exporting
cpi.shape

(408, 2)

In [18]:
# Save the checked dataframe as a pickle file in the 'Prepared Data' folder
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'consumer_price_index (checked).pkl')
cpi.to_pickle(export_file)