# Examples of dirty data
## Example 1: 911 calls in Phoenix, AZ

In [None]:
import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import seaborn as sns
from pexpect.pxssh import pxssh

# This is 911 call records in Phoenix AZ from 2016 to 2024.
# The data is too large for GitHub, so I won't share it via GitHub.

phx_911 = pd.read_csv('/Users/jieswang/PycharmProjects/2023-Heat_reselience_AZ_Gov/data/311/calls-for-service_calls-for-service_callsforservice.csv')


In [5]:
# inspect the head of the data
phx_911.head()

Unnamed: 0,INCIDENT_NUM,DISP_CODE,DISPOSITION,FINAL_RADIO_CODE,FINAL_CALL_TYPE,CALL_RECEIVED,HUNDREDBLOCKADDR,GRID
0,201502108490,N,NO ACTION REQUIRED,647V,SUSPICIOUS PERSON IN VEHICLE,11/2/2015 9:16:27AM,154XX N 29TH ST,DF34
1,201502108749,N,NO ACTION REQUIRED,418T,TRESPASSING,11/2/2015 9:58:57AM,96XX N 10TH AVE,CH26
2,201502108648,O,OTHER (SUPP/FALSE ALARM/FI/LOUD PARTY),459A,BURGLARY ALARM,11/2/2015 9:42:42AM,38XX E WATKINS ST,AH36B
3,201502108757,N,NO ACTION REQUIRED,418T,TRESPASSING,11/2/2015 10:02:11AM,36XX W CAMELBACK RD,BI20
4,201502108592,N,NO ACTION REQUIRED,900,CHECK WELFARE,11/2/2015 9:30:53AM,40XX N 44TH AVE,BG18


In [13]:
# inspect data
print(f"The shape of the dataframe is: {phx_911.shape}")
print(f"\nThe columns of the dataframe is: {phx_911.columns}")
print(f"\nInfo of the dataframe:")
print(phx_911.info())

The shape of the dataframe is: (6017978, 8)

The columns of the dataframe is: Index(['INCIDENT_NUM', 'DISP_CODE', 'DISPOSITION', 'FINAL_RADIO_CODE',
       'FINAL_CALL_TYPE', 'CALL_RECEIVED', 'HUNDREDBLOCKADDR', 'GRID'],
      dtype='object')

Info of the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6017978 entries, 0 to 6017977
Data columns (total 8 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   INCIDENT_NUM      int64 
 1   DISP_CODE         object
 2   DISPOSITION       object
 3   FINAL_RADIO_CODE  object
 4   FINAL_CALL_TYPE   object
 5   CALL_RECEIVED     object
 6   HUNDREDBLOCKADDR  object
 7   GRID              object
dtypes: int64(1), object(7)
memory usage: 367.3+ MB
None


In [15]:
# Check None/NaN values

print(f"As you can see, the column FINAL_CALL_TYPE has over 26,000 missing value.")
phx_911.isna().sum()


As you can see, the column FINAL_CALL_TYPE has over 26,000 missing value.


INCIDENT_NUM             0
DISP_CODE                0
DISPOSITION              1
FINAL_RADIO_CODE         0
FINAL_CALL_TYPE      26632
CALL_RECEIVED            0
HUNDREDBLOCKADDR         0
GRID                278598
dtype: int64

In [16]:
# Check duplicates: we have 6,017,978 entries but 6,016,991 unique values for INCIDENT_NUM
# So we can safely assume there is duplicates in INCIDENT_NUM columns
phx_911.nunique()

INCIDENT_NUM        6016991
DISP_CODE                 8
DISPOSITION               7
FINAL_RADIO_CODE        417
FINAL_CALL_TYPE         402
CALL_RECEIVED       5950026
HUNDREDBLOCKADDR     102607
GRID                   1887
dtype: int64

In [22]:
# Let's inspect the duplicated records using pd.DataFrame.duplicated( ) function
# keep=False means to flag all duplicated records as True, so keep all of them.

duplicated_record = phx_911[phx_911.duplicated(subset='INCIDENT_NUM',keep=False)]

## Example 2: Occupation task data in the U.S.
provided by the O*NET data platform


In [9]:
import pandas as pd
import numpy as np

task_rating = pd.read_excel("/Users/jieswang/PycharmProjects/2023-Heat_reselience_AZ_Gov/000-NSS_conference/data/Task_entropy_EDA_20251229/db_30_1_excel/Task Ratings.xlsx", sheet_name='Task Ratings')


In [5]:
# inspect data

print(f"The shape of the dataframe is: {task_rating.shape}")
print(f"\nThe columns of the dataframe is: {task_rating.columns}")
print(f"\nDescription of the dataframe: {task_rating.describe()}")

The shape of the dataframe is: (161559, 15)

The columns of the dataframe is: Index(['O*NET-SOC Code', 'Title', 'Task ID', 'Task', 'Scale ID', 'Scale Name',
       'Category', 'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
       'Upper CI Bound', 'Recommend Suppress', 'Date', 'Domain Source'],
      dtype='object')

Description of the dataframe:              Task ID       Category     Data Value              N  \
count  161559.000000  125657.000000  161559.000000  158454.000000   
mean    12042.551724       4.000000      20.523185      49.589742   
std      6848.145700       2.000008      25.837848      27.632242   
min         1.000000       1.000000       0.000000       3.000000   
25%      6203.000000       2.000000       3.670000      25.000000   
50%     11931.000000       4.000000      10.070000      48.000000   
75%     17940.000000       6.000000      27.000000      64.000000   
max     24013.000000       7.000000     100.000000     238.000000   

       Standard Error

In [8]:
print("Check the NaN values of each column")
task_rating.isna().sum()

Check the NaN values of each column


O*NET-SOC Code            0
Title                     0
Task ID                   0
Task                      0
Scale ID                  0
Scale Name                0
Category              35902
Data Value                0
N                      3105
Standard Error        42822
Lower CI Bound        54375
Upper CI Bound        54375
Recommend Suppress    41121
Date                      0
Domain Source             0
dtype: int64