# Examples of dirty data
## Example 1: 911 calls in Phoenix, AZ

In [1]:
import os

import pandas as pd
import numpy as numpy
import matplotlib.pyplot as plt
import seaborn as sns
from pexpect.pxssh import pxssh

# 911 call records for Phoenix, AZ (described by the author as covering 2016 to 2024).
# The data is too large for GitHub, so it is not shared via GitHub.
# NOTE(review): the first rows shown by head() are dated 11/2/2015 — confirm the stated date range.
#
# The file location defaults to the author's local copy. Set the PHX_911_CSV
# environment variable to point at your own copy instead of editing this cell.
PHX_911_CSV = os.environ.get(
    'PHX_911_CSV',
    '/Users/jieswang/PycharmProjects/2023-Heat_reselience_AZ_Gov/data/311/calls-for-service_calls-for-service_callsforservice.csv',
)

phx_911 = pd.read_csv(PHX_911_CSV)


In [2]:
# Preview the first five rows to get a feel for the columns and their values.
phx_911.head(n=5)

Unnamed: 0,INCIDENT_NUM,DISP_CODE,DISPOSITION,FINAL_RADIO_CODE,FINAL_CALL_TYPE,CALL_RECEIVED,HUNDREDBLOCKADDR,GRID
0,201502108490,N,NO ACTION REQUIRED,647V,SUSPICIOUS PERSON IN VEHICLE,11/2/2015 9:16:27AM,154XX N 29TH ST,DF34
1,201502108749,N,NO ACTION REQUIRED,418T,TRESPASSING,11/2/2015 9:58:57AM,96XX N 10TH AVE,CH26
2,201502108648,O,OTHER (SUPP/FALSE ALARM/FI/LOUD PARTY),459A,BURGLARY ALARM,11/2/2015 9:42:42AM,38XX E WATKINS ST,AH36B
3,201502108757,N,NO ACTION REQUIRED,418T,TRESPASSING,11/2/2015 10:02:11AM,36XX W CAMELBACK RD,BI20
4,201502108592,N,NO ACTION REQUIRED,900,CHECK WELFARE,11/2/2015 9:30:53AM,40XX N 44TH AVE,BG18


In [3]:
# Inspect the overall structure of the data: shape, column names, and per-column dtypes.
print(f"The shape of the dataframe is: {phx_911.shape}")
print(f"\nThe columns of the dataframe is: {phx_911.columns}")
print("\nInfo of the dataframe:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
phx_911.info()

The shape of the dataframe is: (6017978, 8)

The columns of the dataframe is: Index(['INCIDENT_NUM', 'DISP_CODE', 'DISPOSITION', 'FINAL_RADIO_CODE',
       'FINAL_CALL_TYPE', 'CALL_RECEIVED', 'HUNDREDBLOCKADDR', 'GRID'],
      dtype='object')

Info of the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6017978 entries, 0 to 6017977
Data columns (total 8 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   INCIDENT_NUM      int64 
 1   DISP_CODE         object
 2   DISPOSITION       object
 3   FINAL_RADIO_CODE  object
 4   FINAL_CALL_TYPE   object
 5   CALL_RECEIVED     object
 6   HUNDREDBLOCKADDR  object
 7   GRID              object
dtypes: int64(1), object(7)
memory usage: 367.3+ MB
None


In [4]:
# Count the missing entries (None/NaN) in each column; the per-column totals
# are displayed as the result of the cell.

print("As you can see, the column FINAL_CALL_TYPE has over 26,000 missing value.")
phx_911.isna().sum()

As you can see, the column FINAL_CALL_TYPE has over 26,000 missing value.


INCIDENT_NUM             0
DISP_CODE                0
DISPOSITION              1
FINAL_RADIO_CODE         0
FINAL_CALL_TYPE      26632
CALL_RECEIVED            0
HUNDREDBLOCKADDR         0
GRID                278598
dtype: int64

In [9]:
# Relative frequency of every call type, most common first.
# If the missing data in FINAL_CALL_TYPE is not understood or dealt with, it might
# skew the understanding of which categories are most common: with dropna=True the
# rows with a missing call type are left out, so the proportions are computed over
# the non-missing rows only.

phx_911['FINAL_CALL_TYPE'].value_counts(
    normalize=True,  # proportions instead of raw counts
    dropna=True,     # do not count NaN as a category
)


FINAL_CALL_TYPE
TRESPASSING                            9.962453e-02
CHECK WELFARE                          9.132672e-02
SUSPICIOUS PERSON                      7.542612e-02
FIGHT                                  6.395925e-02
BURGLARY ALARM                         5.895370e-02
                                           ...     
CRIME LAB PRINT SPECIALIST             1.669074e-07
KIDNAP SEXUALLY MOTIVATED SUPPL        1.669074e-07
PD AIR UNIT DOWN WITH INJURIES         1.669074e-07
FOUND BICYCLE SUPPLEMENT               1.669074e-07
SOLICIT FOR PROSTITUTION SUPPLEMENT    1.669074e-07
Name: proportion, Length: 402, dtype: float64

In [10]:
# Column dtypes: only INCIDENT_NUM is numeric (int64); every other column is
# stored as generic object dtype. That includes CALL_RECEIVED, whose timestamps
# are held as strings rather than as datetimes.
phx_911.dtypes

INCIDENT_NUM         int64
DISP_CODE           object
DISPOSITION         object
FINAL_RADIO_CODE    object
FINAL_CALL_TYPE     object
CALL_RECEIVED       object
HUNDREDBLOCKADDR    object
GRID                object
dtype: object

In [None]:
# Check duplicates: there are 6,017,978 entries but only 6,016,991 unique values for INCIDENT_NUM,
# so the INCIDENT_NUM column must contain duplicated values.
phx_911.nunique()

In [None]:
# Collect every row whose INCIDENT_NUM appears more than once, using
# pd.DataFrame.duplicated(). keep=False flags all occurrences of a repeated
# value as True (not only the later ones), so the complete set of duplicated
# records is kept.

is_repeated_incident = phx_911.duplicated(subset='INCIDENT_NUM', keep=False)
duplicated_record = phx_911[is_repeated_incident]

## Example 2: Occupation task data in the U.S.
This dataset is provided by the O*NET data platform.


In [None]:
import pandas as pd
import numpy as np

import os  # imported here so this cell can also be run on its own

# The workbook location defaults to the author's local copy. Set the
# ONET_TASK_RATINGS_XLSX environment variable to read your own copy instead
# of editing this cell.
TASK_RATINGS_XLSX = os.environ.get(
    "ONET_TASK_RATINGS_XLSX",
    "/Users/jieswang/PycharmProjects/2023-Heat_reselience_AZ_Gov/000-NSS_conference/data/Task_entropy_EDA_20251229/db_30_1_excel/Task Ratings.xlsx",
)

# Only the 'Task Ratings' sheet of the workbook is loaded.
task_rating = pd.read_excel(TASK_RATINGS_XLSX, sheet_name='Task Ratings')


In [None]:
# Quick structural overview of the task-rating table: shape, per-column dtypes,
# column names, and summary statistics from describe().

overview_lines = (
    f"The shape of the dataframe is: {task_rating.shape}",
    f"The info of the dataframe is: {task_rating.dtypes}",
    f"\nThe columns of the dataframe is: {task_rating.columns}",
    f"\nDescription of the dataframe: {task_rating.describe()}",
)
# One message per line, in the same order as listed above.
print(*overview_lines, sep="\n")

In [None]:
# Count the missing entries (NaN) in each column of the task-rating table;
# the per-column totals are displayed as the result of the cell.
print("Check the NaN values of each column")
task_rating.isna().sum()