In [1]:
# Importing libraries

import pandas as pd
import numpy as np

In [None]:
# Read csv file into a pandas dataframe

df = pd.read_csv(r'/Users/juanigalvalisi/Desktop/property data.csv')

In [41]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3,1,1000
1,100002000.0,197.0,LEXINGTON,N,3,1.5,--
2,100003000.0,,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,12,1,,700
4,,203.0,BERKELEY,Y,3,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,,WASHINGTON,,2,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1,1,
8,100009000.0,215.0,TREMONT,Y,na,2,1800


In [42]:
# 1) STANDARD MISSING VALUES
# Looking at the ST_NUM column

df['ST_NUM']

0    104.0
1    197.0
2      NaN
3    201.0
4    203.0
5    207.0
6      NaN
7    213.0
8    215.0
Name: ST_NUM, dtype: float64

In [43]:
df['ST_NUM'].isnull()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
Name: ST_NUM, dtype: bool

In [44]:
# 2) NON-STANDARD MISSING VALUES
# Looking at the NUM_BEDROOMS column

df['NUM_BEDROOMS']

0      3
1      3
2    NaN
3      1
4      3
5    NaN
6      2
7      1
8     na
Name: NUM_BEDROOMS, dtype: object

In [45]:
df['NUM_BEDROOMS'].isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8    False
Name: NUM_BEDROOMS, dtype: bool

In [46]:
# Making a list of missing value types

missing_values = ["n/a", "na"]

In [47]:
missing_values

['n/a', 'na']

In [48]:
df = pd.read_csv(r'/Users/juanigalvalisi/Desktop/property data.csv', na_values = missing_values)

In [49]:
# Looking at the NUM_BEDROOMS column

df['NUM_BEDROOMS']

0    3.0
1    3.0
2    NaN
3    1.0
4    3.0
5    NaN
6    2.0
7    1.0
8    NaN
Name: NUM_BEDROOMS, dtype: float64

In [50]:
df['NUM_BEDROOMS'].isnull()

0    False
1    False
2     True
3    False
4    False
5     True
6    False
7    False
8     True
Name: NUM_BEDROOMS, dtype: bool

In [51]:
# 3) UNEXPECTED MISSING VALUES
# Looking at the OWN_OCCUPIED column

df['OWN_OCCUPIED']

0      Y
1      N
2      N
3     12
4      Y
5      Y
6    NaN
7      Y
8      Y
Name: OWN_OCCUPIED, dtype: object

In [52]:
df['OWN_OCCUPIED'].isnull()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7    False
8    False
Name: OWN_OCCUPIED, dtype: bool

In [53]:
# SOLUTION: Detecting numbers 

cnt = 0
for row in df['OWN_OCCUPIED']:
# Use 'try' + 'except ValueError' to handle errors.
    try:
        int(row)
        df.loc[cnt, 'OWN_OCCUPIED'] = np.nan
    except ValueError:
        pass
    cnt += 1

In [54]:
cnt

9

In [55]:
# SUMMARIZING Missing Values
# Total missing values for each feature

df.isnull().sum()

PID             1
ST_NUM          2
ST_NAME         0
OWN_OCCUPIED    2
NUM_BEDROOMS    3
NUM_BATH        1
SQ_FT           1
dtype: int64

In [56]:
# Any missing values?

df.isnull().values.any()

True

In [57]:
# Total number of missing values

df.isnull().sum().sum()

10

In [61]:
# REPLACING Missing Values
# Replace missing values with a number

df['ST_NUM'].fillna(125, inplace = True)

In [63]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,125.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,125.0,WASHINGTON,,2.0,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1.0,1,
8,100009000.0,215.0,TREMONT,Y,,2,1800


In [64]:
# Location based replacement

df.loc[2,'ST_NUM'] = 125

In [65]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,125.0,LEXINGTON,N,,1,850
3,100004000.0,201.0,BERKELEY,,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2,1600
5,100006000.0,207.0,BERKELEY,Y,,1,800
6,100007000.0,125.0,WASHINGTON,,2.0,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1.0,1,
8,100009000.0,215.0,TREMONT,Y,,2,1800


In [66]:
# Replace using MEDIAN

median = df['NUM_BEDROOMS'].median()
df['NUM_BEDROOMS'].fillna(median, inplace = True)

In [67]:
df

Unnamed: 0,PID,ST_NUM,ST_NAME,OWN_OCCUPIED,NUM_BEDROOMS,NUM_BATH,SQ_FT
0,100001000.0,104.0,PUTNAM,Y,3.0,1,1000
1,100002000.0,197.0,LEXINGTON,N,3.0,1.5,--
2,100003000.0,125.0,LEXINGTON,N,2.5,1,850
3,100004000.0,201.0,BERKELEY,,1.0,,700
4,,203.0,BERKELEY,Y,3.0,2,1600
5,100006000.0,207.0,BERKELEY,Y,2.5,1,800
6,100007000.0,125.0,WASHINGTON,,2.0,HURLEY,950
7,100008000.0,213.0,TREMONT,Y,1.0,1,
8,100009000.0,215.0,TREMONT,Y,2.5,2,1800
