<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/missing_values_in_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dealing with Missing Values in ML

## Set up

In [7]:
# first, mount google drive

In [10]:
# copy .json file from mounted drive to current instance
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json 

In [11]:
# download data
! kaggle datasets download dansbecker/melbourne-housing-snapshot -f melb_data.csv

Downloading melb_data.csv.zip to /content
100% 451k/451k [00:00<00:00, 856kB/s]
100% 451k/451k [00:00<00:00, 855kB/s]


In [12]:
# unzip
! unzip melb_data.csv.zip

Archive:  melb_data.csv.zip
  inflating: melb_data.csv           


In [13]:
# libraries needed
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

In [14]:
# load data
data = pd.read_csv('melb_data.csv')
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [15]:
# prediction target, y
y = data['Price']
y.head()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64

In [16]:
# features, X; only numeric features
# but first, view all columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [17]:
# features, X
X = (
    data
      .drop('Price', axis = 'columns')        # drop this since it's our prediction target, y
      .select_dtypes(include = 'number')      # only numeric features
)

# confirm only numeric features
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          13580 non-null  int64  
 1   Distance       13580 non-null  float64
 2   Postcode       13580 non-null  float64
 3   Bedroom2       13580 non-null  float64
 4   Bathroom       13580 non-null  float64
 5   Car            13518 non-null  float64
 6   Landsize       13580 non-null  float64
 7   BuildingArea   7130 non-null   float64
 8   YearBuilt      8205 non-null   float64
 9   Lattitude      13580 non-null  float64
 10  Longtitude     13580 non-null  float64
 11  Propertycount  13580 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 1.2 MB


In [18]:
# split features, X, and prediction target, y, into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y,                 
    train_size = 0.8,     # 80% training
    test_size = 0.2,      # 20% test
    random_state = 0
  )

In [19]:
# function that takes in training and test data and returns MAE
def score_dataset(X_train, X_test, y_train, y_test):
  model = RandomForestRegressor(n_estimators = 10, random_state = 0)  # random forest has 10 trees
  model.fit(X_train, y_train)                                         # same pattern as .fit_transform() and .transform()
  y_predicted = model.predict(X_test)
  return mean_absolute_error(y_predicted, y_test)

## Drop features with missing data

In [20]:
# view non-null counts
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10864 entries, 12167 to 2732
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          10864 non-null  int64  
 1   Distance       10864 non-null  float64
 2   Postcode       10864 non-null  float64
 3   Bedroom2       10864 non-null  float64
 4   Bathroom       10864 non-null  float64
 5   Car            10815 non-null  float64
 6   Landsize       10864 non-null  float64
 7   BuildingArea   5708 non-null   float64
 8   YearBuilt      6557 non-null   float64
 9   Lattitude      10864 non-null  float64
 10  Longtitude     10864 non-null  float64
 11  Propertycount  10864 non-null  float64
dtypes: float64(11), int64(1)
memory usage: 1.1 MB


In [27]:
# identify columns in X_train that are complete; no missing data
complete_columns = (
    X_train
      .isnull()                    # same dimensions as X_train; entries are boolean
      .any(axis = 0)               # go along each row (axis = 0); False = no missing data; True = yes missing data
      .loc[lambda x: x == False]   # filter to columns that have no missing data
      .index                       # these are the columns                   
)

# these are the complete columns, no missing data
complete_columns

Index(['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Landsize',
       'Lattitude', 'Longtitude', 'Propertycount'],
      dtype='object')

In [28]:
# filter X_train to only complete features
X_train_complete = X_train[complete_columns]
X_train_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10864 entries, 12167 to 2732
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          10864 non-null  int64  
 1   Distance       10864 non-null  float64
 2   Postcode       10864 non-null  float64
 3   Bedroom2       10864 non-null  float64
 4   Bathroom       10864 non-null  float64
 5   Landsize       10864 non-null  float64
 6   Lattitude      10864 non-null  float64
 7   Longtitude     10864 non-null  float64
 8   Propertycount  10864 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 848.8 KB


In [29]:
# filter X_test to only complete features
X_test_complete = X_test[complete_columns]
X_test_complete.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2716 entries, 8505 to 2110
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Rooms          2716 non-null   int64  
 1   Distance       2716 non-null   float64
 2   Postcode       2716 non-null   float64
 3   Bedroom2       2716 non-null   float64
 4   Bathroom       2716 non-null   float64
 5   Landsize       2716 non-null   float64
 6   Lattitude      2716 non-null   float64
 7   Longtitude     2716 non-null   float64
 8   Propertycount  2716 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 212.2 KB


In [30]:
# dimensions of X_train_complete and y_train
print(X_train_complete.shape)
print(y_train.shape)

(10864, 9)
(10864,)


In [31]:
# dimensions of X_test_complete and y_test
print(X_test_complete.shape)
print(y_test.shape)

(2716, 9)
(2716,)


In [32]:
# get mean absolute error
score_dataset(
    X_train = X_train_complete, 
    X_test = X_test_complete,
    y_train = y_train,
    y_test = y_test
)

183550.22137772635

## Imputation

In [58]:
# per conversation with Alex on Zoom w/family; look up Python OOP and classes
# don't think of the below as a variable; "2 rulers per Alex"
my_imputer = SimpleImputer() 

In [59]:
# X_train
# replace missing, NaN, with column mean
X_train_imputed = (
    pd.DataFrame(                     # create a dataframe
        my_imputer                    # replace missing values with column mean
          .fit_transform(X_train)
    )
)

# examine
X_train_imputed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [60]:
# imputation messed up column names; rename columns
X_train_imputed.columns = X_train.columns

# examine
X_train_imputed.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [61]:
# X_test
# replace missing values, NaN, with column mean
X_test_imputed = (
    pd.DataFrame(
        my_imputer
          .transform(X_test)    # tutorial uses transform()....why the difference?
    )
)

# examine
X_test_imputed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,4.0,8.0,3016.0,4.0,2.0,2.0,450.0,190.0,1910.0,-37.861,144.8985,6380.0
1,2.0,6.6,3011.0,2.0,1.0,0.0,172.0,81.0,1900.0,-37.81,144.8896,2417.0
2,3.0,10.5,3020.0,3.0,1.0,1.0,581.0,153.764119,1964.839866,-37.7674,144.82421,4217.0
3,3.0,4.5,3181.0,2.0,2.0,1.0,128.0,134.0,2000.0,-37.8526,145.0071,7717.0
4,3.0,8.5,3044.0,3.0,2.0,2.0,480.0,153.764119,1964.839866,-37.72523,144.94567,7485.0


In [62]:
# imputation messed up column names; rename columns
X_test_imputed.columns = X_test.columns

# examine
X_test_imputed.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,4.0,8.0,3016.0,4.0,2.0,2.0,450.0,190.0,1910.0,-37.861,144.8985,6380.0
1,2.0,6.6,3011.0,2.0,1.0,0.0,172.0,81.0,1900.0,-37.81,144.8896,2417.0
2,3.0,10.5,3020.0,3.0,1.0,1.0,581.0,153.764119,1964.839866,-37.7674,144.82421,4217.0
3,3.0,4.5,3181.0,2.0,2.0,1.0,128.0,134.0,2000.0,-37.8526,145.0071,7717.0
4,3.0,8.5,3044.0,3.0,2.0,2.0,480.0,153.764119,1964.839866,-37.72523,144.94567,7485.0


In [63]:
# get mae
score_dataset(
    X_train = X_train_imputed,
    X_test = X_test_imputed,
    y_train = y_train,
    y_test = y_test
)

178166.46269899711