In [88]:
#### 1. Handling missing values?
    # 1. Fill them with some value (aka imputation)
    # 2. remove the samples with missing data
    #Strategy
    #a. fill categorical values as missing
    #b. fill  numeric values with mean

In [89]:
import sklearn
sklearn.__version__

'0.24.1'

In [3]:
# 1. Prepare Data
import pandas as pd
import numpy as np
car_sales = pd.read_csv("../../data/car-sales-extended-missing-data.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [6]:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [92]:
car_sales.value_counts()

Make    Colour  Odometer (KM)  Doors  Price  
Toyota  White   248815.0       4.0    9785.0     1
Honda   White   171260.0       4.0    18524.0    1
                95579.0        4.0    11135.0    1
                95481.0        4.0    8687.0     1
                92883.0        4.0    14931.0    1
                                                ..
Toyota  Blue    51155.0        4.0    15960.0    1
                48684.0        4.0    33817.0    1
                44815.0        4.0    8460.0     1
                42480.0        4.0    19720.0    1
BMW     Black   11049.0        3.0    19500.0    1
Length: 773, dtype: int64

In [93]:
car_sales.isna().sum()



Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [8]:
#### Option 1: Fill missing values with pandas
car_sales["Make"].fillna("missing", inplace=True)
car_sales["Colour"].fillna("missing", inplace=True)
#fill with avg
car_sales["Odometer (KM)"].fillna(car_sales["Odometer (KM)"].mean(), inplace=True)
#for doors we can go with majority as float dont make sense
car_sales["Doors"].mode()

0    4.0
dtype: float64

In [95]:
car_sales["Doors"].fillna(4, inplace=True)

In [96]:
#check now
car_sales.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [97]:
#now we can  remove the rows , where the price is missing
car_sales.dropna(inplace=True)

In [98]:
# split inot X/y
X = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [99]:
# conver non numeric to numbers (FEATURE ENCODING)
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot,categorical_features)], remainder="passthrough")
transformed_X = transformer.fit_transform(car_sales)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [100]:
#### Option 2: Fill missing values with sklearn
cars = pd.read_csv("../../data/car-sales-extended-missing-data.csv")
cars.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [101]:
# drop the rows with no labels
cars.dropna(subset=["Price"], inplace=True)
cars.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [102]:
#split
X = cars.drop("Price", axis=1)
y = cars["Price"]

In [103]:
#start with sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [104]:
# Fill categorical values with 'missing' & numerical with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

In [105]:
# Define different column features
categorical_features = ["Make", "Colour"]
door_feature = ["Doors"]
numerical_feature = ["Odometer (KM)"]

In [106]:
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, categorical_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, numerical_feature)])

# Fill train and test values separately
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)

# Check filled X_train
filled_X_train

NameError: name 'X_train' is not defined

In [None]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

# Check missing data in training set
car_sales_filled_train.isna().sum()

In [None]:
# Now let's one hot encode the features with the same code as before 
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")

# Fill train and test values separately
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)

# Check transformed and filled X_train
transformed_X_train.toarray()