In [1]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings; warnings.filterwarnings('ignore')
%matplotlib inline

## getting the data ready to be used
* split the data into features and labels (X and y)
* deal with the missing values
* converting non-numerical values to numerical values (feature encoding)

In [2]:
heart_disease = pd.read_csv('./heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
X = heart_disease.drop('target', axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [4]:
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [5]:
# split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [7]:
X.shape

(303, 13)

In [8]:
len(heart_disease)

303

### make sure it's all numerical

In [9]:
car_sales = pd.read_csv('./car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [10]:
len(car_sales)

1000

In [11]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [12]:
# split into X y
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
# turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                   one_hot,
                                   categorical_features)],
                                   remainder='passthrough')

transformed_X_train = transformer.fit_transform(X_train)
transformed_X_train

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.16560e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.37690e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.25286e+05],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.43177e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.86740e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.03440e+04]])

In [15]:
categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                   one_hot,
                                   categorical_features)],
                                   remainder='passthrough')

transformed_X_test = transformer.fit_transform(X_test)
transformed_X_test

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.08131e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 5.11550e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 8.68050e+04],
       ...,
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.16770e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.12156e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.77120e+04]])

In [16]:
pd.DataFrame(transformed_X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,81656.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,137690.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,225286.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,19482.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,235294.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,228678.0
796,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,175134.0
797,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,243177.0
798,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,48674.0


In [17]:
df1 = pd.DataFrame(X_train)
df1

Unnamed: 0,Make,Colour,Odometer (KM),Doors
724,Toyota,White,81656,4
972,Honda,White,137690,4
233,Toyota,Blue,225286,4
652,Nissan,White,19482,3
527,Honda,White,235294,4
...,...,...,...,...
742,BMW,White,228678,5
831,BMW,Red,175134,5
845,Toyota,Green,243177,4
609,Toyota,White,48674,4


In [18]:
len(df1[df1['Colour'] == 'Black'])

79

In [19]:
dummies = pd.get_dummies(X_train[['Make', 'Colour', 'Doors']])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
724,4,0,0,0,1,0,0,0,0,1
972,4,0,1,0,0,0,0,0,0,1
233,4,0,0,0,1,0,1,0,0,0
652,3,0,0,1,0,0,0,0,0,1
527,4,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
742,5,1,0,0,0,0,0,0,0,1
831,5,1,0,0,0,0,0,0,1,0
845,4,0,0,0,1,0,0,1,0,0
609,4,0,0,0,1,0,0,0,0,1


In [20]:
# build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

0.21686428413108816

### handling missing values
* fill with some values (imputation)
* remove the missing data altogether

In [21]:
car_sales_missing = pd.read_csv('./car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [22]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [23]:
car_sales_missing['Make'].value_counts()

Toyota    379
Honda     292
Nissan    183
BMW        97
Name: Make, dtype: int64

In [24]:
# split into X y
X = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### option1: fill missing values with pandas

In [26]:
# fill the 'Make' column
X_train['Make'].fillna('missing', inplace=True)

# fill the 'Colour' column
X_train['Colour'].fillna('missing', inplace=True)

# fill the 'Odometer (KM)' column
X_train['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)

# fill the 'Doors' column
X_train['Doors'].fillna(4, inplace=True)

In [27]:
X_train.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [28]:
# remove rows with missing Price value
X_train.dropna(inplace=True)

In [29]:
X_train.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [30]:
len(X_train)

800

In [31]:
X_train.reset_index(inplace=True)

In [32]:
# turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                   one_hot,
                                   categorical_features)],
                                   remainder='passthrough')

transformed_X_train = transformer.fit_transform(X_train)
transformed_X_train

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.95000000e+02, 1.72401000e+05],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 6.50000000e+01, 1.78796000e+05],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 9.40000000e+02, 5.34740000e+04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 6.24000000e+02, 7.49190000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 9.53000000e+02, 1.02773000e+05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.98000000e+02, 1.31253238e+05]])

In [33]:
dummies = pd.get_dummies(X_train[['Make', 'Colour', 'Doors']])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Make_missing,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Colour_missing
0,4.0,0,0,0,1,0,0,0,1,0,0,0
1,5.0,1,0,0,0,0,0,0,0,0,1,0
2,4.0,0,1,0,0,0,0,0,0,0,1,0
3,5.0,1,0,0,0,0,0,1,0,0,0,0
4,4.0,0,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
795,4.0,0,1,0,0,0,0,1,0,0,0,0
796,4.0,0,0,0,0,1,0,1,0,0,0,0
797,4.0,0,0,0,1,0,0,0,0,0,1,0
798,5.0,1,0,0,0,0,0,0,0,0,1,0


#### option2: fill missing values with sklearn