# Introduction to sckit-Learn(sklearn)
This notebook demonstrates some of the most useful functions of the beautiful scikit-learn library

What we're going to cover:
0. An End to End Scikit-Learn workflow.
1. Getting the data ready.
2. Choose the right esstimator / algorithm for our problems.
3. Fit the model/algorithm and use it to make predictons on our data 
4. Evaluating a model
5.IMprove a model
6. Save and load a trained model
7. Putting it all together

## An end to end Scikit-Learn workflow

In [1]:
# 104 - Typical Scikit learn work flow

In [2]:
# 1. Getting the data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv('heart-disease.csv')

In [3]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [4]:
# create X ( Which is features matrix)
X = heart_disease.drop("target", axis = 1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [5]:
# create y ( labels)
y = heart_disease['target']
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

In [6]:
# choose the right model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# keep the Hyper parameter
clf.get_params()


{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [7]:
# Fit the model to the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size= 0.2)

In [8]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
119,46,0,0,138,243,0,0,152,1,0.0,1,0,2
62,52,1,3,118,186,0,0,190,0,0.0,1,0,1
19,69,0,3,140,239,0,1,151,0,1.8,2,2,2
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
95,53,1,0,142,226,0,0,111,1,0.0,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,55,0,0,128,205,0,2,130,1,2.0,1,1,3
144,76,0,2,140,197,0,2,116,0,1.1,1,0,2
186,60,1,0,130,253,0,1,144,1,1.4,2,1,3
61,54,1,1,108,309,0,1,156,0,0.0,2,0,3


In [9]:
y_train

119    1
62     1
19     1
295    0
95     1
      ..
289    0
144    1
186    0
61     1
57     1
Name: target, Length: 242, dtype: int64

In [10]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
80,41,1,2,112,250,0,1,179,0,0.0,2,0,2
75,55,0,1,135,250,0,0,161,0,1.4,1,0,2
26,59,1,2,150,212,1,1,157,0,1.6,2,0,2
156,47,1,2,130,253,0,1,179,0,0.0,2,0,2
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,58,1,2,132,224,0,0,173,0,3.2,2,2,3
191,58,1,0,128,216,0,0,131,1,2.2,1,3,3
253,67,1,0,100,299,0,0,125,1,0.9,1,2,2
85,67,0,2,115,564,0,0,160,0,1.6,1,0,3


In [11]:
y_test

80     1
75     1
26     1
156    1
7      1
      ..
173    0
191    0
253    0
85     1
202    0
Name: target, Length: 61, dtype: int64

In [12]:
clf.fit(X_train, y_train)

In [13]:
y_preds = clf.predict(X_test)
y_preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0], dtype=int64)

In [14]:
# 4 . Evaluate the model on trainig data and test data

clf.score(X_train, y_train)


1.0

In [15]:
clf.score(X_test, y_test)

0.8032786885245902

In [16]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [17]:
y_test

80     1
75     1
26     1
156    1
7      1
      ..
173    0
191    0
253    0
85     1
202    0
Name: target, Length: 61, dtype: int64

In [18]:
y_preds

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0], dtype=int64)

In [19]:
classification_report(y_test, y_preds)

'              precision    recall  f1-score   support\n\n           0       0.92      0.70      0.79        33\n           1       0.72      0.93      0.81        28\n\n    accuracy                           0.80        61\n   macro avg       0.82      0.81      0.80        61\nweighted avg       0.83      0.80      0.80        61\n'

In [20]:
accuracy_score(y_test, y_preds)

0.8032786885245902

In [21]:
confusion_matrix(y_test,y_preds)

array([[23, 10],
       [ 2, 26]], dtype=int64)

In [22]:
# Imporve the model
# with diffrent n_estimators
for i in range (10, 100, 10):
    print(f'Trying Model with {i} estimators ...')
    clf = RandomForestClassifier(n_estimators = i).fit(X_train, y_train)
    print(f' Model Accuracy on Test Set: {clf.score(X_test, y_test) * 100:.2f}%')
    print(" ")

Trying Model with 10 estimators ...
 Model Accuracy on Test Set: 67.21%
 
Trying Model with 20 estimators ...
 Model Accuracy on Test Set: 88.52%
 
Trying Model with 30 estimators ...
 Model Accuracy on Test Set: 80.33%
 
Trying Model with 40 estimators ...
 Model Accuracy on Test Set: 78.69%
 
Trying Model with 50 estimators ...
 Model Accuracy on Test Set: 78.69%
 
Trying Model with 60 estimators ...
 Model Accuracy on Test Set: 75.41%
 
Trying Model with 70 estimators ...
 Model Accuracy on Test Set: 78.69%
 
Trying Model with 80 estimators ...
 Model Accuracy on Test Set: 78.69%
 
Trying Model with 90 estimators ...
 Model Accuracy on Test Set: 80.33%
 


In [23]:
#  Save and Load the model
import pickle
pickle.dump(clf, open('random_forest_model.pkl', 'wb'))

# To load the model which is allready created
load_model = pickle.load(open('random_forest_model.pkl', 'rb'))
load_model.score(X_test, y_test)

0.8032786885245902

In [24]:
# 105 - Optional Debugging Warinign In Jypyter


In [25]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
import sklearn
sklearn.show_versions()


System:
    python: 3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]
executable: C:\Users\ASUS\anaconda3\python.exe
   machine: Windows-10-10.0.19045-SP0

Python dependencies:
      sklearn: 1.2.2
          pip: 23.3.1
   setuptools: 68.2.2
        numpy: 1.26.4
        scipy: 1.11.4
       Cython: None
       pandas: 2.1.4
   matplotlib: 3.8.0
       joblib: 1.2.0
threadpoolctl: 2.2.0

Built with OpenMP: True

threadpoolctl info:
       filepath: C:\Users\ASUS\anaconda3\Library\bin\mkl_rt.2.dll
         prefix: mkl_rt
       user_api: blas
   internal_api: mkl
        version: 2023.1-Product
    num_threads: 2
threading_layer: intel

       filepath: C:\Users\ASUS\anaconda3\vcomp140.dll
         prefix: vcomp
       user_api: openmp
   internal_api: openmp
        version: None
    num_threads: 4


In [27]:
# 106 - Getting your Data ready splitting your data

In [28]:
# lets listify the contents
what_were_covering = [
    "0. An end-to-end Scikit-learn workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/Algorithm and use it to make pridictions on our train data",
    "4. Evaluating a model",
    "5. Imporve a model",
    "6. Save and load Trained model",
    "7. Putting all together"
]

In [29]:
what_were_covering

['0. An end-to-end Scikit-learn workflow',
 '1. Getting the data ready',
 '2. Choose the right estimator/algorithm for our problems',
 '3. Fit the model/Algorithm and use it to make pridictions on our train data',
 '4. Evaluating a model',
 '5. Imporve a model',
 '6. Save and load Trained model',
 '7. Putting all together']

In [30]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Getting our data ready to be used with machine learning
    Three main things we have to do:
    1.split the data into features and labels ( usually 'X' and 'y').
    2.Falling (also called imputing) or disregarding missing values.
    3. Convering non-numereical vlaues to numerical values ( also called as Feature encoding )

In [31]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [32]:
X = heart_disease.drop("target", axis = 1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [33]:
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [34]:
# split the data inot training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.2)

In [35]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
241,59,0,0,174,249,0,1,143,1,0.0,1,0,2
186,60,1,0,130,253,0,1,144,1,1.4,2,1,3
210,57,1,2,128,229,0,0,150,0,0.4,1,1,3
149,42,1,2,130,180,0,1,150,0,0.0,2,0,2
147,60,0,3,150,240,0,1,171,0,0.9,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,66,0,3,150,226,0,1,114,0,2.6,0,0,2
148,44,1,2,120,226,0,1,169,0,0.0,2,0,2
65,35,0,0,138,183,0,1,182,0,1.4,2,0,2
285,46,1,0,140,311,0,1,120,1,1.8,1,2,3


In [36]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
170,56,1,2,130,256,1,0,142,1,0.6,1,1,1
109,50,0,0,110,254,0,0,159,0,0.0,2,0,2
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
12,49,1,1,130,266,0,1,171,0,0.6,2,0,2
141,43,1,0,115,303,0,1,181,0,1.2,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,67,1,0,160,286,0,0,108,1,1.5,1,3,2
71,51,1,2,94,227,0,1,154,1,0.0,2,1,3
49,53,0,0,138,234,0,0,160,0,0.0,2,0,2
16,58,0,2,120,340,0,1,172,0,0.0,2,0,2


In [37]:
y_train

241    0
186    0
210    0
149    1
147    1
      ..
17     1
148    1
65     1
285    0
24     1
Name: target, Length: 242, dtype: int64

In [38]:
y_test

170    0
109    1
131    1
12     1
141    1
      ..
165    0
71     1
49     1
16     1
290    0
Name: target, Length: 61, dtype: int64

In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [40]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 242 entries, 241 to 24
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       242 non-null    int64  
 1   sex       242 non-null    int64  
 2   cp        242 non-null    int64  
 3   trestbps  242 non-null    int64  
 4   chol      242 non-null    int64  
 5   fbs       242 non-null    int64  
 6   restecg   242 non-null    int64  
 7   thalach   242 non-null    int64  
 8   exang     242 non-null    int64  
 9   oldpeak   242 non-null    float64
 10  slope     242 non-null    int64  
 11  ca        242 non-null    int64  
 12  thal      242 non-null    int64  
dtypes: float64(1), int64(12)
memory usage: 26.5 KB


In [41]:
X.shape

(303, 13)

In [42]:
X.shape[0] * 0.8

242.4

In [43]:
y.shape

(303,)

In [44]:
y.shape[0] * 0.2

60.6

In [45]:
# 107 - Quick Tip Clean Transform REduce

### Clean Data -> Transform Data -> Reduce Data

In [46]:
# 108 - Geeting Your Data Ready Convert Data to Numbers

### Make sure its all numberical

In [47]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [48]:
car_sales.shape

(1000, 5)

In [49]:
len(car_sales)

1000

In [50]:
# split into X and y
X = car_sales.drop('Price', axis = 1)
y = car_sales['Price']

# Spliting into Training and Test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [51]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 4), (200, 4), (800,), (200,))

In [52]:
# Build the machine learnign model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
# model.fit(X_train, y_train)
# The above line encounter error due to Machine learning model doesnot deal with strings,
# so that columns make and model has to convert to numerical values

In [53]:
# you need to conver Doors column also numerical
car_sales.Doors.value_counts()

Doors
4    856
5     79
3     65
Name: count, dtype: int64

In [54]:
# Turn categorical features into Numereical features ie make, colour and doors by using onehotencoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']

one_hot = OneHotEncoder()

transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)], remainder = 'passthrough')

transformer_x = transformer.fit_transform(X)
transformer_x


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [55]:
transformer_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [56]:
dummies = pd.get_dummies(car_sales[['Make','Colour', 'Doors']])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [57]:
# 110 - Getting Your Data Ready Handling Missing Values with Pandas

### 1.2 What if there were missing values ...?

1. Fill them with some value(also known as imputation).
2. Remove the samples with missing data all together.
   

In [58]:
# import car_sales missing data 

car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [59]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [60]:
# Lets try and convert out data to numbers
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [61]:
X = car_sales_missing.drop('Price', axis = 1)
y = car_sales_missing['Price']

In [62]:
# Try to convert categroical values to Numerical
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder(sparse = False)

transformer = ColumnTransformer( [('one-hot', one_hot, categorical_features)], remainder = 'passthrough')

transformed_X = transformer.fit_transform(X)
transformed_X


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 2.48360e+05]])

#### option 1: fill missing data with pandas

In [63]:
# Fill the 'Make' column
car_sales_missing["Make"].fillna('Missing', inplace = True)

# Fill the 'Colour ' column
car_sales_missing['Colour'].fillna('Missing ', inplace = True)

# fill the 'Odometer (KM)' column
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace = True)



In [69]:
car_sales_missing['Doors'].value_counts()

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

In [70]:
car_sales_missing["Doors"].fillna(4, inplace = True)

In [86]:
car_sales_missing.dropna(inplace = True)

In [87]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [92]:
# create X and y
X = car_sales_missing.drop("Price" , axis = 1)
X
y = car_sales_missing["Price"]
y


0      15323.0
1      19943.0
2      28343.0
3      13434.0
4      14043.0
        ...   
995    32042.0
996     5716.0
997    31570.0
998     4001.0
999    12732.0
Name: Price, Length: 950, dtype: float64

In [105]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one-hot', one_hot, categorical_features)], remainder = "passthrough")

transformed_X = transformer.fit_transform(car_sales_missing)

transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

### Option 2: Fill the missing values using Scikit_learn

In [107]:
# 111 - Getting your data , Handling the missing values using scikitlearn

In [111]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [112]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [115]:
# Drop the missing values in the price column
car_sales_missing.dropna(subset = ['Price'], inplace = True)

In [116]:
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [119]:
X = car_sales_missing.drop("Price", axis = 1)
y = car_sales_missing["Price"]

In [120]:
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [141]:
# Fill the missing value using scikit learn

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


cat_impute = SimpleImputer(strategy = "constant", fill_value = "Missing")
door_impute = SimpleImputer(strategy = "constant", fill_value = 4)
num_impute = SimpleImputer(strategy = "mean")

# Define the columns
cat_features = ['Make', 'Colour']
door_features = ['Doors']
num_features = ["Odometer (KM)"]

impute = ColumnTransformer([
                                ('cat_impute', cat_impute, cat_features),
                                ('door_impute', door_impute, door_features),
                                ('num_impute', num_impute, num_features)
                            ], remainder = "passthrough")

filled_X = impute.fit_transform(X)
filled_X

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [145]:
car_sales_filled = pd.DataFrame(filled_X, columns = ['Make', 'Colour', 'Doors','Odometer (KM)'])
car_sales_filled.head()

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,White,4.0,35431.0
1,BMW,Blue,5.0,192714.0
2,Honda,White,4.0,84714.0
3,Toyota,White,4.0,154365.0
4,Nissan,Blue,3.0,181577.0


In [146]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,
                                  categorical_features)],
                                remainder= 'passthrough')

transformed_X = transformer.fit_transformed(car_sales_filled)

AttributeError: 'ColumnTransformer' object has no attribute 'fit_transformed'