## Scikit-Learn (sklearn)

###  An end-to-end Scikit-Learn workflow

In [25]:
# 1.Get the data ready
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Create X (feature matrix) --> X = all your input features for all your samples.
X = heart_disease.drop("target",axis=1) #removing the “target” column from the DataFrame.
#axis=1 means you’re dropping a column (not a row).

#Create Y (label vector) --> Y =  what you want to predict
Y = heart_disease["target"] #we want to predict the target either 0 or 1

In [5]:
#2. Choose the right model and hyperparameter. Hyperparameters are settings you choose before training a model.They control how the algorithm learns.They are not learned from data, but control how the model learns.
from sklearn.ensemble import RandomForestClassifier
# RandomForestClassifier --> machine learning model in scikit-learn.It builds a “forest” of decision trees to classify data.initialize it with settings called hyperparameters.
clf = RandomForestClassifier(n_estimators=100) #This creates a RandomForest with 100 trees.
#We'll keep the default hyperparamter
clf.get_params() #.get_params() is a method on any scikit-learn model.It returns a dictionary of all hyperparameters and their current values.

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [6]:
#3. Fit the model to the training data.Splitting data into training and testing sets.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test =  train_test_split(X,Y,test_size=0.2) #train_test_split() splits your data into training and testing sets 
#X_train, Y_train Used to train the model  X_test, Y_test Used to test (evaluate) the model
#test_size = 0.2 --> Use 20% of the data for testing, 80% for training.

In [7]:
clf.fit(X_train,Y_train); #fit your model only on training data:

In [8]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2
252,62,0,0,138,294,1,1,106,0,1.9,1,3,2
274,47,1,0,110,275,0,0,118,1,1.0,1,1,2
138,57,1,0,110,201,0,1,126,1,1.5,1,0,1
118,46,0,1,105,204,0,1,172,0,0.0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,45,1,0,142,309,0,0,147,1,0.0,1,3,3
155,58,0,0,130,197,0,1,131,0,0.6,1,0,2
183,58,1,2,112,230,0,0,165,0,2.5,1,1,3
39,65,0,2,160,360,0,0,151,0,0.8,2,0,2


In [9]:
# Make a prediction
Y_pred = clf.predict(X_test)
Y_pred

array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0])

In [10]:
#4. Evaluate the model on the training data and test data
clf.score(X_train,Y_train)

1.0

In [11]:
clf.score(X_test,Y_test)

0.8360655737704918

In [12]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(Y_test,Y_pred))

              precision    recall  f1-score   support

           0       0.85      0.79      0.81        28
           1       0.83      0.88      0.85        33

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61



In [13]:
accuracy_score(Y_test,Y_pred)

0.8360655737704918

In [16]:
# 5. Imporve a model
# Try different n_estimator
np.random.seed(42)
for i in range(10,100,10):
    print(f'Trying model with {i} estimator ..')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train,Y_train)
    print(f'Model accuracy on test set : {clf.score(X_test,Y_test)*100:.2f}%')
    print(" ")

Trying model with 10 estimator ..
Model accuracy on test set : 80.33%
 
Trying model with 20 estimator ..
Model accuracy on test set : 78.69%
 
Trying model with 30 estimator ..
Model accuracy on test set : 80.33%
 
Trying model with 40 estimator ..
Model accuracy on test set : 80.33%
 
Trying model with 50 estimator ..
Model accuracy on test set : 83.61%
 
Trying model with 60 estimator ..
Model accuracy on test set : 81.97%
 
Trying model with 70 estimator ..
Model accuracy on test set : 81.97%
 
Trying model with 80 estimator ..
Model accuracy on test set : 81.97%
 
Trying model with 90 estimator ..
Model accuracy on test set : 85.25%
 


In [17]:
# 6.Save a model and load it 
import pickle
pickle.dump(clf,open("random_forst_model_1.pkl","wb"))

In [24]:
loaded_model = pickle.load(open("random_forst_model_1.pkl","rb"))
score_latest=loaded_model.score(X_test,Y_test)
score_latest

0.8524590163934426

## 1. Getting the Data Ready

### Three main things to do:
    1.Split the data into Features matrix(X) and Label vector(y)
    2.Filling (also called imputing) or disregarding missing values
    3.Converting non numeric values to numerci values (also called feature coding)

In [27]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


### 1.1 Split the data into Features matrix(X) and Label vector(y)

In [30]:
#Split the data into Features matrix(X) 
X = heart_disease.drop("target",axis=1)
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [31]:
#Split the data into Label vector(y)
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [32]:
#Spliting the data into training and test data 
#mostly the data is split as 80% training and 20% testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [35]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

### 1.2 Converting non numeric values to numeric values

In [41]:
# for this one we are getting another dataset since heart_disease dataset has all numeric values

In [42]:
car_sales = pd.read_csv("car-sales-extended.csv")
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [43]:
len(car_sales)

1000

In [44]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

we have to convert make and color to numeric value

In [56]:
#Split the data into X and y
X = car_sales.drop("Price",axis=1)
y = car_sales["Price"]

#SPLIT THE DATA INTO TRAINING AND TESTING

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [58]:
 # Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

cateogorical_feature = ["Make","Colour","Doors"] #list the columns you want to transform
one_hot = OneHotEncoder() # it will turn each category into a separate binary column.
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 cateogorical_feature)],
                                remainder="passthrough")
transformed_x = transformer.fit_transform(X)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]], shape=(1000, 13))

In [59]:
# fit the model
X_train,X_test,y_train,y_test = train_test_split(transformed_x,y,test_size=0.2)

In [60]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.2330485230539474

## 1.3 Handle Missing Values
    Two ways to handle
     1. Fill them with some value(also known as imputation).
     2. Remove the samples with missing data altogether
    Note - In a newer version of Scikit-Learn (0.23+), the OneHotEncoder class was upgraded to be able to handle None & NaN values so no error will appear.

In [70]:
car_sales_missing = pd.read_csv("car-sales-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"


In [72]:
#split the data
X = car_sales_missing.drop("Price",axis=1)
y = car_sales_missing["Price"]
X_train,X_test,y_train,y_test =  train_test_split(X,y,test_size=0.2)
#convert to numeric
cateogorical_feature = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,cateogorical_feature)], remainder="passthrough")
transformed_x1 = transformer.fit_transform(X)
transformed_x1.toarray()

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        1.50043e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        8.78990e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
                nan],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.11790e+04],
       [0.00000e+00, 0.00000e+00, 1.

### Option 1. Fill missing data with Pandas

In [71]:
car_sales_missing.isna().sum()

Make        1
Colour      1
Odometer    4
Doors       1
Price       2
dtype: int64

In [74]:
# Fill the "Make" column
car_sales_missing["Make"].fillna("missing",inplace=True)
# Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing",inplace=True)
# Fill the "Odometer" column
car_sales_missing["Odometer"].fillna(car_sales_missing["Odometer"].mean(),inplace=True)
# Fill the "Doors" column
car_sales_missing["Doors"].fillna(4.0,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_sales_missing["Make"].fillna("missing",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  car_sales_missing["Colour"].fillna("missing",inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [75]:
car_sales_missing.isna().sum()

Make        0
Colour      0
Odometer    0
Doors       0
Price       2
dtype: int64

In [90]:
# Remove rows with missing Price value
car_sales_missing.dropna(inplace=True)

In [77]:
car_sales_missing.isna().sum()

Make        0
Colour      0
Odometer    0
Doors       0
Price       0
dtype: int64

### Option 2. Fill missing data with Scikitlearn

In [123]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [124]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [125]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [126]:
# Split into X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Split data into train and test
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

In [129]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill the categorical values with 'missing' & numeric value with mean
categorical_imputer = SimpleImputer(strategy='constant', fill_value = 'missing')
door_imputer = SimpleImputer(strategy='constant',fill_value=4)
odometer_imputer = SimpleImputer(strategy='mean')

#Define columns
cateogorical_feature = ["Make","Colour"]
door_feature = ["Doors"]
odometer_feature = ["Odometer (KM)"]

#Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ('categorical_imputer',categorical_imputer,cateogorical_feature),
    ('door_imputer',door_imputer,door_feature),
    ('odometer_imputer',odometer_imputer,odometer_feature)
])

#Transform the data
# Fill train and test values separately
filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.transform(X_test)
filled_X_train

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], shape=(760, 4), dtype=object)

In [130]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, 
                                     columns=["Make", "Colour", "Doors", "Odometer (KM)"])

# Check missing data in training set
car_sales_filled_train.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [98]:
car_sales_filled.isna().sum()

Make        0
Colour      0
Doors       0
Odometer    0
dtype: int64

In [131]:
#convert to numeric
cateogorical_feature = ["Make","Colour","Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",one_hot,cateogorical_feature)], remainder="passthrough")
# Fill train and test values separately
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)
# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]], shape=(760, 15))

In [133]:
# now we go the data as numbers and filled the missing values
# lets fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100)
model.fit(transformed_X_train,y_train)
model.score(transformed_X_test,y_test)

0.21229043336119102

## 2.Choosing the right estimator/algorithm for a problem

Some things to notes:
*sklearn refers to machone learning models,algortithms as estimators.
* Classification problem = predicting category (heart disease or not)
    * `clf` (short form form classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)
* MachineLearningMap : https://scikit-learn.org/stable/machine_learning_map.html

### 2.1 Picking a machine Learning model for a regression problem
 using California Housing Dataset - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html

In [140]:
# Get california housing data
from sklearn.datasets import fetch_california_housing
housing_data = fetch_california_housing()
housing_data

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]], shape=(20640, 8)),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,)),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': 

In [141]:
# Create a DataFrame with housing_data
housing_data_df = pd.DataFrame(housing_data['data'],columns=housing_data["feature_names"])
housing_data_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [143]:
# the target field gives the Medhouse value create a column in housing_data_df for target
housing_data_df["target"] = housing_data["target"]
housing_data_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [149]:
# Import algorithm
from sklearn.linear_model import Ridge
# Setup random seed
np.random.seed(42)
# Create the data
X = housing_data_df.drop("target",axis=1)
y = housing_data_df["target"] # median house price in $100,00s
# Split into train and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Instantiate and fit the model on trainning set
model = Ridge(alpha=0.5)
model.fit(X_train,y_train)

#Check the score of the model
model.score(X_test,y_test)

0.5758213996714423

In [148]:
# trying another model Lasso
from sklearn.linear_model import Lasso
model = Lasso()
model.fit(X_train,y_train)

#Check the score of the model
model.score(X_test,y_test)

0.2841671821008396

In [151]:
# trying another model - Essemble model
# import the randomForestRegressor model class from the ensemble module
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
# Create the data
X = housing_data_df.drop("target",axis=1)
y = housing_data_df["target"] # median house price in $100,00s
# Split into train and test sets
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.806652667101436

## 2.2 Picking a machine Learning model for a classification problem

* Using breast_cancer dataset from scikitlearn - https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer
* MachineMachineLearningMap : https://scikit-learn.org/stable/machine_learning_map.html

In [152]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]], shape=(569, 30)),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,

In [159]:
#Convert the imported dataset to DataFrame
data_df = pd.DataFrame(data['data'],columns=data['feature_names'])
data_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [160]:
# Add the target column to the Dataset
data_df['target'] = data['target']
data_df

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


In [164]:
## No missing data
data_df.isna().sum()

mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

In [167]:
# All data are numeric 
data_df.dtypes

mean radius                float64
mean texture               float64
mean perimeter             float64
mean area                  float64
mean smoothness            float64
mean compactness           float64
mean concavity             float64
mean concave points        float64
mean symmetry              float64
mean fractal dimension     float64
radius error               float64
texture error              float64
perimeter error            float64
area error                 float64
smoothness error           float64
compactness error          float64
concavity error            float64
concave points error       float64
symmetry error             float64
fractal dimension error    float64
worst radius               float64
worst texture              float64
worst perimeter            float64
worst area                 float64
worst smoothness           float64
worst compactness          float64
worst concavity            float64
worst concave points       float64
worst symmetry      

In [170]:
# Import algorithm
from sklearn.svm import LinearSVC
np.random.seed(42)
# Create X and y dataset
X = data_df.drop("target",axis=1)
y = data_df["target"]
#Split the dataset into training and testing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Instantiate and fit the model on trainning set
clf = LinearSVC()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.956140350877193

In [172]:
# try another algorithm - RandomForestClassifier
# Import algorithm
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
# Create X and y dataset
X = data_df.drop("target",axis=1)
y = data_df["target"]
#Split the dataset into training and testing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Instantiate and fit the model on trainning set
clf = RandomForestClassifier()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.9649122807017544

### Tidbit:
         1. If you have structured data(in form of tables), use ensemble methods
         2. If you have unstructured data(like images,audio,text), use deep learning or transfer learning

## 3.Fit the model/algorithm on our data and use it to make predictions

### 3.1 Fitting the model on the data
Different names for:
 * `X` = features, features variables, data
 * `y` = labels, targets, target variables

In [174]:
# Import algorithm
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)
# Create X and y dataset
X = data_df.drop("target",axis=1)
y = data_df["target"]
#Split the dataset into training and testing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Instantiate and fit the model on trainning set
clf = RandomForestClassifier()

#Fit the model to the data
clf.fit(X_train,y_train)

clf.score(X_test,y_test)

0.9649122807017544

### 3.2 Make predictions using a machine learning model
Once your model instance is trained, you can use the predict() method to predict a target value given a set of features.
In other words, use the model, along with some new, unseen and unlabelled data to predict the label.
Note: Data you predict on should be in the same shape and format as data you trained on.
2 ways to male predictions:
    1.`predict()` 
    2.`predict_proba()`

In [181]:
# import the csv file and model
heart_disease = pd.read_csv("heart-disease.csv")
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

# Create feature matrix and label vector
X = heart_disease.drop("target",axis = 1)
y = heart_disease["target"]

# Split the data set inro training and test set
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Select required model
model = RandomForestClassifier(n_estimators=100)

#fit the training data into the model
model.fit(X_train,y_train)

#score the model
model.score(X_test,y_test)


0.8524590163934426

In [185]:
y_preds = model.predict(np.array([0,2,3,5,6])) # This doesn't work... incorrect shapes



ValueError: Expected 2D array, got 1D array instead:
array=[0. 2. 3. 5. 6.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

We get a ValueError (mismatched shapes):

ValueError: Expected 2D array, got 1D array instead:
array=[0. 2. 3. 4.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
This happens because we're trying to make predictions on data that is in a different format to the data our model was trained on.

Since our model was trained on data from X_train, predictions should be made on data in the same format and shape as X_train.

Our goal in many machine learning problems is to use patterns learned from the training data to make predictions on the test data (or future unseen data).

In [186]:
y_preds = model.predict(X_test)
y_preds

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [188]:
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0])

In [190]:
# Compare the prediction (y_preds) to true label (y_test) to evaluate the model
np.mean(y_preds == y_test)

np.float64(0.8524590163934426)

In [191]:
model.score(X_test,y_test)

0.8524590163934426

#### Make predictions with `predict_proba()`
predict_proba() - Returns probabilities for all classification label
Unlike predict(), which outputs the most likely class label, predict_proba() provides a more granular insight into the model's confidence for each possible outcome.

predict_proba() 

*   It returns the probabilities of each class.
    For binary classification (2 classes):
    Returns an array with two probabilities for each sample.
    Example output for one sample: [0.2, 0.8]
    20% chance of class 0
    80% chance of class 1
    For multiclass classification (>2 classes):
    Returns probabilities for each class.
    Example: [0.1, 0.3, 0.6] for 3 classes.
    All probabilities for each sample sum to 1.

In [192]:
model.predict_proba(X_test[:5])

array([[0.89, 0.11],
       [0.49, 0.51],
       [0.43, 0.57],
       [0.84, 0.16],
       [0.18, 0.82]])

In [193]:
model.predict(X_test[:5])

array([0, 1, 1, 0, 1])

`predict()` can also be used for regression model

In [194]:
housing_data_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
