In [3]:
# import packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

# Import the model

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# import pipeline
from sklearn.pipeline import Pipeline

In [4]:
# 2. Data Loading
# The iris dataset is requested as pandas objects (as_frame=True) so it can be
# inspected with the usual DataFrame tooling.

iris_data = datasets.load_iris(as_frame=True)
df = iris_data.frame        # full frame, including the 'target' column
features = iris_data.data   # feature columns
label = iris_data.target    # class labels
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
None
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000        

In [5]:
# Check for missing values and duplicated rows
missing_per_column = df.isna().sum()
duplicate_row_count = df.duplicated().sum()
print(missing_per_column)
print(duplicate_row_count)

# 4. Data Cleaning
# Class balance of the labels (reported before the duplicates are removed)
print(label.value_counts())

# Remove duplicated rows, then separate the target column from the features
df = df.drop_duplicates()
label = df['target'].copy()
features = df.drop(columns='target')

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64
1
target
0    50
1    50
2    50
Name: count, dtype: int64


In [12]:
# 5. Data Splitting
# 90% of the rows go to training; the fixed seed makes the shuffle repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    label,
    train_size=0.9,
    shuffle=True,
    random_state=123,
)

In [13]:
# 2. Data Loading
# For this example, let's use iris datasets
# NOTE(review): this cell repeats the earlier loading step and rebinds `df`,
# `features` and `label` to the raw (not de-duplicated) data; the train/test
# split created before it is left untouched.

iris_data = datasets.load_iris(as_frame=True)
df = iris_data.frame
features = iris_data.data
label = iris_data.target
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted an extra "None" line.
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB
None
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000        

In [14]:
# 6. Construct the pipeline
# (A) Map a short name to every scaler and to every model under comparison
from sklearn.base import clone

scaler_dict = {
    'min_max': MinMaxScaler(),
    'standard': StandardScaler()
}

model_dict = {
    'knn': KNeighborsClassifier(),
    'svc': SVC(),
    'decision_tree': DecisionTreeClassifier(),
    'random_forest': RandomForestClassifier()
}

# (B) Build one pipeline per (scaler, model) combination.
# Each pipeline receives its own clone of the scaler and of the model. Placing
# the very same estimator objects in several pipelines makes them share fitted
# state: fitting a later pipeline would silently refit the steps held by an
# earlier one, leaving that earlier pipeline inconsistent.
pipelines = []
for scaler_name, scaler in scaler_dict.items():
    for model_name, model in model_dict.items():
        pipeline = Pipeline([
            (scaler_name, clone(scaler)),
            (model_name, clone(model))
            ])
        pipelines.append(pipeline)

In [15]:
# Show the steps of the second pipeline to confirm how the list was built
second_pipeline = pipelines[1]
print(second_pipeline.steps)

[('min_max', MinMaxScaler()), ('svc', SVC())]


The next step is to train and test each pipeline individually and see which one performs best.

In [16]:
# Define a function to contain the pipeline training and evaluation
from sklearn import metrics

def train_evaluate(pipeline, x_train, y_train, x_test, y_test):
    """Fit a pipeline, print its classification report and return its test score.

    Args:
        pipeline: Estimator exposing ``fit``, ``predict`` and ``score``.
        x_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        x_test: Held-out features used for prediction and scoring.
        y_test: Held-out true labels.

    Returns:
        The value returned by ``pipeline.score(x_test, y_test)``.
    """
    pipeline.fit(x_train, y_train)
    prediction = pipeline.predict(x_test)
    # classification_report takes (y_true, y_pred); passing the predictions
    # first swaps precision with recall and reports support of the predictions.
    print(metrics.classification_report(y_test, prediction))
    accuracy = pipeline.score(x_test, y_test)
    return accuracy

In [17]:
# Train and evaluate every pipeline in turn, collecting the score of each one
accuracy_list = []
for pipeline_number, pipeline in enumerate(pipelines, start=1):
    print("Training and evaluation pipeline #", pipeline_number)
    print("Step: ", pipeline.steps)
    accuracy_list.append(
        train_evaluate(pipeline, x_train, y_train, x_test, y_test)
    )


Training and evaluation pipeline # 1
Step:  [('min_max', MinMaxScaler()), ('knn', KNeighborsClassifier())]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         6

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Training and evaluation pipeline # 2
Step:  [('min_max', MinMaxScaler()), ('svc', SVC())]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         6

    accuracy                           1.00        15
   macro avg       1.00      1.00      1.00        15
weighted avg       1.00      1.00      1.00        15

Training and evaluation pipeline # 3
Step

Because the iris dataset is very simple, we can easily obtain a pipeline that gives 100% accuracy.

Do note that this is usually not the case: the data you get in real-life problems is usually more complicated than this.

In the case of the iris dataset, we can simply choose any pipeline that gives us 100% accuracy (for a more complete picture, you can take a look at the classification reports and compare them).

# Task
1. Try to implement what you have practised using the diabetes dataset.
2. Diabetes dataset: `sklearn.datasets.load_diabetes()`
3. Do take note that this is a regression dataset, so please import the regression models that you have learnt before.

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [20]:
# Fetch the diabetes dataset and pull out its features and target
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

In [23]:
# Hold out half of the samples for testing (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Two-step pipeline: standardize the features, then apply Ridge regression
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('ridge', Ridge()),
]
pipeline = Pipeline(pipeline_steps)

In [28]:
# Train on the training split, then predict the held-out samples
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

# Score reported by the pipeline's own score() on the test split
score = pipeline.score(X_test, y_test)
print(f"Test Score: {score}")

# Mean squared error between the true and the predicted targets
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Test Score: 0.45260276297191915
Mean Squared Error: 2900.1936284934827


In [29]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Diabetes dataset, requested as pandas objects
diabetes_data = load_diabetes(as_frame=True)
X, y = diabetes_data.data, diabetes_data.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features, then fit a linear regression on them
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

# Train on the training split and predict the held-out samples
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Mean squared error between the true and the predicted targets
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Score reported by the pipeline's own score() on the test split
score = pipeline.score(X_test, y_test)
print(f"Test Score: {score}")


Mean Squared Error: 2900.1936284934827
Test Score: 0.45260276297191915


## Model hyperparameter tuning
1. Exhaustive grid search: `GridSearchCV`
2. Random parameter search: `RandomizedSearchCV`

In [30]:
# Imports for the hyperparameter search (placed before the code that follows)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Pick the pipeline with the highest score as the starting point for tuning.
# NOTE(review): `r2_list` is not defined by any cell shown in this notebook; it
# presumably holds one score per entry of `pipelines`, in the same order, and
# must be created before this cell runs -- confirm.
# best_pipline = regressor_pipelines[best_pipline_index[0][0]]
best_pipline_index = np.where(r2_list==np.max(r2_list))
best_pipline = pipelines[best_pipline_index[0][0]]
# The grid-search cell refers to this object as `best_pipeline`; bind that
# spelling as well so both names resolve to the same pipeline.
best_pipeline = best_pipline

In [32]:
# Parameter grid for the GridSearchCV function.
# Parameters of a step inside a Pipeline are addressed as
# '<step name>__<parameter name>' (double underscore); keys without that
# separator are rejected as invalid parameters when the search is fitted.
# NOTE(review): assumes the tuned pipeline has a step named 'random_forest_r'
# and that [10, 100, 200] was meant for its n_estimators -- confirm.
param_grid = {
    # 'min_max__feature_range': [(0, 2), (-1,1), (-2,2)],
    'random_forest_r__n_estimators': [10, 100, 200],
    'random_forest_r__max_depth': [None, 5, 10],
    'random_forest_r__max_leaf_nodes': [None, 5, 10]
}

In [None]:
# Exhaustive search over `param_grid`, starting from the selected pipeline.
# `best_pipline` is the spelling assigned by the selection cell. The split made
# for the regression task is stored in X_train / y_train; lowercase `x_train`
# was created from the iris features and does not belong to the y_train that
# the later cells rebind.
grid_search = GridSearchCV(best_pipline, param_grid=param_grid)
model_grid_search = grid_search.fit(X_train, y_train)

In [40]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.datasets import load_diabetes

# Fetch the diabetes dataset and pull out its features and target
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Standardization: fit the scaler on X, then transform the same data
scaler = StandardScaler()
scaler.fit(X)
X_standardized = scaler.transform(X)

# Normalization: same two steps with the min-max scaler
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X)
X_normalized = min_max_scaler.transform(X)

# Show the first five rows of each scaled matrix
print("Standardized Data:\n", X_standardized[:5])
print("Normalized Data:\n", X_normalized[:5])


Standardized Data:
 [[ 0.80050009  1.06548848  1.29708846  0.45984057 -0.92974581 -0.73206462
  -0.91245053 -0.05449919  0.41853093 -0.37098854]
 [-0.03956713 -0.93853666 -1.08218016 -0.55350458 -0.17762425 -0.40288615
   1.56441355 -0.83030083 -1.43658851 -1.93847913]
 [ 1.79330681  1.06548848  0.93453324 -0.1192138  -0.95867356 -0.71889748
  -0.68024452 -0.05449919  0.06015558 -0.54515416]
 [-1.87244107 -0.93853666 -0.24377122 -0.77064997  0.25629203  0.52539714
  -0.75764652  0.72130245  0.47698252 -0.19682291]
 [ 0.11317236 -0.93853666 -0.76494435  0.45984057  0.08272552  0.32789006
   0.17117751 -0.05449919 -0.67250161 -0.98056821]]
Normalized Data:
 [[0.66666667 1.         0.58264463 0.54929577 0.29411765 0.25697211
  0.20779221 0.28208745 0.562217   0.43939394]
 [0.48333333 0.         0.14876033 0.35211268 0.42156863 0.30677291
  0.62337662 0.14104372 0.22243673 0.16666667]
 [0.88333333 1.         0.51652893 0.43661972 0.28921569 0.25896414
  0.24675325 0.28208745 0.49657763 0.4

In [41]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Fetch the digits dataset and pull out its features and target
digits = load_digits()
X = digits.data
y = digits.target

# Reserve 20% of the samples as a held-out test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Hyperparameters of the neural network, gathered in one place
mlp_settings = dict(
    hidden_layer_sizes=(100,),
    max_iter=300,
    alpha=0.0001,
    solver='adam',
    learning_rate_init=0.001,
)
mlp = MLPClassifier(**mlp_settings)

# Train on the training split
mlp.fit(X_train, y_train)

# Report the score on the held-out test set
score = mlp.score(X_test, y_test)
print(f"Test Score: {score}")


Test Score: 0.975


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Grid search over the SVC parameters C and kernel with 5-fold cross-validation
param_grid = dict(C=[0.1, 1, 10], kernel=['linear', 'rbf'])
svc_estimator = SVC()
grid_search = GridSearchCV(svc_estimator, param_grid, cv=5)
grid_search.fit(X_train, y_train)


In [43]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Randomized search over the SVC parameters C and kernel.
# scipy's uniform(loc, scale) draws from [loc, loc + scale], so uniform(0.1, 10)
# sampled C from [0.1, 10.1]; scale=9.9 yields [0.1, 10], which is presumably the
# intended range (it matches the smallest and largest C of the SVC grid search).
param_dist = {'C': uniform(loc=0.1, scale=9.9), 'kernel': ['linear', 'rbf']}
# random_state pins the sampled candidates so re-running the cell gives the same
# search, consistent with the random-forest randomized search in this notebook.
random_search = RandomizedSearchCV(SVC(), param_dist, n_iter=10, cv=5, random_state=0)
random_search.fit(X_train, y_train)


In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Fetch the digits dataset and pull out its features and target
digits = load_digits()
X = digits.data
y = digits.target

# Reserve 20% of the samples as a held-out test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Base estimator whose hyperparameters are tuned
rf = RandomForestClassifier()

# Candidate values; the grid search evaluates every combination of them
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Configure the 5-fold cross-validated grid search and run it on the training split
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination together with its cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Score the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test Score: {test_score}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.9749419279907083
Test Score: 0.9694444444444444


In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from scipy.stats import randint

# Fetch the digits dataset and pull out its features and target
digits = load_digits()
X = digits.data
y = digits.target

# Reserve 20% of the samples as a held-out test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Base estimator whose hyperparameters are tuned
rf = RandomForestClassifier()

# Distributions (or explicit choices) each sampled candidate is drawn from
param_dist = dict(
    n_estimators=randint(50, 200),
    max_depth=[None, 10, 20, 30],
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
)

# Configure the seeded, 5-fold cross-validated random search (100 candidates)
# and run it on the training split
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)
random_search.fit(X_train, y_train)

# Report the winning combination together with its cross-validation score
print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Score: {random_search.best_score_}")

# Score the best estimator found by the search on the held-out test set
best_model = random_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test Score: {test_score}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 155}
Best Score: 0.9756387921022067
Test Score: 0.9722222222222222


In [56]:
# Wrap the digits feature matrix in a DataFrame and print the target array
digit_features = digits.data
df = pd.DataFrame(digit_features)
# df.describe()
digit_labels = digits.target
print(digit_labels)

[0 1 2 ... 8 9 8]


In [59]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

# Fetch the diabetes dataset and pull out its features and target
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# Reserve 20% of the samples as a held-out test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Base estimator whose hyperparameters are tuned
knn = KNeighborsRegressor()

# Candidate values; the grid search evaluates every combination of them
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    leaf_size=[20, 30, 40],
    p=[1, 2],
)

# Configure the 5-fold cross-validated grid search and run it on the training split
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Report the winning combination together with its cross-validation score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Score the best estimator found by the search on the held-out test set
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)
print(f"Test Score: {test_score}")


Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best Parameters: {'algorithm': 'auto', 'leaf_size': 20, 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
Best Score: 0.46834588028613594
Test Score: 0.23758030750180603


  _data = np.array(data, dtype=dtype, copy=copy,


In [65]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
import numpy as np

# Data: the diabetes dataset (a random-data alternative is kept for reference)
# X, y = np.random.rand(100, 5), np.random.rand(100)
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Models under comparison, keyed by display name
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
)

# Hyperparameter grid of each model, keyed by the same display names
param_grid = dict(
    LinearRegression={'fit_intercept': [True, False]},
    Ridge={'alpha': [0.1, 1.0, 10.0]},
    Lasso={'alpha': [0.1, 1.0, 10.0]},
)

# Run one 5-fold grid search per model and keep the best estimator of each
best_models = {}
for name, model in models.items():
    grid_search = GridSearchCV(
        model,
        param_grid[name],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_

# Report the test-set mean squared error of every tuned model
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name} MSE: {mse}")


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
LinearRegression MSE: 2900.19362849348
Ridge MSE: 2856.486887670654
Lasso MSE: 2798.193485169719
