# Chương 15: Cải thiện Hiệu suất với Các Mô Hình Hợp Thành (Ensembles)

### Cây quyết định gộp (Bagged Decision Trees)

In [24]:
# Bagged Decision Trees for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# --- data -------------------------------------------------------------
# Column names are passed explicitly, so every row of the CSV is read as data.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)
array = dataframe.values
# first eight columns are the inputs, the ninth ('class') is the label
X, Y = array[:, :8], array[:, 8]

# --- configuration ----------------------------------------------------
# one seed drives both the fold shuffling and the bagging ensemble
seed = 7
num_trees = 100

# --- model ------------------------------------------------------------
# base learner: a decision tree with default settings
cart = DecisionTreeClassifier()
model = BaggingClassifier(
    estimator=cart,
    n_estimators=num_trees,
    random_state=seed,
)

# --- evaluation -------------------------------------------------------
# 10-fold cross-validation on shuffled data; report the mean fold score
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.7578263841421736


### Rừng ngẫu nhiên (Random Forest)

In [26]:
# Random Forest Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# load data
# Column names are passed explicitly, so every row of the CSV is read as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns: input features
Y = array[:, 8]    # ninth column ('class'): target label

# fixed seed shared by the fold shuffling AND the forest itself; without a
# seed on the model the printed score differs from run to run
seed = 7

# define the number of trees and the number of features considered per split
num_trees = 100
max_features = 3

# define cross-validation method with shuffle=True (for random state to take effect)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# create the Random Forest model (seeded for reproducible results)
model = RandomForestClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)

# evaluate the model using cross-validation
results = cross_val_score(model, X, Y, cv=kfold)

# print the mean score across the 10 folds
print(results.mean())


0.7656185919343814


### Cây cực kỳ ngẫu nhiên (Extra Trees)

In [28]:
# Extra Trees Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

# load data
# Column names are passed explicitly, so every row of the CSV is read as data.
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:, 0:8]  # first eight columns: input features
Y = array[:, 8]    # ninth column ('class'): target label

# fixed seed shared by the fold shuffling AND the ensemble itself; without a
# seed on the model the printed score differs from run to run
seed = 7

# define the number of trees and the number of features considered per split
num_trees = 100
max_features = 7

# define cross-validation method with shuffle=True (for random state to take effect)
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# create the Extra Trees model (seeded for reproducible results)
model = ExtraTreesClassifier(
    n_estimators=num_trees,
    max_features=max_features,
    random_state=seed,
)

# evaluate the model using cross-validation
results = cross_val_score(model, X, Y, cv=kfold)

# print the mean score across the 10 folds
print(results.mean())


0.7551777170198224


### AdaBoost

In [32]:
# AdaBoost Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# --- data -------------------------------------------------------------
# Column names are passed explicitly, so every row of the CSV is read as data.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)
array = dataframe.values
# first eight columns are the inputs, the ninth ('class') is the label
X, Y = array[:, :8], array[:, 8]

# --- configuration ----------------------------------------------------
# one seed drives both the fold shuffling and the boosting model
seed = 7
num_trees = 30  # passed to AdaBoost as n_estimators

# --- model ------------------------------------------------------------
# NOTE(review): the `algorithm` argument appears to be deprecated in newer
# scikit-learn releases -- confirm against the installed version before upgrading.
model = AdaBoostClassifier(
    n_estimators=num_trees,
    random_state=seed,
    algorithm='SAMME',
)

# --- evaluation -------------------------------------------------------
# 10-fold cross-validation on shuffled data; report the mean fold score
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


0.7552460697197538


### Stochastic Gradient Boosting

In [34]:
# Stochastic Gradient Boosting Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

# --- data -------------------------------------------------------------
# Column names are passed explicitly, so every row of the CSV is read as data.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)
array = dataframe.values
# first eight columns are the inputs, the ninth ('class') is the label
X, Y = array[:, :8], array[:, 8]

# --- configuration ----------------------------------------------------
# one seed drives both the fold shuffling and the boosting model
num_trees = 100
seed = 7

# --- model ------------------------------------------------------------
# NOTE(review): `subsample` is left at its default here; the scikit-learn
# docs describe values below 1.0 as what makes the boosting "stochastic" --
# confirm whether that was intended for this section.
model = GradientBoostingClassifier(
    n_estimators=num_trees,
    random_state=seed,
)

# --- evaluation -------------------------------------------------------
# 10-fold cross-validation on shuffled data; report the mean fold score
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


0.7578947368421053


### Mô Hình Voting

In [None]:
# Voting Ensemble for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

# --- data -------------------------------------------------------------
# Column names are passed explicitly, so every row of the CSV is read as data.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
filename = 'pima-indians-diabetes.data.csv'
dataframe = read_csv(filename, names=names)
array = dataframe.values
# first eight columns are the inputs, the ninth ('class') is the label
X, Y = array[:, :8], array[:, 8]

# --- sub-models -------------------------------------------------------
# three different learners, each seeded with the same value as the folds
model1 = LogisticRegression(max_iter=1000, random_state=7)
model2 = DecisionTreeClassifier(random_state=7)
# NOTE(review): the features reach the linear SVC unscaled, which may make
# fitting slow -- consider standardising the inputs if this cell runs long.
model3 = SVC(kernel='linear', random_state=7)
estimators = [
    ('logistic', model1),
    ('cart', model2),
    ('svm', model3),
]

# --- ensemble ---------------------------------------------------------
# hard voting: sub-models are combined by their predicted class labels
ensemble = VotingClassifier(estimators, voting='hard')

# --- evaluation -------------------------------------------------------
# 10-fold cross-validation on shuffled data; report the mean fold score
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

# End