In [13]:
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, mean_squared_error, accuracy_score, r2_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
import time

### Load digits dataset

In [2]:
digits = load_digits()
xd, yd = load_digits(return_X_y=True)
sc = StandardScaler()
sc.fit(xd)
xd_std = sc.transform(xd)
xd_train, xd_test, yd_train, yd_test = train_test_split(xd_std, yd, test_size=0.3, random_state=1)
print(np.shape(xd))
print(np.shape(yd))

(1797, 64)
(1797,)


In [3]:
print(np.where(pd.isna(xd)))

(array([], dtype=int64), array([], dtype=int64))


This dataset has not any missing value.

### Random forest ensemble learning algorithm

In [70]:
start_time = time.time()
forest = RandomForestClassifier(criterion='gini', n_estimators=100, bootstrap=False,  random_state=1, n_jobs=-1)
forest.fit(xd_train, yd_train)
print("Running time:  %s seconds " % (time.time() - start_time))
yd_train_pred = forest.predict(xd_train)
yd_test_pred = forest.predict(xd_test)

Running time:  0.10924100875854492 seconds 


In [71]:
print('MSE of train data: %s' % (mean_squared_error(yd_train, yd_train_pred)))
print('MSE of test data: %s' % (mean_squared_error(yd_test, yd_test_pred)))
print('R2 score of train data: %s' % (r2_score(yd_train, yd_train_pred)))
print('R2 score of test data: %s' %  (r2_score(yd_test, yd_test_pred)))
print('Accuracy score of train data: %s' %  (accuracy_score(yd_train, yd_train_pred)))
print('Accuracy score of test data: %s' %  (accuracy_score(yd_test, yd_test_pred)))
print('Train data:')
print(classification_report(yd_train, yd_train_pred))
print('Test data:')
print(classification_report(yd_test, yd_test_pred))

MSE of train data: 0.0
MSE of test data: 0.17407407407407408
R2 score of train data: 1.0
R2 score of test data: 0.9789225765945926
Accuracy score of train data: 1.0
Accuracy score of test data: 0.9851851851851852
Train data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       119
           1       1.00      1.00      1.00       133
           2       1.00      1.00      1.00       128
           3       1.00      1.00      1.00       119
           4       1.00      1.00      1.00       120
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       130
           7       1.00      1.00      1.00       122
           8       1.00      1.00      1.00       128
           9       1.00      1.00      1.00       123

    accuracy                           1.00      1257
   macro avg       1.00      1.00      1.00      1257
weighted avg       1.00      1.00      1.00      1257

Test data:
     

### Bagging ensemble learning algorithm

In [105]:
start_time = time.time()
model = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)
bagging = BaggingClassifier(base_estimator=model, n_estimators=200, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=True, n_jobs=-1, random_state=1)
bagging.fit(xd_train, yd_train)
print("Running time:  %s seconds " % (time.time() - start_time))
yd_train_pred = bagging.predict(xd_train)
yd_test_pred = bagging.predict(xd_test)

Running time:  0.4757039546966553 seconds 


In [106]:
print('MSE of train data: %s' % (mean_squared_error(yd_train, yd_train_pred)))
print('MSE of test data: %s' % (mean_squared_error(yd_test, yd_test_pred)))
print('R2 score of train data: %s' % (r2_score(yd_train, yd_train_pred)))
print('R2 score of test data: %s' %  (r2_score(yd_test, yd_test_pred)))
print('Accuracy score of train data: %s' %  (accuracy_score(yd_train, yd_train_pred)))
print('Accuracy score of test data: %s' %  (accuracy_score(yd_test, yd_test_pred)))
print('Train data:')
print(classification_report(yd_train, yd_train_pred))
print('Test data:')
print(classification_report(yd_test, yd_test_pred))

MSE of train data: 0.0
MSE of test data: 0.37407407407407406
R2 score of train data: 1.0
R2 score of test data: 0.9547059624692309
Accuracy score of train data: 1.0
Accuracy score of test data: 0.9666666666666667
Train data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       119
           1       1.00      1.00      1.00       133
           2       1.00      1.00      1.00       128
           3       1.00      1.00      1.00       119
           4       1.00      1.00      1.00       120
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       130
           7       1.00      1.00      1.00       122
           8       1.00      1.00      1.00       128
           9       1.00      1.00      1.00       123

    accuracy                           1.00      1257
   macro avg       1.00      1.00      1.00      1257
weighted avg       1.00      1.00      1.00      1257

Test data:
     

### AdaBoost ensemble learning algorithm

In [111]:
start_time = time.time()
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=1)
adaboost = AdaBoostClassifier(base_estimator=model, n_estimators=300, learning_rate=0.01, random_state=1)
adaboost.fit(xd_train, yd_train)
print("Running time:  %s seconds " % (time.time() - start_time))
yd_train_pred = adaboost.predict(xd_train)
yd_test_pred = adaboost.predict(xd_test)

Running time:  2.8625757694244385 seconds 


In [112]:
print('MSE of train data: %s' % (mean_squared_error(yd_train, yd_train_pred)))
print('MSE of test data: %s' % (mean_squared_error(yd_test, yd_test_pred)))
print('R2 score of train data: %s' % (r2_score(yd_train, yd_train_pred)))
print('R2 score of test data: %s' %  (r2_score(yd_test, yd_test_pred)))
print('Accuracy score of train data: %s' %  (accuracy_score(yd_train, yd_train_pred)))
print('Accuracy score of test data: %s' %  (accuracy_score(yd_test, yd_test_pred)))
print('Train data:')
print(classification_report(yd_train, yd_train_pred))
print('Test data:')
print(classification_report(yd_test, yd_test_pred))

MSE of train data: 0.0
MSE of test data: 1.0037037037037038
R2 score of train data: 1.0
R2 score of test data: 0.8784684735560553
Accuracy score of train data: 1.0
Accuracy score of test data: 0.95
Train data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       119
           1       1.00      1.00      1.00       133
           2       1.00      1.00      1.00       128
           3       1.00      1.00      1.00       119
           4       1.00      1.00      1.00       120
           5       1.00      1.00      1.00       135
           6       1.00      1.00      1.00       130
           7       1.00      1.00      1.00       122
           8       1.00      1.00      1.00       128
           9       1.00      1.00      1.00       123

    accuracy                           1.00      1257
   macro avg       1.00      1.00      1.00      1257
weighted avg       1.00      1.00      1.00      1257

Test data:
              precis

### Load Mammographic Mass Data Set

In [10]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
mam_data = pd.read_csv(url, header=None)
print(np.shape(mam_data))

(961, 6)


### Handle the missing values of a dataset

In [11]:
# Repalce the missing value with np.nan to become distinguishabe
mam_data1 = mam_data.replace(to_replace = "?", value = np.nan)
# Counting the missing values of data
mam_data1.isnull().sum()

0     2
1     5
2    31
3    48
4    76
5     0
dtype: int64

In [14]:
# Replace the missing value of data by calculating the median value along each column
m = SimpleImputer(missing_values=np.nan, strategy='median')
mm = m.fit(mam_data1)    
mam_data_n = mm.transform(mam_data1)

In [15]:
xm = mam_data_n[:, 0:-1]
ym = mam_data_n[:, -1]
sc = StandardScaler()
sc.fit(xm)
xm_std = sc.transform(xm)
xm_train, xm_test, ym_train, ym_test = train_test_split(xm_std, ym, test_size=0.3, random_state=1)
print(np.shape(xm))
print(np.shape(ym))

(961, 5)
(961,)


### Random forest ensemble learning algorithm

In [119]:
start_time = time.time()
forest = RandomForestClassifier(criterion='gini', n_estimators=200, bootstrap=False,  random_state=1, n_jobs=-1)
forest.fit(xm_train, ym_train)
print("Running time:  %s seconds " % (time.time() - start_time))
ym_train_pred = forest.predict(xm_train)
ym_test_pred = forest.predict(xm_test)

Running time:  0.16777610778808594 seconds 


In [120]:
print('MSE of train data: %s' % (mean_squared_error(ym_train, ym_train_pred)))
print('MSE of test data: %s' % (mean_squared_error(ym_test, ym_test_pred)))
print('R2 score of train data: %s' % (r2_score(ym_train, ym_train_pred)))
print('R2 score of test data: %s' %  (r2_score(ym_test, ym_test_pred)))
print('Accuracy score of train data: %s' %  (accuracy_score(ym_train, ym_train_pred)))
print('Accuracy score of test data: %s' %  (accuracy_score(ym_test, ym_test_pred)))
print('Train data:')
print(classification_report(ym_train, ym_train_pred))
print('Test data:')
print(classification_report(ym_test, ym_test_pred))

MSE of train data: 0.05357142857142857
MSE of test data: 0.24567474048442905
R2 score of train data: 0.784521381300603
R2 score of test data: 0.012084737602310991
Accuracy score of train data: 0.9464285714285714
Accuracy score of test data: 0.754325259515571
Train data:
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95       361
         1.0       0.96      0.92      0.94       311

    accuracy                           0.95       672
   macro avg       0.95      0.94      0.95       672
weighted avg       0.95      0.95      0.95       672

Test data:
              precision    recall  f1-score   support

         0.0       0.75      0.81      0.78       155
         1.0       0.76      0.69      0.72       134

    accuracy                           0.75       289
   macro avg       0.75      0.75      0.75       289
weighted avg       0.75      0.75      0.75       289



### Bagging ensemble learning algorithm

In [127]:
start_time = time.time()
model = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)
bagging = BaggingClassifier(base_estimator=model, n_estimators=200, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=True, n_jobs=-1, random_state=1)
bagging.fit(xm_train, ym_train)
print("Running time:  %s seconds " % (time.time() - start_time))
ym_train_pred = bagging.predict(xm_train)
ym_test_pred = bagging.predict(xm_test)

Running time:  0.08121323585510254 seconds 


In [128]:
print('MSE of train data: %s' % (mean_squared_error(ym_train, ym_train_pred)))
print('MSE of test data: %s' % (mean_squared_error(ym_test, ym_test_pred)))
print('R2 score of train data: %s' % (r2_score(ym_train, ym_train_pred)))
print('R2 score of test data: %s' %  (r2_score(ym_test, ym_test_pred)))
print('Accuracy score of train data: %s' %  (accuracy_score(ym_train, ym_train_pred)))
print('Accuracy score of test data: %s' %  (accuracy_score(ym_test, ym_test_pred)))
print('Train data:')
print(classification_report(ym_train, ym_train_pred))
print('Test data:')
print(classification_report(ym_test, ym_test_pred))

MSE of train data: 0.07589285714285714
MSE of test data: 0.18685121107266436
R2 score of train data: 0.6947386235091875
R2 score of test data: 0.24862782859894073
Accuracy score of train data: 0.9241071428571429
Accuracy score of test data: 0.8131487889273357
Train data:
              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93       361
         1.0       0.91      0.93      0.92       311

    accuracy                           0.92       672
   macro avg       0.92      0.92      0.92       672
weighted avg       0.92      0.92      0.92       672

Test data:
              precision    recall  f1-score   support

         0.0       0.82      0.83      0.83       155
         1.0       0.80      0.79      0.80       134

    accuracy                           0.81       289
   macro avg       0.81      0.81      0.81       289
weighted avg       0.81      0.81      0.81       289



### AdaBoost ensemble learning algorithm

In [133]:
start_time = time.time()
model = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)
adaboost = AdaBoostClassifier(base_estimator=model, n_estimators=100, learning_rate=0.1, random_state=1)
adaboost.fit(xd_train, yd_train)
print("Running time:  %s seconds " % (time.time() - start_time))
yd_train_pred = adaboost.predict(xd_train)
yd_test_pred = adaboost.predict(xd_test)

Running time:  0.016350269317626953 seconds 


In [134]:
print('MSE of train data: %s' % (mean_squared_error(ym_train, ym_train_pred)))
print('MSE of test data: %s' % (mean_squared_error(ym_test, ym_test_pred)))
print('R2 score of train data: %s' % (r2_score(ym_train, ym_train_pred)))
print('R2 score of test data: %s' %  (r2_score(ym_test, ym_test_pred)))
print('Accuracy score of train data: %s' %  (accuracy_score(ym_train, ym_train_pred)))
print('Accuracy score of test data: %s' %  (accuracy_score(ym_test, ym_test_pred)))
print('Train data:')
print(classification_report(ym_train, ym_train_pred))
print('Test data:')
print(classification_report(ym_test, ym_test_pred))

MSE of train data: 0.07589285714285714
MSE of test data: 0.18685121107266436
R2 score of train data: 0.6947386235091875
R2 score of test data: 0.24862782859894073
Accuracy score of train data: 0.9241071428571429
Accuracy score of test data: 0.8131487889273357
Train data:
              precision    recall  f1-score   support

         0.0       0.94      0.92      0.93       361
         1.0       0.91      0.93      0.92       311

    accuracy                           0.92       672
   macro avg       0.92      0.92      0.92       672
weighted avg       0.92      0.92      0.92       672

Test data:
              precision    recall  f1-score   support

         0.0       0.82      0.83      0.83       155
         1.0       0.80      0.79      0.80       134

    accuracy                           0.81       289
   macro avg       0.81      0.81      0.81       289
weighted avg       0.81      0.81      0.81       289

