# **Boosting**

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from cancer.csv (path is relative to the working directory).
data = pd.read_csv('cancer.csv')

In [3]:
# Summarise the frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 569 non-null    int64  
 1   diagnosis          569 non-null    object 
 2   radius_mean        569 non-null    float64
 3   texture_mean       569 non-null    float64
 4   perimeter_mean     569 non-null    float64
 5   area_mean          569 non-null    float64
 6   smoothness_mean    569 non-null    float64
 7   compactness_mean   569 non-null    float64
 8   concavity_mean     569 non-null    float64
 9   points_mean        569 non-null    float64
 10  symmetry_mean      569 non-null    float64
 11  dimension_mean     569 non-null    float64
 12  radius_se          569 non-null    float64
 13  texture_se         569 non-null    float64
 14  perimeter_se       569 non-null    float64
 15  area_se            569 non-null    float64
 16  smoothness_se      569 non

In [4]:
# Class balance: number of rows per diagnosis value.
data["diagnosis"].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [5]:
# Target: column 1 (`diagnosis`) as an array.
label = data.iloc[:, 1].values
# Predictors: columns 2..31, i.e. every column after `id` and `diagnosis`.
features = data.iloc[:, 2:32].values

In [6]:
from sklearn.preprocessing import LabelEncoder

# XGBoost (eXtreme Gradient BOOSTing) expects the label column to be numeric,
# so the string diagnosis labels are label-encoded to integers here.
# NOTE: `label` is overwritten in place. Re-running this cell re-fits `le` on
# the already-encoded integers, so `le.classes_` would then no longer hold the
# original string classes.
le = LabelEncoder()
label = le.fit(label).transform(label)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=1
)

In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression model fitted on the current training split.
# max_iter is raised to 10000 iterations for the solver.
model = LogisticRegression(max_iter=10_000).fit(X_train, y_train)
# Trailing expression so the cell still displays the fitted estimator.
model

In [9]:
# Score of the current `model` on the training split (mean accuracy for scikit-learn classifiers).
model.score(X_train,y_train)

0.9560439560439561

In [10]:
# Score of the current `model` on the held-out test split (mean accuracy for scikit-learn classifiers).
model.score(X_test,y_test)

0.9824561403508771

In [11]:
# Boosting
# XGBoost can be used for supervised learning only (regression and classification)
# XGBoost uses decision trees as the weak learners in XGBClassifier and XGBRegressor
# XGBoost uses random forests as the weak learners in XGBRFClassifier and XGBRFRegressor
# Classification: XGBClassifier | XGBRFClassifier
# Regression: XGBRegressor | XGBRFRegressor

In [12]:
# pip install xgboost

In [13]:
from xgboost import XGBClassifier

# Test-accuracy threshold a split has to beat before it is reported.
CL = 0.94

# Try split seeds 1..100: refit a default XGBClassifier on each 80/20 split
# and print the seeds whose test accuracy is strictly above CL.
# NOTE(review): picking a random_state by its test score lets the test set
# steer the selection, so the reported scores are optimistic.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )
    model = XGBClassifier().fit(X_train, y_train)
    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    if not testScore > CL:
        continue
    print(f"Test Score : {testScore} Train Score: {trainScore} RS : {i}")

Test Score : 0.956140350877193 Train Score: 1.0 RS : 1
Test Score : 0.9824561403508771 Train Score: 1.0 RS : 2
Test Score : 0.9649122807017544 Train Score: 1.0 RS : 3
Test Score : 0.956140350877193 Train Score: 1.0 RS : 4
Test Score : 0.9649122807017544 Train Score: 1.0 RS : 5
Test Score : 0.9736842105263158 Train Score: 1.0 RS : 6
Test Score : 0.9736842105263158 Train Score: 1.0 RS : 7
Test Score : 0.9649122807017544 Train Score: 1.0 RS : 9
Test Score : 0.9824561403508771 Train Score: 1.0 RS : 10
Test Score : 0.9824561403508771 Train Score: 1.0 RS : 11
Test Score : 0.9649122807017544 Train Score: 1.0 RS : 12
Test Score : 0.9824561403508771 Train Score: 1.0 RS : 13
Test Score : 0.9824561403508771 Train Score: 1.0 RS : 14
Test Score : 0.9473684210526315 Train Score: 1.0 RS : 15
Test Score : 0.9912280701754386 Train Score: 1.0 RS : 16
Test Score : 0.9649122807017544 Train Score: 1.0 RS : 17
Test Score : 0.9824561403508771 Train Score: 1.0 RS : 19
Test Score : 0.9824561403508771 Train Sco

In [14]:
from xgboost import XGBClassifier

# Test-accuracy threshold a split has to beat before it is reported.
CL = 0.94

# Try split seeds 1..100 with a very small learning rate (0.001) and all
# other XGBClassifier settings left at their defaults. Only seeds whose test
# accuracy is strictly above CL are printed.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )
    model = XGBClassifier(learning_rate=0.001).fit(X_train, y_train)
    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    if not testScore > CL:
        continue
    print(f"Test Score : {testScore} Train Score: {trainScore} RS : {i}")

In [15]:
from xgboost import XGBClassifier

# Minimum test accuracy for a split to be reported.
CL = 0.94

# Try split seeds 1..100 with a tuned XGBClassifier (depth-4 trees, 200
# boosting rounds, learning rate 0.1). A seed is printed only when the test
# accuracy is at least CL and also strictly higher than the train accuracy.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )
    model = XGBClassifier(max_depth=4, n_estimators=200, learning_rate=0.1)
    model.fit(X_train, y_train)
    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    # Chained comparison: CL <= testScore and testScore > trainScore.
    if CL <= testScore > trainScore:
        print(f"Test Score : {testScore} Train Score: {trainScore} RS : {i}")

In [16]:
from xgboost import XGBRFClassifier

# Minimum test accuracy for a split to be reported.
CL = 0.94

# Try split seeds 1..100 with a default XGBRFClassifier (XGBoost's random
# forest classifier). A seed is printed only when the test accuracy is at
# least CL and also strictly higher than the train accuracy.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )
    model = XGBRFClassifier()
    model.fit(X_train, y_train)
    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    # Chained comparison: CL <= testScore and testScore > trainScore.
    if CL <= testScore > trainScore:
        print(f"Test Score : {testScore} Train Score: {trainScore} RS : {i}")

Test Score : 0.9912280701754386 Train Score: 0.9912087912087912 RS : 11
Test Score : 0.9912280701754386 Train Score: 0.9868131868131869 RS : 44
Test Score : 0.9912280701754386 Train Score: 0.989010989010989 RS : 72


In [17]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

# Alternative configuration: GradientBoostingClassifier(n_estimators=100, learning_rate=1.0)

# Minimum test accuracy for a split to be reported.
CL = 0.94

# Try split seeds 1..100 with scikit-learn's GradientBoostingClassifier
# (150 boosting stages, learning rate 0.1). A seed is printed only when the
# test accuracy is at least CL and also strictly higher than the train accuracy.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )
    # Seed the model as well as the split. The trees permute features at each
    # split, so without a fixed random_state equally good splits can be chosen
    # differently between runs and a printed RS would not reliably reproduce
    # its scores. Reusing `i` makes the reported RS sufficient on its own.
    model = GradientBoostingClassifier(
        n_estimators=150, learning_rate=0.1, random_state=i
    )
    model.fit(X_train, y_train)
    trainScore = model.score(X_train, y_train)
    testScore = model.score(X_test, y_test)

    if testScore > trainScore and testScore >= CL:
        print(f"Test Score : {testScore} Train Score: {trainScore} RS : {i}")