# Breast Cancer
- https://www.kaggle.com/shasun/tool-wear-detection-in-cnc-mill
- Predict "Machining_Process"

## Setup

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score

In [2]:
# Number of features to select.
max_features = 10

# Transform X with variance selector

In [3]:
from sklearn.feature_selection import VarianceThreshold

dataObj = load_breast_cancer()
X = dataObj.data
y = dataObj.target
colsX = dataObj.feature_names

# Creating object
sel = VarianceThreshold(threshold=0)

# Fitting
sel.fit(X)

# Boolean
sup = sel.get_support()
cols_sel = colsX[sup]
print(f'Chosen columns:')
print('-'*30)
print(*cols_sel, sep=', ')

cols_rem = colsX[np.logical_not(sup)]
print(f'\nRemoved columns:')
print('-'*30)
print(*cols_rem, sep=', ')

X = sel.transform(X)
dfX = pd.DataFrame(data=X, columns=cols_sel)
display(dfX.head())

# Initialize dictionary to store selector
sel = {}

Chosen columns:
------------------------------
mean radius, mean texture, mean perimeter, mean area, mean smoothness, mean compactness, mean concavity, mean concave points, mean symmetry, mean fractal dimension, radius error, texture error, perimeter error, area error, smoothness error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst radius, worst texture, worst perimeter, worst area, worst smoothness, worst compactness, worst concavity, worst concave points, worst symmetry, worst fractal dimension

Removed columns:
------------------------------



Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## L1

In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
print(X_train_std.shape)
# Classifier
lr = LogisticRegression(solver='liblinear', penalty='l1', C=0.1, multi_class='ovr')

# Create selector object
threshold=1e-5
sel['L1'] = SelectFromModel(estimator=lr, norm_order=1, threshold=threshold, max_features=max_features)

# Training
sel['L1'].fit(X_train_std, y_train)

# Extract norm of weights
coef = sel['L1'].estimator_.coef_
coef_norm = np.linalg.norm(coef , ord=1, axis=0)

# Get column boolean
sup = sel['L1'].get_support()
print('\nSupport array')
print('-'*30)
print(*sup, sep=', ')

# Store selector


(398, 30)

Support array
------------------------------
False, False, False, False, False, False, True, True, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, True, False, False


In [5]:
# Sort array from based on weights from large to small
cols = dfX.columns
idxs = np.argsort(coef_norm)[::-1]
sup = sup[idxs]
coef_norm = coef_norm[idxs]
cols = cols[idxs]

# Print results
for count, (col, coef) in enumerate(zip(cols, coef_norm)):
    print(f"{count+1:2d}) {col:30s} \t{coef:5.3f}")

 1) worst radius                   	2.031
 2) mean concave points            	0.888
 3) worst concave points           	0.710
 4) worst texture                  	0.579
 5) mean concavity                 	0.238
 6) worst symmetry                 	0.121
 7) worst smoothness               	0.052
 8) mean compactness               	0.000
 9) mean fractal dimension         	0.000
10) mean symmetry                  	0.000
11) worst fractal dimension        	0.000
12) texture error                  	0.000
13) mean smoothness                	0.000
14) mean area                      	0.000
15) mean perimeter                 	0.000
16) mean texture                   	0.000
17) radius error                   	0.000
18) smoothness error               	0.000
19) perimeter error                	0.000
20) area error                     	0.000
21) compactness error              	0.000
22) concavity error                	0.000
23) concave points error           	0.000
24) symmetry error                

In [6]:
# Print columns
cols_sel = cols[sup]
print('\nChosen columns')
print('-'*30)
print(*cols_sel, sep=', ')

cols_rem = cols[np.logical_not(sup)]
print(f'\nRemoved columns')
print('-'*30)
print(*cols_rem, sep=', ')


Chosen columns
------------------------------
worst radius, mean concave points, worst concave points, worst texture, mean concavity

Removed columns
------------------------------
worst symmetry, worst smoothness, mean compactness, mean fractal dimension, mean symmetry, worst fractal dimension, texture error, mean smoothness, mean area, mean perimeter, mean texture, radius error, smoothness error, perimeter error, area error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst perimeter, worst area, worst compactness, worst concavity, mean radius


## Importance

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
print(X_train_std.shape)

#Create classifier
forest = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1)

#Create object
threshold = 1e-15
sel['imp'] = SelectFromModel(forest, threshold=threshold, max_features=max_features)

# Training (Actually, it does not matter whether we have X_train_std or X_train because random forrest does not care)
sel['imp'].fit(X_train_std, y_train)

# Columns chosen
sup = sel['imp'].get_support()

# Extract importances values
importances = sel['imp'].estimator_.feature_importances_

# Select only chosen columns
print('\nSupport array')
print('-'*30)
print(*sup, sep=', ')

(398, 30)

Support array
------------------------------
False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, True, False, False, False, True, False, False


In [8]:
# Sort array from based on importances from large to small
cols = dfX.columns
idxs = np.argsort(importances)[::-1]
sup = sup[idxs]
importances = importances[idxs]
cols = cols[idxs]

# Print results
for count, (col, importance) in enumerate(zip(cols, importances)):
    print(f"{count+1:2d}) {col:30s} \t{importance:5.3f}")

 1) worst perimeter                	0.151
 2) worst radius                   	0.123
 3) worst concave points           	0.116
 4) worst area                     	0.104
 5) mean concave points            	0.100
 6) mean concavity                 	0.065
 7) mean perimeter                 	0.047
 8) worst concavity                	0.041
 9) mean area                      	0.039
10) mean radius                    	0.038
11) area error                     	0.026
12) radius error                   	0.020
13) worst texture                  	0.016
14) perimeter error                	0.013
15) mean texture                   	0.013
16) mean compactness               	0.012
17) worst compactness              	0.011
18) worst smoothness               	0.010
19) worst symmetry                 	0.008
20) worst fractal dimension        	0.007
21) texture error                  	0.005
22) mean smoothness                	0.005
23) compactness error              	0.004
24) concavity error               

In [9]:
# Print columns
cols_sel = cols[sup]
print('\nChosen columns')
print('-'*30)
print(*cols_sel, sep=', ')

cols_rem = cols[np.logical_not(sup)]
print(f'\nRemoved columns')
print('-'*30)
print(*cols_rem, sep=', ')


Chosen columns
------------------------------
worst perimeter, worst radius, worst concave points, worst area, mean concave points

Removed columns
------------------------------
mean concavity, mean perimeter, worst concavity, mean area, mean radius, area error, radius error, worst texture, perimeter error, mean texture, mean compactness, worst compactness, worst smoothness, worst symmetry, worst fractal dimension, texture error, mean smoothness, compactness error, concavity error, symmetry error, concave points error, mean fractal dimension, mean symmetry, fractal dimension error, smoothness error


# Sequential feature selector

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SequentialFeatureSelector

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
print(X_train_std.shape)

# Create classifier
knn = KNeighborsClassifier(n_neighbors=3)
forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Create selector object
sel['seq_forward'] = SequentialFeatureSelector(knn, direction='forward', n_features_to_select=max_features, scoring='accuracy', cv=3, n_jobs=-1)

# Training
sel['seq_forward'].fit(X_train_std, y_train)

# Get column boolean
sup = sel['seq_forward'].get_support()
print('\nSupport array')
print('-'*30)
print(*sup, sep=', ')

(398, 30)

Support array
------------------------------
False, True, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, True, False, False


In [12]:
# Print columns
cols = dfX.columns
cols_sel = cols[sup]
print('\nChosen columns')
print('-'*30)
print(*cols_sel, sep=', ')

cols_rem = cols[np.logical_not(sup)]
print(f'\nRemoved columns')
print('-'*30)
print(*cols_rem, sep=', ')


Chosen columns
------------------------------
mean texture, mean area, worst radius, worst texture, worst concave points

Removed columns
------------------------------
mean radius, mean perimeter, mean smoothness, mean compactness, mean concavity, mean concave points, mean symmetry, mean fractal dimension, radius error, texture error, perimeter error, area error, smoothness error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst perimeter, worst area, worst smoothness, worst compactness, worst concavity, worst symmetry, worst fractal dimension


In [13]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
print(X_train_std.shape)

# Create classifier
knn = KNeighborsClassifier(n_neighbors=3)
forest = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Create selector object
sel['seq_backward'] = SequentialFeatureSelector(knn, direction='backward', n_features_to_select=max_features, scoring='accuracy', cv=3, n_jobs=-1)

# Training
sel['seq_backward'].fit(X_train_std, y_train)

# Get column boolean
sup = sel['seq_backward'].get_support()
print('\nSupport array')
print('-'*30)
print(*sup, sep=', ')

(398, 30)

Support array
------------------------------
False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, False, False, False, False, True, False


In [14]:
# Print columns
cols = dfX.columns
cols_sel = cols[sup]
print('\nChosen columns')
print('-'*30)
print(*cols_sel, sep=', ')

cols_rem = cols[np.logical_not(sup)]
print(f'\nRemoved columns')
print('-'*30)
print(*cols_rem, sep=', ')


Chosen columns
------------------------------
mean concavity, worst texture, worst perimeter, worst area, worst symmetry

Removed columns
------------------------------
mean radius, mean texture, mean perimeter, mean area, mean smoothness, mean compactness, mean concave points, mean symmetry, mean fractal dimension, radius error, texture error, perimeter error, area error, smoothness error, compactness error, concavity error, concave points error, symmetry error, fractal dimension error, worst radius, worst smoothness, worst compactness, worst concavity, worst concave points, worst fractal dimension


## Training with random forrest

In [15]:
from sklearn.ensemble import RandomForestClassifier

paramSetAll = {
    "ex1": {
        "criterion": "gini",
        "n_estimators": 25,
        "max_samples": None,
        "max_features": "auto",
        "max_depth": None,
    },
    "ex2": {
        "criterion": "gini",
        "n_estimators": 50,
        "max_samples": None,
        "max_features": "auto",
        "max_depth": None,
    },
    "ex3": {
        "criterion": "gini",
        "n_estimators": 100,
        "max_samples": None,
        "max_features": "auto",
        "max_depth": None,
    },
}

In [16]:
cl = list(range(1,4))
ca = [ f'ex{i}' for i in cl]
paramSet = { k: paramSetAll[k] for k in ca} 
print(paramSet)

{'ex1': {'criterion': 'gini', 'n_estimators': 25, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}, 'ex2': {'criterion': 'gini', 'n_estimators': 50, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}, 'ex3': {'criterion': 'gini', 'n_estimators': 100, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}}


In [17]:
def training(X_train, y_train, X_test, y_test):
    for paramName, paramValue in paramSet.items():
        # Extract parameteres
        forrest = RandomForestClassifier(**paramValue)

        # Training
        forrest.fit(X_train, y_train)

        # Prediction
        y_pred = forrest.predict(X_test)

        # Misclassification from the test samples
        sumMiss = (y_test != y_pred).sum()

        # Accuracy score from the test samples
        accuracyScore = accuracy_score(y_test, y_pred)

        print(f"Parameters: {paramValue}")
        print(f"Misclassified examples: {sumMiss}")
        print(f"Accuracy score: {accuracyScore}")
        print(f"--------------------------------------------------")

In [18]:
# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# Standardize
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
print(X_train_std.shape)

(398, 30)


In [19]:
# No transformation
training(X_train_std, y_train, X_test_std, y_test)

Parameters: {'criterion': 'gini', 'n_estimators': 25, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 13
Accuracy score: 0.9239766081871345
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 50, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 100, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------


In [20]:
X_train_std_trans = sel['L1'].transform(X_train_std)
X_test_std_trans = sel['L1'].transform(X_test_std)

training(X_train_std_trans, y_train, X_test_std_trans, y_test)

Parameters: {'criterion': 'gini', 'n_estimators': 25, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 11
Accuracy score: 0.935672514619883
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 50, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 100, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------


In [21]:
X_train_std_trans = sel['imp'].transform(X_train_std)
X_test_std_trans = sel['imp'].transform(X_test_std)

training(X_train_std_trans, y_train, X_test_std_trans, y_test)

Parameters: {'criterion': 'gini', 'n_estimators': 25, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 14
Accuracy score: 0.9181286549707602
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 50, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 13
Accuracy score: 0.9239766081871345
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 100, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 12
Accuracy score: 0.9298245614035088
--------------------------------------------------


In [22]:
X_train_std_trans = sel['seq_forward'].transform(X_train_std)
X_test_std_trans = sel['seq_forward'].transform(X_test_std)

training(X_train_std_trans, y_train, X_test_std_trans, y_test)

Parameters: {'criterion': 'gini', 'n_estimators': 25, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 50, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 11
Accuracy score: 0.935672514619883
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 100, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------


In [23]:
X_train_std_trans = sel['seq_backward'].transform(X_train_std)
X_test_std_trans = sel['seq_backward'].transform(X_test_std)

training(X_train_std_trans, y_train, X_test_std_trans, y_test)

Parameters: {'criterion': 'gini', 'n_estimators': 25, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 14
Accuracy score: 0.9181286549707602
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 50, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 9
Accuracy score: 0.9473684210526315
--------------------------------------------------
Parameters: {'criterion': 'gini', 'n_estimators': 100, 'max_samples': None, 'max_features': 'auto', 'max_depth': None}
Misclassified examples: 8
Accuracy score: 0.9532163742690059
--------------------------------------------------
