In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import neighbors
from sklearn import svm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
import warnings
warnings.filterwarnings("ignore")

# Data set

In [3]:
train = pd.read_csv("breast_cancer_train.csv")
test = pd.read_csv("breast_cancer_test.csv")
train.head()

Unnamed: 0,ID,class,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,913102,False,14.64,16.85,94.21,666.0,0.08641,0.06698,0.05192,0.02791,...,16.46,25.44,106.0,831.0,0.1142,0.207,0.2437,0.07828,0.2455,0.06596
1,89511501,False,12.2,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,...,13.75,21.38,91.11,583.1,0.1256,0.1928,0.1167,0.05556,0.2661,0.07961
2,87163,True,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,...,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371
3,894047,False,8.597,18.6,54.09,221.2,0.1074,0.05847,0.0,0.0,...,8.952,22.44,56.65,240.1,0.1347,0.07767,0.0,0.0,0.3142,0.08116
4,86409,False,14.26,19.65,97.83,629.9,0.07837,0.2233,0.3003,0.07798,...,15.3,23.73,107.0,709.0,0.08949,0.4193,0.6783,0.1505,0.2398,0.1082


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       285 non-null    int64  
 1   class                    285 non-null    bool   
 2   radiusMean               285 non-null    float64
 3    textureMean             285 non-null    float64
 4    perimeterMean           285 non-null    float64
 5    areaMean                285 non-null    float64
 6    smoothnessMean          285 non-null    float64
 7    compactnessMean         285 non-null    float64
 8    concavityMean           285 non-null    float64
 9    concavePointsMean       285 non-null    float64
 10   symmetryMean            285 non-null    float64
 11   fractalDimensionMean    285 non-null    float64
 12   radiusStdErr            285 non-null    float64
 13   textureStdErr           285 non-null    float64
 14   perimeterStdErr         2

In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       284 non-null    int64  
 1   radiusMean               284 non-null    float64
 2    textureMean             284 non-null    float64
 3    perimeterMean           284 non-null    float64
 4    areaMean                284 non-null    float64
 5    smoothnessMean          284 non-null    float64
 6    compactnessMean         284 non-null    float64
 7    concavityMean           284 non-null    float64
 8    concavePointsMean       284 non-null    float64
 9    symmetryMean            284 non-null    float64
 10   fractalDimensionMean    284 non-null    float64
 11   radiusStdErr            284 non-null    float64
 12   textureStdErr           284 non-null    float64
 13   perimeterStdErr         284 non-null    float64
 14   areaStdErr              2

In [6]:
X = train.drop(["ID", "class"], axis=1)
y = train["class"]

X_test = test.drop(["ID"], axis=1)

In [7]:
X.head()

Unnamed: 0,radiusMean,textureMean,perimeterMean,areaMean,smoothnessMean,compactnessMean,concavityMean,concavePointsMean,symmetryMean,fractalDimensionMean,...,radiusWorst,textureWorst,perimeterWorst,areaWorst,smoothnessWorst,compactnessWorst,concavityWorst,concavePointsWorst,symmetryWorst,fractalDimensionWorst
0,14.64,16.85,94.21,666.0,0.08641,0.06698,0.05192,0.02791,0.1409,0.05355,...,16.46,25.44,106.0,831.0,0.1142,0.207,0.2437,0.07828,0.2455,0.06596
1,12.2,15.21,78.01,457.9,0.08673,0.06545,0.01994,0.01692,0.1638,0.06129,...,13.75,21.38,91.11,583.1,0.1256,0.1928,0.1167,0.05556,0.2661,0.07961
2,13.43,19.63,85.84,565.4,0.09048,0.06288,0.05858,0.03438,0.1598,0.05671,...,17.98,29.87,116.6,993.6,0.1401,0.1546,0.2644,0.116,0.2884,0.07371
3,8.597,18.6,54.09,221.2,0.1074,0.05847,0.0,0.0,0.2163,0.07359,...,8.952,22.44,56.65,240.1,0.1347,0.07767,0.0,0.0,0.3142,0.08116
4,14.26,19.65,97.83,629.9,0.07837,0.2233,0.3003,0.07798,0.1704,0.07769,...,15.3,23.73,107.0,709.0,0.08949,0.4193,0.6783,0.1505,0.2398,0.1082


In [8]:
y.head()

0    False
1    False
2     True
3    False
4    False
Name: class, dtype: bool

# Exploration

## Overview

In [11]:
X.isnull().sum().sum()

0

## Plots

In [45]:
# [TODO]

# Pre-processing

## Feature selection

In [46]:
# [TODO]

## Feature scaling

In [47]:
# [TODO]

## Encoding

In [48]:
# [TODO]

# Model selection

In [49]:
cv = 10
verbose = 3

## KNN

In [22]:
parameters = {"n_neighbors":[1, 5, 10, 20, 30, 40, 50], "weights":["uniform", "distance"]}
knn = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=cv, verbose=verbose)
knn.fit(X, y)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] END .....................n_neighbors=1, weights=uniform; total time=   0.0s
[CV] END .....................n_neighbors=1, weights=uniform; total time=   0.0s
[CV] END .....................n_neighbors=1, weights=uniform; total time=   0.0s
[CV] END .....................n_neighbors=1, weights=uniform; total time=   0.0s
[CV] END .....................n_neighbors=1, weights=uniform; total time=   0.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   0.0s
[CV] END ....................n_neighbors=1, weights=distance; total time=   0.0s
[CV] END .....................n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END .....................n_neighbors=5, wei

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 5, 10, 20, 30, 40, 50],
                         'weights': ['uniform', 'distance']},
             verbose=2)

In [23]:
knn_results = pd.DataFrame(knn.cv_results_)
knn_results[["param_n_neighbors", "param_weights", "mean_test_score"]]

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
0,1,uniform,0.34325
1,1,distance,0.34325
2,5,uniform,0.3885
3,5,distance,0.412
4,10,uniform,0.434
5,10,distance,0.4485
6,20,uniform,0.46625
7,20,distance,0.48325
8,30,uniform,0.4835
9,30,distance,0.49425


## Decision Tree

In [18]:
parameters = {"n_estimators":[100, 200, 300, 400, 500], "criterion":["gini", "entropy"], "max_depth":["None", 10, 20]}
dt = GridSearchCV(RandomForestClassifier(), parameters, cv=cv, verbose=verbose)
dt.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=300; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_e

50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\sklearn\ensemble\_forest.py", line 442, in fit
    trees = Parallel(
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\joblib\parallel.py", line 861, in dispa

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': ['None', 10, 20],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=2)

In [19]:
dt_results = pd.DataFrame(dt.cv_results_)
dt_results[["param_n_estimators", "param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score
0,100,gini,,
1,200,gini,,
2,300,gini,,
3,400,gini,,
4,500,gini,,
5,100,gini,10.0,0.49625
6,200,gini,10.0,0.496
7,300,gini,10.0,0.49525
8,400,gini,10.0,0.5055
9,500,gini,10.0,0.50075


## Random Forest

In [20]:
parameters = {"n_estimators":[100, 200, 300, 400, 500], "criterion":["gini", "entropy"], "max_depth":["None", 10, 20]}
rf = GridSearchCV(RandomForestClassifier(), parameters, cv=cv, verbose=verbose)
rf.fit(X, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=100; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=200; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_estimators=300; total time=   0.0s
[CV] END ...criterion=gini, max_depth=None, n_e

50 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\sklearn\ensemble\_forest.py", line 442, in fit
    trees = Parallel(
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\joblib\parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "f:\Programmieren\Python\ml-projects\.venv\lib\site-packages\joblib\parallel.py", line 861, in dispa

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': ['None', 10, 20],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=2)

In [21]:
rf_results = pd.DataFrame(rf.cv_results_)
rf_results[["param_n_estimators", "param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,mean_test_score
0,100,gini,,
1,200,gini,,
2,300,gini,,
3,400,gini,,
4,500,gini,,
5,100,gini,10.0,0.48425
6,200,gini,10.0,0.498
7,300,gini,10.0,0.5025
8,400,gini,10.0,0.4965
9,500,gini,10.0,0.4985


## SVM

In [33]:
parameters = {"kernel":["sigmoid", "rbf"], "C":[1, 5, 10, 20], "gamma":["scale", "auto"]}
svm = GridSearchCV(svm.SVC(), parameters, cv=cv, verbose=verbose)
svm.fit(X, y)

Fitting 10 folds for each of 16 candidates, totalling 160 fits
[CV 1/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.680 total time=   1.5s
[CV 2/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.762 total time=   1.6s
[CV 3/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.682 total time=   1.5s
[CV 4/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.725 total time=   1.5s
[CV 5/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.713 total time=   1.5s
[CV 6/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.693 total time=   1.5s
[CV 7/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.705 total time=   1.5s
[CV 8/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.698 total time=   1.5s
[CV 9/10] END .C=1, gamma=scale, kernel=sigmoid;, score=0.700 total time=   1.5s
[CV 10/10] END C=1, gamma=scale, kernel=sigmoid;, score=0.685 total time=   1.5s
[CV 1/10] END .....C=1, gamma=scale, kernel=rbf;, score=0.675 total time=   2.9s
[CV 2/10] END .....C=1, gamma=scale, kernel=rb

GridSearchCV(cv=10, estimator=SVC(),
             param_grid={'C': [1, 5, 10, 20], 'gamma': ['scale', 'auto'],
                         'kernel': ['sigmoid', 'rbf']},
             verbose=3)

In [34]:
svm_results = pd.DataFrame(svm.cv_results_)
svm_results = svm_results[["param_C", "param_kernel", "param_gamma", "mean_test_score"]]
svm_results.sort_values(["mean_test_score"], ascending=False)

Unnamed: 0,param_C,param_kernel,param_gamma,mean_test_score
5,5,rbf,scale,0.7135
9,10,rbf,scale,0.7135
13,20,rbf,scale,0.7135
11,10,rbf,auto,0.70625
15,20,rbf,auto,0.70625
7,5,rbf,auto,0.70475
10,10,sigmoid,auto,0.70475
0,1,sigmoid,scale,0.70425
14,20,sigmoid,auto,0.7015
6,5,sigmoid,auto,0.698


# Final model

In [98]:
model = svm.SVC(C=5, kernel="rbf", gamma="scale")
model.fit(X, y)

SVC(C=5)

In [99]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = model.predict(X_test)

In [61]:
predictions.to_csv("location_prediction.csv", index=False)