In [52]:
import pandas as pd
import numpy as np
import math

## **Loading the Dataset**

In [53]:
bc_dataset = pd.read_csv('breast_cancer_wisconsin_csv.csv', index_col = False, names = ['id', 'clump_thickness', 'cell_size_uniformity', 'cell_shape_uniformity', 'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei', 'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class'])
bc_dataset

Unnamed: 0,id,clump_thickness,cell_size_uniformity,cell_shape_uniformity,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [54]:
#Column data types
bc_dataset.dtypes

id                              int64
clump_thickness                 int64
cell_size_uniformity            int64
cell_shape_uniformity           int64
marginal_adhesion               int64
single_epithelial_cell_size     int64
bare_nuclei                    object
bland_chromatin                 int64
normal_nucleoli                 int64
mitoses                         int64
class                           int64
dtype: object

## **Data Cleaning and Transformation**

In [55]:
#According to the dataset description that missing values are denoted by '?'.
#Check for missing values and replace '?' with column mean. 
#There are 16 missing values in the dataset.

missing_values_count = 0
for col in bc_dataset:
    for row in bc_dataset.index:
        if bc_dataset.loc[row, col] == '?':
            missing_values_count += 1
    bc_dataset[col] = pd.to_numeric(bc_dataset[col], errors='coerce', downcast='integer')
    bc_dataset[col] = bc_dataset[col].replace(np.nan, math.floor(bc_dataset[col].mean())) 

print('Number of missing values: ', missing_values_count)
print('Shape: ', bc_dataset.shape)
print('Data Types\n', bc_dataset.dtypes)

Number of missing values:  16
Shape:  (699, 11)
Data Types
 id                               int32
clump_thickness                   int8
cell_size_uniformity              int8
cell_shape_uniformity             int8
marginal_adhesion                 int8
single_epithelial_cell_size       int8
bare_nuclei                    float64
bland_chromatin                   int8
normal_nucleoli                   int8
mitoses                           int8
class                             int8
dtype: object


## **Identifying X and Y features**

In [56]:
x = bc_dataset.drop(['id', 'class'], axis=1)
y = bc_dataset['class']

## **Data Standardization/Normalization**

In [57]:
from sklearn.preprocessing import StandardScaler

cols = x.columns
scaler = StandardScaler()
x = scaler.fit_transform(x)
standardized_dataset = pd.DataFrame(x, columns=cols)
standardized_dataset['class'] = y
standardized_dataset

## **Export Standardized Dataset to CSV**

In [None]:
standardized_dataset.to_csv('breast_cancer_dataset_standardized.csv', index=False)

## **Assigning variables for train and test**

In [58]:
from sklearn.model_selection import train_test_split as tts

X_train, X_test, y_train, y_test = tts(x, y, test_size=0.2)

## **Import Learning Algorithms**

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## **Training the Dataset**

### 1. Logistic Regression

In [60]:
#Logistic Regression
logistic_reg = LogisticRegression(C=1e6)

logistic_reg.fit(X_train, y_train)

logistic_predictions_train = logistic_reg.predict(X_train)
logistic_predictions_test = logistic_reg.predict(X_test)

### 2. Decision Trees

In [61]:
dt_classifier = DecisionTreeClassifier(random_state=0)

dt_classifier.fit(X_train, y_train)

dt_predictions_train = dt_classifier.predict(X_train)
dt_predictions_test = dt_classifier.predict(X_test)

### 3. Random Forest

In [62]:
rf_classifier = RandomForestClassifier(random_state=0)

rf_classifier.fit(X_train, y_train)

rf_predictions_train = rf_classifier.predict(X_train)
rf_predictions_test = rf_classifier.predict(X_test)

## **Performance Summary**

### Import accuracy metric

In [63]:
from sklearn.metrics import accuracy_score

### 1. Logistic Regression

In [64]:
accuracy_logistic_reg_train = accuracy_score(y_true=y_train, y_pred=logistic_predictions_train)
accuracy_logistic_reg_test = accuracy_score(y_true=y_test, y_pred=logistic_predictions_test)
print("Logistic Regression train accuracy: ", accuracy_logistic_reg_train)
print("Logistic Regression test accuracy: ", accuracy_logistic_reg_test)

Logistic Regression train accuracy:  0.9731663685152058
Logistic Regression test accuracy:  0.9571428571428572


### 2. Decision Tree

In [65]:
accuracy_dt_train = accuracy_score(y_true=y_train, y_pred=dt_predictions_train)
accuracy_dt_test = accuracy_score(y_true=y_test, y_pred=dt_predictions_test)
print("Decision Tree train accuracy: ", accuracy_dt_train)
print("Decision Tree test accuracy: ", accuracy_dt_test)

Decision Tree train accuracy:  1.0
Decision Tree test accuracy:  0.9357142857142857


### 3. Random Forest

In [66]:
accuracy_rf_train = accuracy_score(y_true=y_train, y_pred=rf_predictions_train)
accuracy_rf_test = accuracy_score(y_true=y_test, y_pred=rf_predictions_test)
print("Random Forest train accuracy: ", accuracy_rf_train)
print("Random Forest test accuracy: ", accuracy_rf_test)

Random Forest train accuracy:  1.0
Random Forest test accuracy:  0.9571428571428572
