In [4]:
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [5]:
# File Paths
INPUT_PATH = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
OUTPUT_PATH = "breast-cancer-wisconsin.csv"

In [6]:
# Headers
HEADERS = ["CodeNumber", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", "MarginalAdhesion",
           "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "CancerType"]

In [9]:
# Load the dataset into Pandas data frame
dataset = pd.read_csv(INPUT_PATH)

# Add the headers to the loaded dataset
dataset.columns = HEADERS

# Save the loaded dataset into csv format
dataset.to_csv(OUTPUT_PATH, index=False)
print ("File saved ...!")

File saved ...!


In [10]:
# Load the csv file into pandas dataframe
dataset = pd.read_csv(OUTPUT_PATH)

In [12]:
# Get basic statistics of the loaded dataset
print (dataset.describe())

         CodeNumber  ClumpThickness  UniformityCellSize  UniformityCellShape  \
count  6.980000e+02      698.000000          698.000000           698.000000   
mean   1.071807e+06        4.416905            3.137536             3.210602   
std    6.175323e+05        2.817673            3.052575             2.972867   
min    6.163400e+04        1.000000            1.000000             1.000000   
25%    8.702582e+05        2.000000            1.000000             1.000000   
50%    1.171710e+06        4.000000            1.000000             1.000000   
75%    1.238354e+06        6.000000            5.000000             5.000000   
max    1.345435e+07       10.000000           10.000000            10.000000   

       MarginalAdhesion  SingleEpithelialCellSize  BlandChromatin  \
count        698.000000                698.000000      698.000000   
mean           2.809456                  3.217765        3.438395   
std            2.856606                  2.215408        2.440056   
min

In [17]:
# Filter missing values
dataset = dataset[dataset[HEADERS[6]] != '?']
train_x, test_x, train_y, test_y = train_test_split(dataset[HEADERS[1:-1]], dataset[HEADERS[-1]],test_size=0.3)

In [18]:
# Train and Test dataset size details
print ("Train_x Shape :: ", train_x.shape)
print ("Train_y Shape :: ", train_y.shape)
print ("Test_x Shape :: ", test_x.shape)
print ("Test_y Shape :: ", test_y.shape)

Train_x Shape ::  (477, 9)
Train_y Shape ::  (477,)
Test_x Shape ::  (205, 9)
Test_y Shape ::  (205,)


In [19]:
# Create random forest classifier instance
clf = RandomForestClassifier()
trained_model = clf.fit(train_x, train_y)
print ("Trained model :: ", trained_model)
predictions = trained_model.predict(test_x)

Trained model ::  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [22]:
for i in range(0, 5):
    print ("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))
print ("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
print ("Test Accuracy  :: ", accuracy_score(test_y, predictions))
print (" Confusion matrix ", confusion_matrix(test_y, predictions))

Actual outcome :: 4 and Predicted outcome :: 4
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 2 and Predicted outcome :: 2
Actual outcome :: 4 and Predicted outcome :: 4
Train Accuracy ::  0.9958071278825996
Test Accuracy  ::  0.9463414634146341
 Confusion matrix  [[128   4]
 [  7  66]]
