In [67]:
# import libraries
import pandas as pd
import sklearn
import os
import numpy as np
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [68]:
# Resolve the project layout relative to the working directory
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

# Data and model folders sit under the parent of the working directory
# (both keep their trailing "/" so file names can be appended directly)
data_dir = f"{parent_dir}/data/"
model_dir = f"{parent_dir}/model/"

# CSV file holding the cancer dataset
data_file = f"{data_dir}cancer_data.csv"

In [69]:
# Read the cancer dataset CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=data_file)

# Display the loaded frame
data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [70]:
# Drop the "id" and "Unnamed: 32" columns, which are not used as features
data = data.drop(columns=["id", "Unnamed: 32"])

In [71]:
# Normalize every column after the first one with StandardScaler; the first
# column ("diagnosis") is the label and is left untouched.
# NOTE(review): the scaler is fit on the full dataset, before any
# train/test split — confirm this is acceptable for the evaluation below.
scaler = StandardScaler()

# Fit the scaler and write the scaled values back into the same columns
scaled_values = scaler.fit_transform(data.iloc[:, 1:])
data.iloc[:, 1:] = scaled_values

# Expose the normalized data as a data frame
data_normalized = pd.DataFrame(data)

In [72]:
# Separate the target label from the feature columns
y = data_normalized["diagnosis"]
X = data_normalized.drop(columns=["diagnosis"])

In [73]:
# Reduce the features using correlation
# Absolute pairwise correlation between the feature columns
corr_matrix = X.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# feature pair is looked at once; every other cell becomes NaN.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Columns whose correlation with at least one earlier column exceeds 0.9
# (NaN cells compare as False, so the masked-out half is ignored)
to_drop = [column for column in upper.columns if (upper[column] > 0.9).any()]

# Drop the highly correlated features by label
X = X.drop(columns=to_drop)

In [74]:
# Display the feature matrix that remains after dropping highly correlated columns
X

Unnamed: 0,radius_mean,texture_mean,smoothness_mean,compactness_mean,concavity_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,smoothness_worst,compactness_worst,concavity_worst,symmetry_worst,fractal_dimension_worst
0,1.097064,-2.073335,1.568466,3.283515,2.652874,2.217515,2.255747,2.489734,-0.565265,-0.214002,1.316862,0.724026,0.660820,1.148757,0.907083,1.307686,2.616665,2.109526,2.750622,1.937015
1,1.829821,-0.353632,-0.826962,-0.487072,-0.023846,0.001392,-0.868652,0.499255,-0.876244,-0.605351,-0.692926,-0.440780,0.260162,-0.805450,-0.099444,-0.375612,-0.430444,-0.146749,-0.243890,0.281190
2,1.579888,0.456187,0.942210,1.052926,1.363478,0.939685,-0.398008,1.228676,-0.780083,-0.297005,0.814974,0.213076,1.424827,0.237036,0.293559,0.527407,1.082932,0.854974,1.152255,0.201391
3,-0.768909,0.253732,3.283553,3.402909,1.915897,2.867383,4.910919,0.326373,-0.110409,0.689702,2.744280,0.819518,1.115007,4.732680,2.047511,3.394275,3.893397,1.989588,6.046041,4.935010
4,1.750297,-1.151816,0.280372,0.539340,1.371011,-0.009560,-0.562450,1.270543,-0.790244,1.483067,-0.048520,0.828471,1.144205,-0.361092,0.499328,0.220556,-0.313395,0.613179,-0.868353,-0.397100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,2.110995,0.721473,1.041842,0.219060,1.947285,-0.312589,-0.931027,2.782080,0.071025,1.086384,0.191805,0.666001,2.067178,-1.138416,0.167980,0.378365,-0.273318,0.664512,-1.360158,-0.709091
565,1.704854,2.085134,0.102458,-0.017833,0.693043,-0.217664,-1.058611,1.300499,2.260938,-0.424010,-0.069758,0.252202,0.808431,-0.189161,-0.490556,-0.691230,-0.394820,0.236573,-0.531855,-0.973978
566,0.702284,2.045574,-0.840484,-0.038680,0.046588,-0.809117,-0.895587,0.184892,-0.257371,-0.379342,0.661277,0.510827,0.612157,-0.891416,0.036727,-0.809587,0.350735,0.326767,-1.104549,-0.318409
567,1.838341,2.336457,1.525767,3.272144,3.296944,2.137194,1.043695,1.157935,0.686088,-0.173000,2.017716,1.302285,0.785721,0.326634,0.904057,1.430427,3.904848,3.197605,1.919083,2.219635


In [75]:
# Split the data into training and testing sets (20% held out for testing);
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create a random forest classifier with 100 trees and a fixed seed
clf_rnd = RandomForestClassifier(n_estimators=100, random_state=0)

# Train the classifier
clf_rnd.fit(X_train, y_train)

# Test the classifier (score on the held-out set)
score_rnd = clf_rnd.score(X_test, y_test)

# Classification Report
# Report on this model's own predictions (y_pred_rnd); no `y_pred` variable
# is defined anywhere in this notebook.
y_pred_rnd = clf_rnd.predict(X_test)
print(classification_report(y_test, y_pred_rnd))

              precision    recall  f1-score   support

           B       0.93      0.94      0.93        67
           M       0.91      0.89      0.90        47

    accuracy                           0.92       114
   macro avg       0.92      0.92      0.92       114
weighted avg       0.92      0.92      0.92       114



In [76]:
# Save the model as a pickle file
# The target path has to be defined before the file is opened
model_file = model_dir + "cancer_model.pkl"

# Make sure the model directory exists so open() does not fail on a fresh checkout
os.makedirs(model_dir, exist_ok=True)

with open(model_file, 'wb') as file:
    pickle.dump(clf_rnd, file)