In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

# Load dataset
df = pd.read_csv("breast-cancer.csv")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Dropping the 'id' column
df.drop(['id'], axis=1, inplace=True)

# Check for null values (if any)
print(df.isnull().sum())

# Splitting the dataset into features and target variable
X = df[['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 
        'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 
        'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 
        'fractal_dimension_worst']]
Y = df['diagnosis']

# Splitting data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)


diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64


In [5]:
# Initialize and train the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, criterion='gini', oob_score=True)
model.fit(X_train, Y_train)

# Model Evaluation
print("Model Accuracy on Test Data:", model.score(X_test, Y_test))


Model Accuracy on Test Data: 0.9473684210526315


In [6]:
# Save the trained model using pickle
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# This file can now be loaded in a Streamlit app or another Python script


In [7]:
# Example: Making a prediction with a new data point
patient = [14, 16, 89, 575, 0.1023, 0.05, 0.04, 0.234, 0.186, 14.98, 22, 99, 654.24, 0.3, 0.2, 0.1, 0.5, 0.6, 0.001345]
patient1 = np.array([patient])
prediction = model.predict(patient1)
print("Prediction for the patient:", prediction)

# Predicting on the test set
y_pred = model.predict(X_test)
print("Accuracy on Test Data:", accuracy_score(y_pred, Y_test))


Prediction for the patient: ['B']
Accuracy on Test Data: 0.9473684210526315


