In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [28]:
cancer_data = pd.read_csv('../dataset/Breast_cancer_data.csv')

In [29]:
# print first 5 rows of the dataset
cancer_data.head()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0
1,20.57,17.77,132.9,1326.0,0.08474,0
2,19.69,21.25,130.0,1203.0,0.1096,0
3,11.42,20.38,77.58,386.1,0.1425,0
4,20.29,14.34,135.1,1297.0,0.1003,0


In [30]:
# number of rows and columns in the dataset
cancer_data.shape

(569, 6)

In [32]:
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   mean_radius      569 non-null    float64
 1   mean_texture     569 non-null    float64
 2   mean_perimeter   569 non-null    float64
 3   mean_area        569 non-null    float64
 4   mean_smoothness  569 non-null    float64
 5   diagnosis        569 non-null    int64  
dtypes: float64(5), int64(1)
memory usage: 26.8 KB


In [33]:
# checking for missing values
cancer_data.isnull().sum()

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [34]:
# statistical measures about the data
cancer_data.describe()

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
count,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.0
50%,13.37,18.84,86.24,551.1,0.09587,1.0
75%,15.78,21.8,104.1,782.7,0.1053,1.0
max,28.11,39.28,188.5,2501.0,0.1634,1.0


In [35]:
# checking the distribution of Target Variable
cancer_data['diagnosis'].value_counts()

diagnosis
1    357
0    212
Name: count, dtype: int64

In [38]:
X = cancer_data.drop(columns='diagnosis', axis=1)
Y = cancer_data['diagnosis']

In [39]:
print(X)

     mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness
0          17.99         10.38          122.80     1001.0          0.11840
1          20.57         17.77          132.90     1326.0          0.08474
2          19.69         21.25          130.00     1203.0          0.10960
3          11.42         20.38           77.58      386.1          0.14250
4          20.29         14.34          135.10     1297.0          0.10030
..           ...           ...             ...        ...              ...
564        21.56         22.39          142.00     1479.0          0.11100
565        20.13         28.25          131.20     1261.0          0.09780
566        16.60         28.08          108.30      858.1          0.08455
567        20.60         29.33          140.10     1265.0          0.11780
568         7.76         24.54           47.92      181.0          0.05263

[569 rows x 5 columns]


In [40]:
print(Y)


0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: diagnosis, Length: 569, dtype: int64


In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [42]:
print(X.shape, X_train.shape, X_test.shape)

(569, 5) (455, 5) (114, 5)


In [43]:
model = LogisticRegression()

In [44]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

In [None]:
Accuracy Score

In [45]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [46]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9120879120879121


In [47]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [48]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9035087719298246


In [52]:
input_data = (62,0,0,140,268)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]== 0):
  print('The Person does not have Breast cancer')
else:
  print('The Person has Breast Cancer')

[1]
The Person has Breast Cancer


In [53]:
import pickle

In [55]:
filename = 'breast_cancer_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [56]:
for column in X.columns:
  print(column)

mean_radius
mean_texture
mean_perimeter
mean_area
mean_smoothness
