# Breast Cancer Classifier

# Import essential libraries 

In [None]:
# import libraries
import pandas as pd # for data manupulation or analysis
import matplotlib.pyplot as plt # for data visualization
import seaborn as sns # for data visualization

# Data Load

In [None]:
# Load the breast cancer dataset from 'data.csv' into a DataFrame.
# The path is relative, so the file must be in the notebook's working directory.
df = pd.read_csv('data.csv')

# Data Manipulation

In [None]:
# Preview the first rows of the loaded DataFrame
df.head()

In [None]:
# Column labels of the dataset (for a DataFrame, keys() returns its columns)
df.keys()

In [None]:
# Target column: the diagnosis value (malignant or benign) recorded for each row
df['diagnosis']

In [None]:
# Summary of the DataFrame: column names, non-null counts and dtypes
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

# Data Visualization

In [None]:
# Pairwise relationships between five of the mean-value features,
# with points coloured by the diagnosis column
sns.pairplot(
    data=df,
    vars=[
        'radius_mean',
        'texture_mean',
        'perimeter_mean',
        'area_mean',
        'smoothness_mean',
    ],
    hue='diagnosis',
)

In [None]:
# Count the rows in each target class.
# The column is passed through the x=/data= keywords: the first positional
# parameter of countplot is `data`, so handing it a Series positionally is
# deprecated or misread, depending on the seaborn version.
sns.countplot(x='diagnosis', data=df)

In [None]:
# Count plot of the radius_mean feature (one bar per distinct value).
# The column is passed through the x=/data= keywords: the first positional
# parameter of countplot is `data`, so handing it a Series positionally is
# deprecated or misread, depending on the seaborn version.
# NOTE(review): radius_mean looks like a continuous measurement; if so, a
# histogram would be more readable than one bar per value -- confirm intent.
sns.countplot(x='radius_mean', data=df)

In [None]:
# Build an automated exploratory report for the whole DataFrame.
# This runs before the 'id' column is dropped, so that column is part of the report.
from pandas_profiling import ProfileReport

profile = ProfileReport(df, title="Pandas Profiling Report")
# Render the report inside the notebook, first as widgets, then as an iframe
profile.to_widgets()
profile.to_notebook_iframe()
# Also write the report to an HTML file in the working directory
profile.to_file("your_report.html")

# Removing Less Significant Columns

In [None]:
# Remove the 'id' column in place; it is a row identifier, not a predictor.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'id' has already been dropped.
df.drop(columns=['id'], inplace=True, errors='ignore')
# Also discard any column that holds no data at all. Such a column would
# propagate NaN into the feature matrix and make the estimators below fail.
# NOTE(review): some distributions of this CSV reportedly carry an empty
# trailing column; this is a no-op when there is none.
df.dropna(axis=1, how='all', inplace=True)

# Split DataFrame into Train and Test

In [None]:
# Separate the label column from the predictor columns.
# drop() returns a new DataFrame, so df itself is left untouched.
target = df['diagnosis']
features = df.drop(columns=['diagnosis'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable from run to run
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=5,
)

In [None]:
# Display the training feature matrix
X_train

In [None]:
# Display the held-out test feature matrix
X_test

In [None]:
# Display the training labels
y_train

In [None]:
# Display the held-out test labels
y_test

# Feature scaling 

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# split only and then applies that same transformation to both splits, so no
# information from the test set leaks into preprocessing.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

# Machine Learning Model Building

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

## Support vector Classifier

In [None]:
# Support vector classifier with default hyper-parameters, fitted on the raw
# (unscaled) features
from sklearn.svm import SVC
svc_classifier = SVC().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_scv = svc_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_scv)

In [None]:
# Same support vector classifier, this time fitted on the standardized features
svc_classifier2 = SVC().fit(X_train_sc, y_train)  # fit() returns the estimator itself
y_pred_svc_sc = svc_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_svc_sc)

# Logistic Regression

In [None]:
# Logistic regression with default hyper-parameters, fitted on the raw
# (unscaled) features
# NOTE(review): with default solver settings this fit may stop before
# converging on unscaled inputs -- check the cell output for a ConvergenceWarning.
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_lr = lr_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_lr)

In [None]:
# Same logistic regression, this time fitted on the standardized features
lr_classifier2 = LogisticRegression().fit(X_train_sc, y_train)  # fit() returns the estimator itself
y_pred_lr_sc = lr_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_lr_sc)

In [None]:
# Confusion matrix of the logistic regression trained on standardized features.
# The class order is fixed explicitly so the tick labels are guaranteed to
# line up with the matrix rows and columns.
class_labels = sorted(set(y_test) | set(y_pred_lr_sc))
cm = confusion_matrix(y_test, y_pred_lr_sc, labels=class_labels)
plt.title('Heatmap of Confusion Matrix', fontsize = 15)
# fmt='d' prints the cells as whole counts; the default '.2g' format switches
# to scientific notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
# confusion_matrix puts the true classes on rows and the predictions on columns
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# K – Nearest Neighbor Classifier

In [None]:
# K-nearest-neighbours classifier with default hyper-parameters, fitted on the
# raw (unscaled) features
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_knn = knn_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_knn)

In [None]:
# Same K-nearest-neighbours classifier, this time fitted on the standardized features
knn_classifier2 = KNeighborsClassifier().fit(X_train_sc, y_train)  # fit() returns the estimator itself
y_pred_knn_sc = knn_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_knn_sc)

# Naive Bayes Classifier

In [None]:
# Gaussian naive Bayes classifier fitted on the raw (unscaled) features
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB().fit(X_train, y_train)  # fit() returns the estimator itself
y_pred_nb = nb_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_nb)

In [None]:
# Same Gaussian naive Bayes classifier, this time fitted on the standardized features
nb_classifier2 = GaussianNB().fit(X_train_sc, y_train)  # fit() returns the estimator itself
y_pred_nb_sc = nb_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_true=y_test, y_pred=y_pred_nb_sc)

# Decision Tree Classifier

In [None]:
# Decision tree classifier fitted on the raw (unscaled) features
from sklearn.tree import DecisionTreeClassifier
# Tree construction is randomized, so without a seed the accuracy below can
# differ between runs. The seed matches the one used for train_test_split.
dt_classifier = DecisionTreeClassifier(random_state=5)
dt_classifier.fit(X_train, y_train)
y_pred_dt = dt_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred_dt)

In [None]:
# Decision tree classifier fitted on the standardized features.
# Tree construction is randomized, so without a seed the accuracy below can
# differ between runs. The seed matches the one used for train_test_split.
dt_classifier2 = DecisionTreeClassifier(random_state=5)
dt_classifier2.fit(X_train_sc, y_train)
y_pred_dt_sc = dt_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred_dt_sc)

# Random Forest Classifier

In [None]:
# Random forest classifier fitted on the raw (unscaled) features
from sklearn.ensemble import RandomForestClassifier
# The forest draws bootstrap samples and random feature subsets, so without a
# seed the accuracy below changes from run to run. The seed matches the one
# used for train_test_split.
rf_classifier = RandomForestClassifier(random_state=5)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred_rf)

In [None]:
# Random forest classifier fitted on the standardized features.
# The forest draws bootstrap samples and random feature subsets, so without a
# seed the accuracy below changes from run to run. The seed matches the one
# used for train_test_split.
rf_classifier2 = RandomForestClassifier(random_state=5)
rf_classifier2.fit(X_train_sc, y_train)
y_pred_rf_sc = rf_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred_rf_sc)

In [None]:
# Confusion matrix of the random forest trained on standardized features.
# The class order is fixed explicitly so the tick labels are guaranteed to
# line up with the matrix rows and columns.
class_labels = sorted(set(y_test) | set(y_pred_rf_sc))
cm = confusion_matrix(y_test, y_pred_rf_sc, labels=class_labels)
plt.title('Heatmap of Confusion Matrix', fontsize = 15)
# fmt='d' prints the cells as whole counts; the default '.2g' format switches
# to scientific notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
# confusion_matrix puts the true classes on rows and the predictions on columns
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

# AdaBoost Classifier

In [None]:
# AdaBoost classifier fitted on the raw (unscaled) features
from sklearn.ensemble import AdaBoostClassifier
# Seeded so the fitted ensemble and the accuracy below are repeatable.
# The seed matches the one used for train_test_split.
adb_classifier = AdaBoostClassifier(random_state=5)
adb_classifier.fit(X_train, y_train)
y_pred_adb = adb_classifier.predict(X_test)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred_adb)

In [None]:
# AdaBoost classifier fitted on the standardized features. This is the model
# that is later saved to disk, so it is seeded to make the saved artifact
# repeatable. The seed matches the one used for train_test_split.
adb_classifier2 = AdaBoostClassifier(random_state=5)
adb_classifier2.fit(X_train_sc, y_train)
y_pred_adb_sc = adb_classifier2.predict(X_test_sc)
# Fraction of test rows classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred_adb_sc)

In [None]:
# Confusion matrix of the AdaBoost model trained on standardized features.
# It uses y_pred_adb_sc (not y_pred_adb) so the plot describes the same
# scaled-data model as the other heatmaps and as the model saved to disk.
class_labels = sorted(set(y_test) | set(y_pred_adb_sc))
cm = confusion_matrix(y_test, y_pred_adb_sc, labels=class_labels)
plt.title('Heatmap of Confusion Matrix', fontsize = 15)
# fmt='d' prints the cells as whole counts; the default '.2g' format switches
# to scientific notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
# confusion_matrix puts the true classes on rows and the predictions on columns
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()

#### Conclusion: Logistic Regression, Random Forest Classifier and AdaBoost Classifier give the best accuracy of 97.36%

# Save AdaBoost Classifier model using Pickle

In [None]:
## Pickle
import pickle

model_path = 'breast_cancer_detector.pickle'

# save model
# The context manager flushes and closes the file even if dump() raises;
# a bare open() would leave the handle open.
with open(model_path, 'wb') as model_file:
    pickle.dump(adb_classifier2, model_file)

# load model
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# you created yourself or otherwise trust.
with open(model_path, 'rb') as model_file:
    breast_cancer_detector_model = pickle.load(model_file)

# predict the output
# adb_classifier2 was fitted on standardized features, so it must be fed
# standardized input.
# NOTE(review): only the classifier is persisted here; scoring new data later
# also needs the fitted scaler (`sc`) -- consider saving it alongside.
y_pred = breast_cancer_detector_model.predict(X_test_sc)

# confusion matrix
# (the saved model is the AdaBoost classifier, so the messages name it as such)
print('Confusion matrix of AdaBoost model: \n',confusion_matrix(y_test, y_pred),'\n')

# show the accuracy
print('Accuracy of AdaBoost model = ',accuracy_score(y_test, y_pred))