# Breast Cancer Biopsy Classifier

# Dataset Information
The dataset used to train the model is the Breast Cancer Wisconsin (Diagnostic) Data Set. It consists of 569 biopsy samples, each labeled as malignant (M) or benign (B).

In [None]:
import pandas as pd
from sklearn import metrics

data = pd.read_csv('files/cancer.csv')
# Encode the label numerically: Malignant ('M') -> 1, Benign ('B') -> 0, to simplify data visualization.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form
# is not guaranteed to modify `data` (it is a no-op under pandas Copy-on-Write).
data['diagnosis'] = data['diagnosis'].replace({'M': 1, 'B': 0})
# Persist the encoded copy; the later cells read 'cancer.csv' from the working directory.
data.to_csv('cancer.csv')
del data

In [None]:
import os             
import numpy as np    
import pandas as pd   
from sklearn.metrics import accuracy_score

In [None]:
# Load the encoded CSV and keep only the label plus the features explored below
data_path = 'cancer.csv'

dataframe = pd.read_csv(data_path)[[
    'diagnosis',
    'perimeter_mean',
    'radius_mean',
    'texture_mean',
    'area_mean',
    'smoothness_mean',
    'concavity_mean',
    'symmetry_mean',
]]

# Human-readable version of the label, used as the category axis in the plots
dataframe['diagnosis_cat'] = (
    dataframe['diagnosis']
    .astype('category')
    .map({1: '1 (malignant)', 0: '0 (benign)'})
)

dataframe.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
# Categorical plot of mean perimeter for each diagnosis group (malignant listed first)
sns.catplot(
    data=dataframe,
    x='perimeter_mean',
    y='diagnosis_cat',
    order=['1 (malignant)', '0 (benign)'],
)
dataframe.head()

Lower perimeter_mean seems to correlate with benign tumors. However, the perimeter_mean values of the two classes overlap, so it is important to consider other factors as well.

Lower radius_mean correlates with benign tumors, similar to perimeter. 

In [None]:
# Draw one categorical plot per remaining candidate feature against the diagnosis.
# A single loop replaces six copy-pasted calls; the feature order is unchanged
# (radius_mean is plotted last).
for feature in ['texture_mean', 'area_mean', 'concavity_mean',
                'smoothness_mean', 'symmetry_mean', 'radius_mean']:
    sns.catplot(x=feature, y='diagnosis_cat', data=dataframe,
                order=['1 (malignant)', '0 (benign)'])


Mean radius (radius_mean) appears to have the strongest correlation with the diagnosis, separating the two classes most cleanly.

In [None]:
# Candidate decision threshold on radius_mean, drawn as a vertical green line
boundary = 15

sns.catplot(
    data=dataframe,
    x='radius_mean',
    y='diagnosis_cat',
    order=['1 (malignant)', '0 (benign)'],
)
plt.plot([boundary] * 2, [-.2, 1.2], 'g', linewidth=2)

In [None]:
def boundary_classifier(target_boundary, radius_mean_series):
    """Classify each radius by comparing it against a fixed threshold.

    Args:
        target_boundary: Threshold value; radii strictly greater than it are
            labeled 1, all others 0.
        radius_mean_series: Iterable of radius values to classify.

    Returns:
        A list of 0/1 labels, one per input value, in input order.
    """
    return [1 if radius > target_boundary else 0 for radius in radius_mean_series]

chosen_boundary = 15

# True labels, and the threshold-based predictions stored alongside them
y_true = dataframe['diagnosis']
y_pred = boundary_classifier(chosen_boundary, dataframe['radius_mean'])
dataframe['predicted'] = y_pred

# Color each point by its predicted class and overlay the threshold line
sns.catplot(
    data=dataframe,
    x='radius_mean',
    y='diagnosis_cat',
    hue='predicted',
    order=['1 (malignant)', '0 (benign)'],
)
plt.plot([chosen_boundary] * 2, [-.2, 1.2], 'g', linewidth=2)


In [None]:
# Show the true labels and the predictions on consecutive lines
print(list(y_true), y_pred, sep='\n')

# Fraction of samples whose prediction matches the true label
accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
print(accuracy)

Accuracy is expected to improve as more input (x) variables are introduced.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    dataframe,
    random_state=1,
    test_size=0.2,
)

In [None]:
# Report the size of the training split and preview its first rows
print('Number of rows in training dataframe:', len(train_df))
train_df.head()

In [None]:
# Report the size of the test split and preview its first rows
print('Number of rows in test dataframe:', len(test_df))
test_df.head()

In [None]:
# Input feature(s) and target column for the single-variable model
X = ['radius_mean']
y = 'diagnosis'

X_train = train_df[X]
y_train = train_df[y]

print('X_train, our input variables:')
print(X_train.head(), end='\n\n')  # blank line between the two previews

print('y_train, our output variable:')
print(y_train.head())

In [None]:
from sklearn import linear_model
# Held-out inputs and labels, selected with the same columns as the training data
X_test = test_df[X]
y_test = test_df[y]

# Fit a logistic regression on the training split, then predict the test split
logreg_model = linear_model.LogisticRegression().fit(X_train, y_train)
y_pred = logreg_model.predict(X_test)

In [None]:
# Store the model's predictions on the test rows and color the plot by them
test_df['predicted'] = np.squeeze(y_pred)
sns.catplot(
    data=test_df,
    x=X[0],
    y='diagnosis_cat',
    hue='predicted',
    order=['1 (malignant)', '0 (benign)'],
)

In [None]:
# Fraction of test samples the model labeled correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# Per-class probabilities for every test row; the plot uses column index 1
y_prob = logreg_model.predict_proba(X_test)
X_test_view = X_test[X].values.squeeze()

# Axis labels are set first; the scatter is then drawn on the current axes
plt.xlabel('radius_mean')
plt.ylabel('Predicted Probability')
sns.scatterplot(
    x=X_test_view,
    y=y_prob[:, 1],
    hue=y_test,
    palette=['orange', 'blue'],
)

The colors represent the true diagnosis, while the y axis represents the predicted probability (for class 1, malignant).

In [None]:
multi_X = ['radius_mean', 'perimeter_mean', 'smoothness_mean']
y = 'diagnosis'

# 1. Split data into train and test
multi_train_df, multi_test_df = train_test_split(
    dataframe,
    random_state=5,
    test_size=0.2,
)

# 2. Extract the feature columns and the label column from each split
X_train, y_train = multi_train_df[multi_X], multi_train_df[y]
X_test, y_test = multi_test_df[multi_X], multi_test_df[y]

# 3-4. Initialize the model object and fit it to the training data
logreg_model = linear_model.LogisticRegression().fit(X_train, y_train)

# 5. Use this trained model to predict on the test data
y_pred = logreg_model.predict(X_test)

# 6. Evaluate the accuracy by comparing to the test labels and print it out
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:

from sklearn import metrics

class_names = [0, 1]  # 0 = benign, 1 = malignant

# Passing labels= pins the matrix to one row/column per class even if a class
# happens to be absent from this test split.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred, labels=class_names)

# Label the matrix itself so the heatmap takes its tick labels from the class
# names. Ticks set through plt.xticks/plt.yticks before sns.heatmap are replaced
# by the heatmap's own ticks, so those calls had no effect.
cnf_df = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)

fig, ax = plt.subplots()
sns.heatmap(cnf_df, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")  # put the "Predicted" label above the matrix
fig.tight_layout()
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual diagnosis')
ax.set_xlabel('Predicted diagnosis')

In [None]:
print(cnf_matrix)
# Flatten the 2x2 matrix row by row: first row -> (tn, fp), second row -> (fn, tp)
tn, fp, fn, tp = cnf_matrix.ravel()
print("True-Negative, False-Positive, False-Negative, True-Positive:", tn, fp, fn, tp)