<a href="https://colab.research.google.com/github/datascience-uniandes/classification_tutorial/blob/master/cancer/cancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification: Estimate if a brain tumor is malignant or benign

MINE-4101: Applied Data Science  
Universidad de los Andes  
  
Last update: October, 2023

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

### Reading the dataset

In [None]:
# Loading the dataset from a comma-separated file (path is relative to the working directory)
cancer_df = pd.read_csv("./data/cancer.csv")

In [None]:
# Dropping the unused "Unnamed: 32" column.
# drop(..., errors="ignore") returns a new frame and does nothing when the column is
# already gone, so re-running this cell no longer raises KeyError.
cancer_df = cancer_df.drop(columns=["Unnamed: 32"], errors="ignore")

In [None]:
# Size of the dataset as (rows, columns)
cancer_df.shape

In [None]:
# Data type of each column
cancer_df.dtypes

In [None]:
# Preview of the first rows of the dataset
cancer_df.head()

In [None]:
# Summary statistics of the dataset columns
cancer_df.describe()

In [None]:
# Pairwise correlation between the numeric columns.
# numeric_only=True is needed because the frame still contains the string-valued
# `diagnosis` column: pandas >= 2.0 raises on non-numeric data instead of silently
# dropping it.
cancer_df.corr(numeric_only=True)

### Splitting train and test datasets

In [None]:
# Feature names: every column after the first two.
# NOTE(review): presumably the first two columns are an identifier and the `diagnosis`
# target - confirm against the CSV, since the selection is positional.
features = cancer_df.columns[2:].tolist()

In [None]:
# Feature matrix: all rows, restricted to the selected feature columns
X = cancer_df.loc[:, features]

In [None]:
# Creating target: encode the diagnosis as 0 ("B") / 1 ("M").
# Series.map is used instead of replace: swapping strings for ints through replace relies
# on implicit object -> int downcasting, which pandas 2.2 deprecates (FutureWarning).
# Any label outside the mapping becomes NaN.
Y = cancer_df["diagnosis"].map({"B": 0, "M": 1})

In [None]:
# Relative frequency of each class in the full target
Y.value_counts(normalize=True)

In [None]:
# Splitting into training (70%) and test (30%) sets; stratifying on the target keeps
# the class proportions similar in both splits, and the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=500,
)

In [None]:
# Relative frequency of each class in the training target
Y_train.value_counts(normalize=True)

In [None]:
# Relative frequency of each class in the test target
Y_test.value_counts(normalize=True)

### Training the model

In [None]:
# Standardizer for the feature matrix; it is fitted on the training split in the next step
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training split only and standardize that split in one call
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
# Setting up an L2-penalized logistic regression with balanced class weights
# and a fixed seed for repeatability
model = LogisticRegression(
    penalty="l2",
    C=0.1,
    solver="liblinear",
    class_weight="balanced",
    random_state=80,
)

In [None]:
# Fitting the model on the standardized training features and their labels
model.fit(X_train_scaled, Y_train)

In [None]:
# One row per feature with its fitted coefficient.
# The table is built from a dict so that `coefficient` stays numeric; stacking the feature
# names and the coefficients into a single NumPy array coerces the numbers to strings,
# which breaks sorting and arithmetic on that column.
pd.DataFrame({"feature": features, "coefficient": model.coef_.reshape(-1)})

<span style="color: red;">Q: How should the coefficients of a Logistic Regression model be interpreted?</span>

### Evaluating the model

In [None]:
# Scaling the test features with the already-fitted scaler, then predicting their classes
X_test_scaled = scaler.transform(X_test)
predictions = model.predict(X_test_scaled)

In [None]:
# Plotting the confusion matrix of true vs. predicted test labels (0 = "B", 1 = "M")
ConfusionMatrixDisplay.from_predictions(Y_test, predictions)

In [None]:
# Reporting the metrics derived from the confusion matrix, one per line
for label, metric in (("Precision:", precision_score), ("Recall:", recall_score), ("F1", f1_score)):
    print(label, metric(Y_test, predictions))

### Analyzing probabilities

In [None]:
# Estimated class probabilities for the scaled test set; only column 1 is kept
# NOTE(review): column 1 is presumably the probability of label 1 ("M") - confirm via model.classes_
X_test_scaled = scaler.transform(X_test)
probabilities = model.predict_proba(X_test_scaled)[:, 1]

In [None]:
# Two-column table pairing each estimated probability with the true label of the same test row
probs_column = probabilities.reshape(-1, 1)
true_column = Y_test.values.reshape(-1, 1)
probs_true_df = pd.DataFrame(
    np.append(probs_column, true_column, axis=1),
    columns=["probs", "true"],
)

In [None]:
# Density of the estimated probability, drawn separately for each true class.
# `true` follows the target encoding (0 = "B", 1 = "M"), so each curve is labelled with
# the class it actually contains; the labels were previously swapped.
plt.figure(figsize=(15, 9))
sns.kdeplot(data=probs_true_df.loc[probs_true_df["true"] == 1], x="probs", label="M")
sns.kdeplot(data=probs_true_df.loc[probs_true_df["true"] == 0], x="probs", label="B")
# Reference line at the 0.5 probability threshold
plt.axvline(x=.5, color="r", linestyle="--")
plt.title('Probabilities estimated by class')
plt.legend()
plt.show()