In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# os.cpu_count() returns None when the CPU count cannot be determined;
# fall back to a single core so the multiplication below cannot raise TypeError.
n_cpu = os.cpu_count() or 1
# Worker count passed as n_jobs to every GridSearchCV in the model cells.
n_thread = n_cpu * 2

In [3]:
df = pd.read_csv('/kaggle/input/breast-cancer/Breast_Cancer.csv')

In [4]:
df.head()

In [5]:
df.info()

In [6]:
df.describe()

In [7]:
df['T Stage '].value_counts()

In [8]:
df['N Stage'].value_counts()

In [9]:
df['6th Stage'].value_counts()

In [10]:
df['differentiate'].value_counts()

In [11]:
df['Status'].value_counts()

In [12]:
# Remove 'Survival Months' from the feature table.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=['Survival Months'], errors='ignore')

# DATA VISUALIZATION

* Numerical data

In [13]:
df.describe()

In [14]:
# Correlate only the numeric columns. df still contains string-valued columns
# here, and DataFrame.corr() raises on those with pandas >= 2.0; selecting the
# numeric dtypes first gives the same matrix on older pandas versions too.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [15]:
sns.histplot(data=df, x='Age', hue='Status', kde=True)

In [16]:
sns.boxplot(data=df, x='Status', y='Age')

In [17]:
sns.histplot(data=df, x='Tumor Size', hue='Status', kde=True)

In [18]:
sns.boxplot(data=df, x='Status', y='Tumor Size')

In [19]:
sns.histplot(data=df, x='Regional Node Examined', hue='Status', kde=True)

In [20]:
sns.boxplot(data=df, x='Status', y='Regional Node Examined')

In [21]:
sns.histplot(data=df, x='Reginol Node Positive', hue='Status', kde=True, bins=30)

In [22]:
sns.boxplot(data=df, x='Status', y='Reginol Node Positive')

* Categorical data

In [23]:
df.columns

In [24]:
sns.countplot(data=df, x='Race', hue='Status')

In [25]:
sns.countplot(data=df, x='Marital Status', hue='Status')

In [26]:
sns.countplot(data=df, x='N Stage', hue='Status')

In [27]:
sns.countplot(data=df, x='T Stage ', hue='Status')

In [28]:
sns.countplot(data=df, x='6th Stage', hue='Status')

In [29]:
# Larger canvas for this plot; the axes are created explicitly and handed to seaborn.
fig, ax = plt.subplots(figsize=(8, 8))
sns.countplot(x='differentiate', hue='Status', data=df, ax=ax)
plt.show()

In [30]:
sns.countplot(data=df, x='Grade', hue='Status')

In [31]:
sns.countplot(data=df, x='A Stage', hue='Status')

In [32]:
sns.countplot(data=df, x='Estrogen Status', hue='Status')

In [33]:
sns.countplot(data=df, x='Progesterone Status', hue='Status')

# DATA PREPROCESSING

In [34]:
df.columns

In [35]:
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [36]:
# Numeric predictors (column names spelled exactly as they are used elsewhere
# in this notebook, including 'Reginol').
num_cols = ['Age', 'Tumor Size', 'Regional Node Examined', 'Reginol Node Positive']

X_num = df[num_cols]
# Everything that is neither numeric nor the target is treated as categorical.
X_cat = df.drop(columns=num_cols + ['Status'])
y = df['Status']

In [37]:
# Encode the target with its own encoder so that `le` keeps the Status mapping
# afterwards (le.classes_ / le.inverse_transform stay usable on predictions).
# Reading from df rather than from y keeps the cell idempotent on re-run.
le = LabelEncoder()
y = le.fit_transform(df['Status'])

# One dedicated encoder per categorical column, kept in a dict so each
# column's integer codes can be mapped back to the original labels.
# The codes are always derived from the raw df column, never from a previous
# encoding of X_cat.
cat_encoders = {}
for col in X_cat.columns:
    cat_encoders[col] = LabelEncoder()
    X_cat[col] = cat_encoders[col].fit_transform(df[col])

In [38]:
X = pd.concat([X_num, X_cat], axis=1)

In [39]:
X.head()

In [40]:
# Standardise the numeric columns only, then re-attach the encoded
# categorical columns unchanged.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed later, so test rows contribute to the mean/std used for scaling.
scaler = StandardScaler()
X_scaled = pd.concat(
    [
        pd.DataFrame(
            scaler.fit_transform(X_num),
            index=X_num.index,
            columns=X_num.columns,
        ),
        X_cat,
    ],
    axis=1,
)
# The target is not scaled; the alias just pairs it with X_scaled.
y_scaled = y

In [41]:
X_scaled.head()

# MODEL LEARNING

* Original (unscaled) data

In [42]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import PrecisionRecallDisplay, classification_report
try:
    from sklearn.metrics import plot_precision_recall_curve
except ImportError:
    # Removed in scikit-learn 1.2; PrecisionRecallDisplay.from_estimator replaces it.
    pass

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

In [45]:
# Base estimators handed to GridSearchCV (which clones them for every fit).
# Each one gets a fixed random_state, using the same seed as train_test_split,
# so that a "Restart & Run All" reproduces the same best parameters and scores.
model_rl = LogisticRegression(solver='liblinear', random_state=1)
model_dt = DecisionTreeClassifier(random_state=1)
model_rf = RandomForestClassifier(random_state=1)
model_lgb = lgb.LGBMClassifier(objective='binary', random_state=1)

In [46]:
param_rl = {
    'penalty':['l1','l2'],
    'C':[0.01,0.1,0.5,0.9,1,5,10],
    'tol':[1e-4,1e-2,1,1e2]
}

In [47]:
GSRL1 = GridSearchCV(model_rl, param_rl, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRL1.fit(X_train, y_train)

In [48]:
print('best_param:', GSRL1.best_params_)
print('best_score:', GSRL1.best_score_)

In [49]:
pred_rl1 = GSRL1.predict(X_test)
print(classification_report(y_test, pred_rl1))

In [50]:
param_dt = {
    'criterion':['gini','entropy'],
    'min_samples_split':[2,5,10,15],
    'max_depth':[None,2],
    'min_samples_leaf':[1,3,10,15],
    'max_features':[None,'sqrt','log2']
}

In [51]:
GSDT1 = GridSearchCV(model_dt, param_dt, cv=5, scoring='accuracy', n_jobs=n_thread)
GSDT1.fit(X_train, y_train)

In [52]:
print('best_param:', GSDT1.best_params_)
print('best_score:', GSDT1.best_score_)

In [53]:
param_rf = {
    'n_estimators':[50,100],
    'criterion':['entropy'],
    'min_samples_split':[2,5],
    'max_depth':[None,2],
    'min_samples_leaf':[1,3,10],
    'max_features':['sqrt']
}

In [54]:
GSRF1 = GridSearchCV(model_rf, param_rf, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRF1.fit(X_train, y_train)

In [55]:
print('best_param:', GSRF1.best_params_)
print('best_score:', GSRF1.best_score_)

In [56]:
pred_rf1 = GSRF1.predict(X_test)
print(classification_report(y_test, pred_rf1))

In [57]:
param_lgb = {
    'num_leaves':[32,64,128],
    'colsample_bytree':[0.8,1],
    'n_estimators':[100,150]
}

In [58]:
GS_LGB1 = GridSearchCV(model_lgb, param_lgb, cv=5, scoring='accuracy', n_jobs=n_thread)
GS_LGB1.fit(X_train, y_train)

In [59]:
print('best_param:', GS_LGB1.best_params_)
print('best_score:', GS_LGB1.best_score_)

In [60]:
pred_lgb1 = GS_LGB1.predict(X_test)
print(classification_report(y_test, pred_lgb1))

In [61]:
# Overlay the precision-recall curves of the four tuned models on one axes,
# evaluated on the current X_test / y_test split.
# PrecisionRecallDisplay.from_estimator supersedes plot_precision_recall_curve,
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
for search in (GSRL1, GSDT1, GSRF1, GS_LGB1):
    PrecisionRecallDisplay.from_estimator(search.best_estimator_, X_test, y_test, ax=ax)
ax.set_title('Precision-recall curves')
plt.show()

* Standardized data

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.25, random_state=1)

In [63]:
GSRL2 = GridSearchCV(model_rl, param_rl, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRL2.fit(X_train, y_train)

In [64]:
print('best_param:', GSRL2.best_params_)
print('best_score:', GSRL2.best_score_)

In [65]:
pred_rl2 = GSRL2.predict(X_test)
print(classification_report(y_test, pred_rl2))

In [66]:
GSDT2 = GridSearchCV(model_dt, param_dt, cv=5, scoring='accuracy', n_jobs=n_thread)
GSDT2.fit(X_train, y_train)

In [67]:
print('best_param:', GSDT2.best_params_)
print('best_score:', GSDT2.best_score_)

In [68]:
pred_dt2 = GSDT2.predict(X_test)
print(classification_report(y_test, pred_dt2))

In [69]:
GSRF2 = GridSearchCV(model_rf, param_rf, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRF2.fit(X_train, y_train)

In [70]:
print('best_param:', GSRF2.best_params_)
print('best_score:', GSRF2.best_score_)

In [71]:
pred_rf2 = GSRF2.predict(X_test)
print(classification_report(y_test, pred_rf2))

In [72]:
GS_LGB2 = GridSearchCV(model_lgb, param_lgb, cv=5, scoring='accuracy', n_jobs=n_thread)
GS_LGB2.fit(X_train, y_train)

In [73]:
print('best_param:', GS_LGB2.best_params_)
print('best_score:', GS_LGB2.best_score_)

In [74]:
pred_lgb2 = GS_LGB2.predict(X_test)
print(classification_report(y_test, pred_lgb2))

In [75]:
# Overlay the precision-recall curves of the four tuned models on one axes,
# evaluated on the current X_test / y_test split.
# PrecisionRecallDisplay.from_estimator supersedes plot_precision_recall_curve,
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
for search in (GSRL2, GSDT2, GSRF2, GS_LGB2):
    PrecisionRecallDisplay.from_estimator(search.best_estimator_, X_test, y_test, ax=ax)
ax.set_title('Precision-recall curves')
plt.show()

**We got about 86% accuracy, but look at the recall: we need to balance the classes.**

# BALANCING CLASSES

* Original (unscaled) data

In [76]:
from imblearn.under_sampling import NearMiss

In [77]:
nm = NearMiss()

In [78]:
# Under-sample with NearMiss. The result overwrites X and y, so the original
# (unresampled) feature table and target are no longer available afterwards.
# NOTE(review): resampling happens before the train_test_split in the next
# cells, so the test split is drawn from resampled data rather than from the
# original class distribution.
X, y = nm.fit_resample(X, y)

In [79]:
ax = sns.countplot(x=y)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=1)

In [81]:
GSRL3 = GridSearchCV(model_rl, param_rl, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRL3.fit(X_train, y_train)

In [82]:
print('best_param:', GSRL3.best_params_)
print('best_score:', GSRL3.best_score_)

In [83]:
pred_rl3 = GSRL3.predict(X_test)
print(classification_report(y_test, pred_rl3))

In [84]:
GSDT3 = GridSearchCV(model_dt, param_dt, cv=5, scoring='accuracy', n_jobs=n_thread)
GSDT3.fit(X_train, y_train)

In [85]:
print('best_param:', GSDT3.best_params_)
print('best_score:', GSDT3.best_score_)

In [86]:
pred_dt3 = GSDT3.predict(X_test)
print(classification_report(y_test, pred_dt3))

In [87]:
GSRF3 = GridSearchCV(model_rf, param_rf, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRF3.fit(X_train, y_train)

In [88]:
print('best_param:', GSRF3.best_params_)
print('best_score:', GSRF3.best_score_)

In [89]:
pred_rf3 = GSRF3.predict(X_test)
print(classification_report(y_test, pred_rf3))

In [90]:
GS_LGB3 = GridSearchCV(model_lgb, param_lgb, cv=5, scoring='accuracy', n_jobs=n_thread)
GS_LGB3.fit(X_train, y_train)

In [91]:
print('best_param:', GS_LGB3.best_params_)
print('best_score:', GS_LGB3.best_score_)

In [92]:
pred_lgb3 = GS_LGB3.predict(X_test)
print(classification_report(y_test, pred_lgb3))

In [93]:
# Overlay the precision-recall curves of the four tuned models on one axes,
# evaluated on the current X_test / y_test split.
# PrecisionRecallDisplay.from_estimator supersedes plot_precision_recall_curve,
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
for search in (GSRL3, GSDT3, GSRF3, GS_LGB3):
    PrecisionRecallDisplay.from_estimator(search.best_estimator_, X_test, y_test, ax=ax)
ax.set_title('Precision-recall curves')
plt.show()

* Standardized data

In [94]:
# Under-sample the standardized data with the same NearMiss instance. The
# result overwrites X_scaled and y_scaled, so the unresampled versions are no
# longer available afterwards.
# NOTE(review): resampling happens before the train_test_split in the next
# cells, so the test split is drawn from resampled data rather than from the
# original class distribution.
X_scaled, y_scaled = nm.fit_resample(X_scaled, y_scaled)

In [95]:
ax = sns.countplot(x=y_scaled)

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size = 0.25, random_state=1)

In [97]:
GSRL4 = GridSearchCV(model_rl, param_rl, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRL4.fit(X_train, y_train)

In [98]:
print('best_param:', GSRL4.best_params_)
print('best_score:', GSRL4.best_score_)

In [99]:
pred_rl4 = GSRL4.predict(X_test)
print(classification_report(y_test, pred_rl4))

In [100]:
GSDT4 = GridSearchCV(model_dt, param_dt, cv=5, scoring='accuracy', n_jobs=n_thread)
GSDT4.fit(X_train, y_train)

In [101]:
print('best_param:', GSDT4.best_params_)
print('best_score:', GSDT4.best_score_)

In [102]:
pred_dt4 = GSDT4.predict(X_test)
print(classification_report(y_test, pred_dt4))

In [103]:
GSRF4 = GridSearchCV(model_rf, param_rf, cv=5, scoring='accuracy', n_jobs=n_thread)
GSRF4.fit(X_train, y_train)

In [104]:
print('best_param:', GSRF4.best_params_)
print('best_score:', GSRF4.best_score_)

In [105]:
pred_rf4 = GSRF4.predict(X_test)
print(classification_report(y_test, pred_rf4))

In [106]:
GS_LGB4 = GridSearchCV(model_lgb, param_lgb, cv=5, scoring='accuracy', n_jobs=n_thread)
GS_LGB4.fit(X_train, y_train)

In [107]:
print('best_param:', GS_LGB4.best_params_)
print('best_score:', GS_LGB4.best_score_)

In [108]:
pred_lgb4 = GS_LGB4.predict(X_test)
print(classification_report(y_test, pred_lgb4))

In [109]:
# Overlay the precision-recall curves of the four tuned models on one axes,
# evaluated on the current X_test / y_test split.
# PrecisionRecallDisplay.from_estimator supersedes plot_precision_recall_curve,
# which was deprecated in scikit-learn 1.0 and removed in 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
for search in (GSRL4, GSDT4, GSRF4, GS_LGB4):
    PrecisionRecallDisplay.from_estimator(search.best_estimator_, X_test, y_test, ax=ax)
ax.set_title('Precision-recall curves')
plt.show()

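**We found that the models performed much better with standard scaling and class balancing. Note that the data were resampled before the train/test split, so these scores are measured on a test set drawn from the resampled data and are not directly comparable with the scores obtained on the original class distribution.**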

# FEATURE IMPORTANCES

In [110]:
def _plot_ranked_bars(ax, values, names, title, xlabel, color):
    """Draw one horizontal bar per feature, sorted ascending by value.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    values : 1-D array-like with one score per feature.
    names : pandas Index of feature names aligned with `values`.
    title, xlabel : text for the panel title and the x-axis label.
    color : bar colour.

    Returns
    -------
    numpy.ndarray
        The argsort order that was applied to `values` and `names`.
    """
    values = np.asarray(values)
    order = np.argsort(values)
    # Plot the sorted names directly, so every bar carries its own label and
    # no separate tick relabelling is needed.
    ax.barh(names[order], values[order], .25, color=color)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    return order


feature_names = X_scaled.columns
fig, axes = plt.subplots(2, 2, figsize=(28, 15))

# Summing coef_ over its first axis yields the same 1-D coefficient vector
# that the built-in sum() over its rows produced.
coef = GSRL4.best_estimator_.coef_.sum(axis=0)
indices_rl = _plot_ranked_bars(
    axes[0, 0], coef, feature_names,
    'Feature Importances-Logistic Regressor Coef', 'Coefficient(Beta)', 'orange',
)

importances_dt = GSDT4.best_estimator_.feature_importances_
indices_dt = _plot_ranked_bars(
    axes[0, 1], importances_dt, feature_names,
    'Feature Importance-Decision Tree Classification', 'Relative Importances (MDI)', 'purple',
)

importances_rf = GSRF4.best_estimator_.feature_importances_
indices_rf = _plot_ranked_bars(
    axes[1, 0], importances_rf, feature_names,
    'Feature Importances-Random Forest Classification', 'Relative Importances (MDI)', 'lightgreen',
)

importances_lgb = GS_LGB4.best_estimator_.feature_importances_
indices_lgb = _plot_ranked_bars(
    axes[1, 1], importances_lgb, feature_names,
    'Feature Importances-LGBM Classification', 'Relative Importances (# of splits)', 'lightblue',
)

plt.show()