<h1 style='text-align: center'> Bladder Cancer Prediction Based on Clinical Laboratory Data </h1>

<p style="text-align: justify;">Bladder cancer is one of the most common urogenital cancers. It represents 5 to 10% of all male cancers globally, with a male-to-female ratio that varies from 2:1 to 6:1 in different regions. With over 573,000 new cases and 213,000 deaths each year, bladder cancer is the tenth most frequently diagnosed cancer worldwide. Men are more likely to develop it than women: the incidence and mortality rates for men are 9.5 and 3.3 per 100,000 respectively, which are almost 4 times higher than those for women globally. Smoking is thought to be a main risk factor for bladder cancer.</p>

<p style="text-align: justify;">
The majority of bladder cancer patients are identified through diagnostic procedures prompted by haematuria. Visible haematuria is one of the symptoms most closely associated with a diagnosis of bladder cancer; the 3-year positive predictive values are 74% (95% CI 68%-81%) for men and 34% (95% CI 29%-40%) for women. Patients without haematuria typically take longer to be diagnosed after experiencing symptoms (such as urgency or recurring infections).
</p>


In [None]:
%pip install lazypredict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from scipy.stats.mstats import winsorize

import lazypredict
from lazypredict.Supervised import LazyClassifier

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (
    AdaBoostClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.feature_selection import SelectKBest, VarianceThreshold, chi2, mutual_info_classif, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import (
    GridSearchCV,
    RepeatedStratifiedKFold,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    learning_curve,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    KBinsDiscretizer,
    LabelEncoder,
    MinMaxScaler,
    OneHotEncoder,
    OrdinalEncoder,
    PowerTransformer,
    StandardScaler,
)
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# import xgboost as x
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Load the raw dataset. The path points at a Kaggle input directory, so it has
# to be adjusted when the notebook is run anywhere else.
data = pd.read_csv('/kaggle/input/the-clinical-laboratory-data-of-bladder-cancer/The clinical laboratory data of bladder cancer .csv')

In [None]:
def missing_values_table(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns a transposed table with one column per
    affected feature and two rows: 'Missing Values' (count) and
    '% of Total Values' (share of rows, rounded to one decimal), ordered
    from the most to the least incomplete feature.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only the features that actually have gaps, worst first.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    n_columns = df.shape[1]
    n_incomplete = summary.shape[0]
    print(f"Our selected dataframe has {n_columns} columns.\n"
          f"There are {n_incomplete} columns that have missing values.")
    return summary.T

def missing_data(df, categorical_cols=None, continuous_cols=None):
    """Impute missing values: median for categorical columns, mean for continuous ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is left untouched; a filled copy is returned.
    categorical_cols : list of str, optional
        Names of the categorical columns. Defaults to the notebook-level
        ``categorical`` list, so existing ``missing_data(df)`` calls keep working.
    continuous_cols : list of str, optional
        Names of the continuous columns. Defaults to the notebook-level
        ``continuous`` list.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the gaps of the numeric columns filled.
    """
    if categorical_cols is None:
        categorical_cols = categorical
    if continuous_cols is None:
        continuous_cols = continuous

    filled = df.copy()

    # Object-typed categorical columns (text labels) have no median, so they are skipped.
    for col in filled[categorical_cols].select_dtypes(exclude=['object']).columns:
        filled[col] = filled[col].fillna(filled[col].median())

    # A mean only exists for numeric data; skipping anything else avoids a TypeError.
    for col in filled[continuous_cols].select_dtypes(include='number').columns:
        filled[col] = filled[col].fillna(filled[col].mean())
    return filled

In [None]:
missing_values_table(data)

In [None]:
data.Disease.value_counts()

In [None]:
data['gender'].value_counts(normalize=True).mul(100).astype(str) + '%'

In [None]:
data[data['gender'] == 1].age.min()

In [None]:
data['age'].max()

In [None]:
data['age'].min()

In [None]:
threshold = 0.5

# First keep the columns whose share of missing values is below the threshold,
# then - on the remaining columns - keep the rows that satisfy the same rule.
data = (
    data
    .loc[:, lambda frame: frame.isnull().mean() < threshold]
    .loc[lambda frame: frame.isnull().mean(axis=1) < threshold]
)


In [None]:
data.shape

In [None]:
# Number of distinct values per column, lowest cardinality first.
uniuqe = data.nunique().sort_values().to_frame(name='Unique number')

display(HTML("<div style='text-align:center; font-weight:Bold; font-size:16px'><br>Unique numbers</div>"))
# Transposed so that the features run along the columns.
display(uniuqe.T)

In [None]:
# Split the columns by cardinality. The two conditions are complementary so
# that every column lands in exactly one list; with the previous `> 10` / `< 10`
# pair a column with exactly 10 distinct values was in neither list and was
# therefore never imputed by missing_data().
CARDINALITY_CUTOFF = 10
continuous = [i for i in data.columns if data[i].nunique() >= CARDINALITY_CUTOFF]
categorical = [i for i in data.columns if data[i].nunique() < CARDINALITY_CUTOFF]

# missing_data() reads the two lists defined above.
data = missing_data(data.copy())
print(data.shape)

display(HTML("<div style='text-align:center; font-weight:Bold; font-size:20px'><br>Continuous Features</div>"))
display(data[continuous].sample(2))
display(HTML("<div style='text-align:center; font-weight:Bold; font-size:16px'><br>Descriptive Analysis of Continuous Features</div>"))
# iloc[1:] drops the 'count' row of describe().
display(data[continuous].describe().iloc[1:].style.set_properties(**{'border': '1px solid black'}))

display(HTML("<div style='text-align:center; font-weight:Bold; font-size:20px'><br>Categorical Features</div>"))
display(data[categorical].sample(2))
display(HTML("<div style='text-align:center; font-weight:Bold; font-size:16px'><br>Descriptive Analysis of Categorical Features</div>"))
display(data[categorical].describe().iloc[1:].style.set_properties(**{'border': '1px solid black'}))

In [None]:
X = data.copy()

In [None]:
# Reference range per lab feature as [low, high]; details() treats values
# inside the closed interval as "normal" and everything else as "abnormal".
# NOTE(review): neither the units nor the source of these ranges is recorded
# in the notebook - confirm them against the laboratory's reference intervals.
nor_abnor = {
    "A/G Ratio" : [1,2],
    "Alk" : [44, 147],
    "Creatinine": [0.59, 1.35],
    "Albumin" : [3.5, 5.4],
    "Urine epithelium count": [15, 20],
    "Calcium" : [8.5, 10.2],
}

In [None]:
def details(feature, low, high):
    """Display how normal and abnormal values of a feature split across diseases.

    Reads the notebook-level frame ``X``. A value is "normal" when
    ``low <= value <= high`` and "abnormal" when it is below ``low`` or above
    ``high``; missing values satisfy neither test and appear in neither table.

    Parameters
    ----------
    feature : str
        Column of ``X`` to inspect.
    low, high : float
        Bounds of the normal range (both inclusive).
    """
    display(HTML(f"<div style='text-align:center; font-weight:Bold; font-size:20px'><br>{feature}</div>"))
    display(HTML(f"<div style='text-align:center; font-weight:Bold; font-size:16px'>Normal range is {low} to {high}</div>"))
    display(X[[feature]].describe().T)

    in_range = (X[feature] >= low) & (X[feature] <= high)
    out_of_range = (X[feature] < low) | (X[feature] > high)

    # The headings used to be "<h3'>", which is not a valid tag.
    display(HTML("<h3>Normal</h3>"))
    print()
    display(X[in_range]['Disease'].value_counts())
    print()
    display(HTML("<h3>Abnormal</h3>"))
    print()
    display(X[out_of_range]['Disease'].value_counts())

In [None]:
# Show the normal/abnormal breakdown for every feature with a reference range.
for feature, (low, high) in nor_abnor.items():
    details(feature, low, high)

In [None]:
# One count plot per categorical feature, split by disease, on a 7 x 7 grid.
plt.figure(figsize=(35, 30))
for position, column in enumerate(categorical, start=1):
    plt.subplot(7, 7, position)
    # The column has to be passed as `x=`: recent seaborn releases take `data`
    # as the first positional parameter, so a positional column name together
    # with `data=X` raises a TypeError there.
    sns.countplot(x=column, hue='Disease', data=X)
    plt.xticks(rotation=45)

In [None]:
print(data.columns)

In [None]:
# Column groups used by the preprocessing cells below.
# NOTE(review): several names ('Hyper1en1ion', 'Diabe1es', 'Bee1leNu1',
# 'FamilyHis1ory', 'Urine epitheilum (UL)') look misspelled; presumably they
# mirror the CSV headers exactly - verify before renaming anything.

# Columns that are standardised with StandardScaler.
to_scale = ["A/G Ratio", "Albumin", "Chloride", "Potassium", "Sodium", "Specific Gravity", "Total Cholesterol", "Total Protein", ]
# Columns that are passed through PowerTransformer.
to_log = ['Alk', 'ALT (GPT)', 'AST (GOT)', 'BUN', 'Calcium', 'Creatinine', 'Direct Bilirubin', 'Estimated GFR', 'Glucose AC', 'Total Bilirubin', 'Triglyceride', 'Urine epitheilum (UL)', 
          'Urine epithelium count', 'Uric acid', 'age', ]
# Columns that are one-hot encoded with pd.get_dummies later on.
category = ['Nitrite', 'Urine occult Blood', 'pH', 'Strip WBC', 'Urine Bilirubin', 'Urine Glucose', 'Urine Ketone', 'Urine Protein', 'Urobilinogen', 'gender', 'Hyper1en1ion', 'Diabe1es', 'Smoking',
           'Drinking', 'Bee1leNu1', 'FamilyHis1ory']

In [None]:
# Standardise the `to_scale` columns of X in place.
# NOTE: the scaler is fitted on the full dataset, i.e. before the train/test
# split performed at the end of the notebook, so the held-out rows contribute
# to the scaling statistics.
ss = StandardScaler()

X[to_scale] = ss.fit_transform(X[to_scale])

# Trailing ';' suppresses the textual repr of the axes array.
X[to_scale].hist(figsize=(14, 14));

In [None]:
# Init
pt = PowerTransformer()

# Assign the transformed array directly, so values are written back by
# position. Wrapping it in pd.DataFrame(...) gave it a fresh 0..n-1 index that
# pandas aligned against X's index (which has gaps once sparse rows were
# dropped), turning unmatched rows into NaN; `columns=[to_log]` was also a
# nested list, i.e. not the plain column labels.
X[to_log] = pt.fit_transform(X[to_log])

X[to_log].hist(figsize=(14, 14));

In [None]:
# sns.histplot(data[continuous])

In [None]:
data[continuous].hist(figsize=(35, 35))
plt.show()

In [None]:
data[continuous].boxplot(figsize=(35, 10))
plt.show()

In [None]:
temp = X.copy()

In [None]:
ss = StandardScaler()
_ = ss.fit(X[['A/G Ratio']])

# Transform
# ravel() yields a 1-D array that is assigned by position; a new DataFrame
# would carry a 0..n-1 index and be aligned against temp's index, leaving NaN
# wherever the labels do not match.
# NOTE(review): 'A/G Ratio' is part of `to_scale` and was already standardised.
temp['A/G Ratio'] = ss.transform(X[['A/G Ratio']]).ravel()

# Plot
temp['A/G Ratio'].hist(figsize=(4, 4));

In [None]:
ss = StandardScaler()
_ = ss.fit(X[['Albumin']])

# Transform
# ravel() yields a 1-D array that is assigned by position; a new DataFrame
# would carry a 0..n-1 index and be aligned against temp's index, leaving NaN
# wherever the labels do not match.
# NOTE(review): 'Albumin' is part of `to_scale` and was already standardised.
temp['Albumin'] = ss.transform(X[['Albumin']]).ravel()

# Plot
temp['Albumin'].hist(figsize=(4, 4));

In [None]:
# Init
pt = PowerTransformer()

# The array is assigned directly (by position); a freshly built DataFrame has
# a 0..n-1 index and would be aligned against X's index, producing NaN for
# every label that does not match.
# NOTE(review): 'Albumin' was already standardised with the `to_scale`
# columns, so this stacks a second transform on top of it.
X[["Albumin"]] = pt.fit_transform(X[["Albumin"]])

X[["Albumin"]].hist(figsize=(14, 5));

In [None]:
sns.kdeplot(x="Albumin", hue='Disease', data=X)

In [None]:
# Init
pt = PowerTransformer()

# The array is assigned directly (by position); a freshly built DataFrame has
# a 0..n-1 index and would be aligned against X's index, producing NaN for
# every label that does not match.
# NOTE(review): 'Creatinine' is in `to_log` and was already power-transformed
# with that group, so this applies the transform a second time.
X[["Creatinine"]] = pt.fit_transform(X[["Creatinine"]])

X[["Creatinine"]].hist(figsize=(14, 5));

In [None]:
X[["Creatinine"]].min()

In [None]:
# Take the log of the raw measurements in `data`: temp['Alk'] is a copy of the
# power-transformed column, which is centred on zero, so np.log() of it is NaN
# for every non-positive value.
sns.boxplot(x=np.log(data['Alk']));

In [None]:
# Take the log of the raw measurements in `data`: X['Alk'] has already been
# power-transformed (centred on zero), so np.log() of it is NaN for every
# non-positive value.
sns.kdeplot(x=np.log(data['Alk']));

In [None]:
data[["pH"]].hist(figsize=(14, 5));

In [None]:
# Init
pt = PowerTransformer()

# The array is assigned directly (by position); a freshly built DataFrame has
# a 0..n-1 index and would be aligned against X's index, producing NaN for
# every label that does not match.
# NOTE(review): 'Alk' is in `to_log` and was already power-transformed with
# that group, so this applies the transform a second time.
X[["Alk"]] = pt.fit_transform(X[["Alk"]])

X[["Alk"]].hist(figsize=(14, 5));

In [None]:
continuous

In [None]:
# Init
pt = PowerTransformer()

# The array is assigned directly (by position); a freshly built DataFrame has
# a 0..n-1 index and would be aligned against X's index, producing NaN for
# every label that does not match.
# NOTE(review): 'AST (GOT)' is in `to_log` and was already power-transformed
# with that group, so this applies the transform a second time.
X[["AST (GOT)"]] = pt.fit_transform(X[["AST (GOT)"]])

X[["AST (GOT)"]].hist(figsize=(14, 5));

In [None]:
# Init
pt = PowerTransformer()

# The array is assigned directly (by position); a freshly built DataFrame has
# a 0..n-1 index and would be aligned against X's index, producing NaN for
# every label that does not match.
# NOTE(review): 'pH' is listed in `category` (one-hot encoded later from
# `data`), yet it is treated as a continuous feature here - confirm the intent.
X[["pH"]] = pt.fit_transform(X[["pH"]])

X[["pH"]].hist(figsize=(14, 5));

In [None]:
X[["AST (GOT)", "Alk", "pH", "ALT (GPT)"]].var()

In [None]:
# Init
pt = PowerTransformer()

# The array is assigned directly (by position); a freshly built DataFrame has
# a 0..n-1 index and would be aligned against X's index, producing NaN for
# every label that does not match.
# NOTE(review): 'ALT (GPT)' is in `to_log` and was already power-transformed
# with that group, so this applies the transform a second time.
X[["ALT (GPT)"]] = pt.fit_transform(X[["ALT (GPT)"]])

X[["ALT (GPT)"]].hist(figsize=(14, 5));

In [None]:
from scipy.stats.mstats import winsorize

# Winsorize the log of the raw creatinine values: limits=[0.1, 0.2] replaces
# the lowest 10% and the highest 20% of the values with the nearest retained
# value. `a` is inspected by the following cells.
a = winsorize(np.log(data['Creatinine']), limits=[0.1, 0.2])

In [None]:
sns.boxplot(a)

In [None]:
pd.DataFrame(a).hist()

In [None]:
sns.kdeplot(np.log(data['pH']))

In [None]:
X['Alk'].max()

In [None]:
a.max()

In [None]:
# temp = x[['pH', 'Alk']]
# temp.corr()

In [None]:
temp.iloc[:, :-1]

In [None]:
def destribution_scale(df):
    """Return a copy of ``df`` whose 'Alk' column is replaced by its natural log.

    The input frame is not modified.
    """
    log_alk = np.log(df['Alk'])
    return df.assign(Alk=log_alk)

In [None]:
def outlier_remove(df):
    """Return a copy of ``df`` with the 'pH' column winsorized.

    The lowest 10% and the highest 20% of the 'pH' values are replaced by the
    nearest retained value. The input frame is not modified.
    """
    clipped_ph = winsorize(df['pH'], limits=[0.1, 0.2])
    return df.assign(pH=clipped_ph)

In [None]:
# Integer code for each disease label.
Disease = {
    "UrinaryBladder": 1,
    "Prostate": 2,
    "Kidney": 3,
    "Uterus": 4,
    "Cystitis": 5    
}

# Adds the encoded target as a new column; any label that is not a key of the
# mapping above becomes NaN. From here on `data` contains the target twice
# ('Disease' and 'Disease1').
data['Disease1'] = data['Disease'].map(Disease)

In [None]:
X[to_scale].mean()

# Feature Engineering

In [None]:
print(X.columns)

In [None]:
g = sns.FacetGrid(X, col="Disease", row="Diabe1es", margin_titles=True)
g.map(plt.scatter,"Urine epithelium count", "Total Bilirubin", edgecolor="w")
plt.subplots_adjust(top=0.9)

In [None]:
# x and y must be keywords: recent seaborn releases take `data` as the first
# positional parameter, so positional column names combined with `data=` fail.
sns.jointplot(x="Total Bilirubin", y="Direct Bilirubin", data=data, kind="reg");

In [None]:
g = sns.FacetGrid(data, col="pH", row="Disease", margin_titles=True)
g.map(plt.scatter,"Total Protein", "Albumin",  edgecolor="w")
plt.subplots_adjust(top=0.9)

In [None]:
data.groupby('Disease').mean()

In [None]:
X.groupby('Disease').mean()

In [None]:
stats = data.copy()

In [None]:
# Mean, maximum and minimum of ALT (GPT) within each disease group.
stats = (
    data.groupby('Disease')['ALT (GPT)']
    .agg(['mean', 'max', 'min'])
    .add_suffix('_ALT (GPT)')
)

# Attach the group-level statistics to every row of the dataset.
stats = data.merge(stats, left_on='Disease', right_index=True, how='left')

In [None]:
# Correlate the numeric columns only: `stats` still holds the string-valued
# 'Disease' column, which DataFrame.corr() rejects on pandas >= 2.0.
corr = stats.select_dtypes(include='number').corr()

# Set up the matplotlib plot configuration
f, ax = plt.subplots(figsize=(30, 30))

# Configure a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap on the axes created above, using the colormap that was
# previously built but never passed on. The symmetric value range puts zero
# correlation at the centre of the diverging palette.
sns.heatmap(corr, annot=True, cmap=cmap, vmin=-1, vmax=1, ax=ax);


# Feature selection

In [None]:
# Variance filter on mean-normalised features.
#
# Dividing by the mean only makes variances comparable for data on its
# original scale, so the filter is computed on the imputed raw values in `data`.
# The previous version divided `temp` (standardised / power-transformed, i.e.
# column means of about 0) by its mean and still contained the string target
# 'Disease'. The encoded target 'Disease1' is excluded as well.
raw_features = (
    data.select_dtypes(include='number')
    .drop(columns=['Disease1'], errors='ignore')
)
# A zero mean cannot be normalised (division by zero), so such columns are left out.
raw_features = raw_features.loc[:, raw_features.mean() != 0]
normalized_features = raw_features / raw_features.mean()

# Init, fit VT
vt = VarianceThreshold(threshold=.9)
_ = vt.fit(normalized_features)

# Get a boolean mask
mask = vt.get_support()

# Subset the preprocessed feature frame by the names of the surviving columns.
X_reduced = X.loc[:, raw_features.columns[mask]]
X_reduced.shape

In [None]:
X.to_csv("bladder.csv", index=False)

In [None]:
plt.figure(figsize=(26, 26))

# X still contains the string-valued 'Disease' column; restrict to numeric
# columns so DataFrame.corr() also works on pandas >= 2.0.
sns.heatmap(X.select_dtypes(include='number').corr(), vmin=-1, vmax=1, annot=True);

In [None]:
temp = pd.get_dummies(data, columns = category)

In [None]:
temp

In [None]:
# Target: the disease label.
y = X["Disease"]
# Alternative feature sets that were tried:
# x = X.drop(labels = ["Disease"], axis = 1)
# x = X[category + to_log + to_scale]
# Features: the columns kept by the variance-threshold step.
x = X_reduced

# Optional extra preprocessing (currently disabled):
# x = destribution_scale(x)
# x = outlier_remove(x)

In [None]:
# Fixed seed so that the split - and therefore the model ranking - is the same
# on every run.
RANDOM_STATE = 42

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Fit the LazyClassifier model collection and rank the models by hold-out accuracy.
clf = LazyClassifier(verbose=0, ignore_warnings=True)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
models['Accuracy'].sort_values(ascending=False)