In [None]:
# schiller --> Schiller's test: iodine is applied to the cervix and abnormal cells stain differently from healthy ones
# hinselmann --> colposcopy: examination of the cervix with a colposcope
# citology --> cytology (Pap smear)
# biopsy --> biopsy
# These columns hold the RESULTS of these tests, assuming they were carried out. 1 means a suspicious screening result, 0 means a normal one.

# dx columns mean previous cervical diagnosis 
# stds_number is the sum over all std columns
# Can't figure out what stds_n_diagnosis is, though.

In [None]:
import gc 
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_validate, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, RocCurveDisplay, PrecisionRecallDisplay, fbeta_score, make_scorer
from sklearn.impute import MissingIndicator, SimpleImputer
import matplotlib.pyplot as plt
import miceforest as mf
from miceforest import mean_match_default
import seaborn as sns
from lightgbm import LGBMClassifier
import inspect 
import warnings
import scipy.stats as stats
from tempfile import mkdtemp
from joblib import Memory
from shutil import rmtree
from mice_imputer import *
import prince as pr
import pickle 

In [None]:
# Download the risk_factors_cervical_cancer CSV from the UCI archive.
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv"
df = pd.read_csv(data_url)

# Encode missings

In [None]:
# Every literal "?" in the frame becomes pd.NA.
missing_markers = {"?": pd.NA}
df = df.replace(missing_markers)

# Rename columns to be more manageable

In [None]:
# Show the column names as they come from the file, before renaming.
df.columns.values

In [None]:
new_names = df.columns

# Ordered substitutions applied to the raw column names.
# Order matters: "Number" must be handled before "Num" (otherwise "Number"
# would turn into "nber"), and "__" comes last so the double underscores
# created by the earlier steps are collapsed.
to_rep = {
    "Number" : "n",
    "Contraceptives" : "bc",
    "Num" : "n",
    "-" : "_",
    "of" : "",
    " " : "_",
    "(" : "",
    ")" : "",
    "/" : "_",
    ":" : "_",
    "__" : "_"
}

for key, value in to_rep.items():
    # Literal replacement on purpose: none of the keys needs regex semantics,
    # and "(" / ")" are not valid regular expressions on their own, so
    # regex=True raises re.error on pandas versions that no longer treat
    # single-character patterns as literals.
    new_names = new_names.str.replace(key, value, regex = False)

new_names = new_names.str.lower()

df = df.set_axis(new_names, axis = 1)

df.columns.values

In [None]:
# Pass each row (axis = 1) through pd.to_numeric, then let convert_dtypes()
# choose dtypes for the resulting columns.
# NOTE(review): the row-wise apply is kept exactly as written because the
# author reports that convert_dtypes() alone did not work here.
df = df.apply(pd.to_numeric, axis = 1).convert_dtypes() # convert_dtypes not working without the apply() call. Probably due to the earlier replace statement, but fiddled for an hour and no dice.

# Verifies that the count of stds is the sum over all std columns. 

In [None]:
# Display every column whose name starts with "stds_".
std_column_mask = df.columns.str.startswith("stds_")
df.loc[:, std_column_mask]

In [None]:
# Columns that start with "stds_" but are excluded from the row-wise sum.
non_flag_cols = ["stds_time_since_first_diagnosis", "stds_time_since_last_diagnosis", "stds_n_diagnosis", "stds_number"]
std_flags = df.loc[:, df.columns.str.startswith("stds_")].drop(columns = non_flag_cols)

# Compare the row-wise sum with stds_number; rows where the comparison is
# missing are dropped before the check.
count_matches = (std_flags.sum(axis = 1) == df.stds_number).dropna()
all(count_matches)

# Drop n_diagnosis col

In [None]:
# Remove the stds_n_diagnosis column.
df = df.drop(columns = ["stds_n_diagnosis"])

# Check missingness 

In [None]:
# Fraction of missing values per column, largest first.
missing_share = df.isna().mean()
missing_share.sort_values(ascending = False)

# Drop time since std diagnoses

In [None]:
# Remove every column whose name starts with "stds_time".
stds_time_cols = df.columns[df.columns.str.startswith("stds_time")]
df = df.drop(columns = stds_time_cols)

# Check for constant columns

In [None]:
# True for every column that holds exactly one distinct value
# (DataFrame.nunique does not count missing entries by default).
const = df.nunique() == 1

if const.any():
    constant_names = df.columns.values[const]
    print("Deleting constant columns: {}".format(constant_names))
    df = df.drop(columns = constant_names)

In [None]:
# Display the frame after the drops performed in the cells above.
df

# Iud/smoking years are always >0 if you have an IUD/smoke.

In [None]:
# Is there any row flagged as having an IUD but with zero IUD years?
has_iud = df.iud == 1
zero_iud_years = df.iud_years == 0
np.any(has_iud & zero_iud_years)

In [None]:
# Is there any row flagged as smoking but with zero smoking years?
is_smoker = df.smokes == 1
zero_smoke_years = df.smokes_years == 0
np.any(is_smoker & zero_smoke_years)

# PCA on STD Columns

In [None]:
# Complete cases of the "stds_" columns, without the stds_number count.
std_mask = df.columns.str.startswith("stds_")
stds = df.loc[:, std_mask].drop(columns = "stds_number").dropna()

In [None]:
# Standardise the STD columns first ...
scaler = StandardScaler()
stds_s = scaler.fit_transform(stds)

# ... then fit a PCA with all components on the scaled values.
pc = PCA()
pc.fit(stds_s)

In [None]:
# Share of variance explained by each component, and its running total.
eigs = pc.explained_variance_ratio_
eigs_cum = eigs.cumsum()

# 1-based component numbers, used as the x axis of the plots.
ind = list(range(1, len(eigs) + 1))
print(eigs_cum)

In [None]:
# Scree plot: per-component explained-variance ratio ...
plt.plot(ind, eigs)
# ... and the cumulative ratio on the same axes.
plt.plot(ind, eigs_cum)

In [None]:
# Explained variance of the first three components.
pc.explained_variance_[:3]

In [None]:
# Labels for the components shown in the table; their number sets the slice.
pc_labels = ['PC1', 'PC2', 'PC3', "PC4", "PC5", "PC6"]
n_shown = len(pc_labels)

# Loadings: component weights scaled by the square root of the matching
# explained variance.
component_scale = np.sqrt(pc.explained_variance_[:n_shown])
loadings = pc.components_[:n_shown].T * component_scale

loadmat = pd.DataFrame(
    np.round(loadings, 4),
    columns=pc_labels,
    index=stds.columns.values,
)
loadmat


# Train/Test Data

In [None]:
# Columns left out of the predictor matrix (the target "biopsy" included).
excluded_cols = ["smokes", "hormonal_bc", "iud", "stds", "schiller", "biopsy", "hinselmann"]
x = df.drop(columns = excluded_cols)

# Re-insert the STD count under the name n_stds; pop + assign places it as
# the last column of x.
x["n_stds"] = x.pop("stds_number")
#x.drop(x.columns.values[x.columns.str.startswith("stds")], axis = 1, inplace = True)

# Target: the biopsy result, kept as a one-column frame of plain int64.
y = df[["biopsy"]].astype("int64")


In [None]:
# Inspect the dtypes of the predictor columns before casting.
x.dtypes

In [None]:
# Cast the columns selected as 'Int64' / 'Float64' to plain float.
nullable_cols = x.select_dtypes(include=['Int64', 'Float64']).columns.values
x[nullable_cols] = x[nullable_cols].astype('float')
# x[x.columns.values[x.columns.str.startswith("stds_")]] = x[x.columns.values[x.columns.str.startswith("stds_")]].astype("bool")
# x[x.columns.values[x.columns.str.startswith("dx")]] = x[x.columns.values[x.columns.str.startswith("dx")]].astype("bool")
# x["citology"] = x.citology.astype("bool")
x.dtypes

In [None]:
# Seeded hold-out split with train_test_split's default split sizes.
# NOTE(review): in the cells visible here the nested cross-validation is run
# on the full x / y, not on this split — confirm that is intended.
split_seed = 987417
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = split_seed)

In [None]:
# Fraction of missing values in stds_condylomatosis.
x["stds_condylomatosis"].isna().mean()

In [None]:
# Shapes of the two target parts, then a preview of the training predictors.
print(*(part.shape for part in (y_train, y_test)))
x_train.head()

In [None]:
# Negative binomial parameters; the same n / p / loc values are used for the
# *_n_estimators and lgb_iterations entries of the search grid.
n = 2
p = .0075
# x = stats.nbinom.rvs(n = n, p=p, size = 10000)
# plt.hist(x,density=True, bins = 100)

# Integer-valued support points in [0, 1000] and the pmf (shifted by loc = 1).
xl = np.floor(np.linspace(0, 1000, 1000))
shifted_nbinom = stats.nbinom(n = n, p = p, loc = 1)
plt.plot(xl, shifted_nbinom.pmf(xl))

In [None]:
# Summary statistics of 10,000 unseeded draws from the shifted negative binomial.
nbinom_draws = stats.nbinom.rvs(n = n, p = p, loc = 1, size = 10000)
pd.DataFrame(nbinom_draws).describe()

In [None]:
# Density of Beta(1.75, 8) on [0, 1]; the same distribution is used for
# lgb_learning_rate in the search grid.
xl = np.linspace(0, 1, 1000)
beta_density = stats.beta(1.75, 8).pdf(xl)
plt.plot(xl, beta_density)

In [None]:
# Binary LightGBM classifier with balanced class weights.
lgbm_settings = {"objective": "binary", "class_weight": 'balanced'}
clf = LGBMClassifier(**lgbm_settings)

In [None]:
# Default per-column setting.
impute_grid_template = {
    "objective" : "poisson"
}

# Names of the columns of x that contain at least one missing value.
keys = x.columns.values[x.isna().any()]

# One independent settings dict per such column; columns whose name contains
# "stds_" get the "binary" objective instead of the template's.
impute_grid = {
    col: {**impute_grid_template, "objective": "binary"} if "stds_" in col else dict(impute_grid_template)
    for col in keys
}

In [None]:
# Inspect the per-column settings.
# NOTE(review): impute_grid is not referenced by any later cell visible in
# this notebook — confirm whether it was meant to be passed to mice_imputer.
impute_grid

In [None]:
# Take a copy of miceforest's default mean-matching scheme and set its
# candidate count to 5 (presumably copied so the shared default object is not
# modified — confirm against the miceforest documentation).
mean_match = mean_match_default.copy()
mean_match.set_mean_match_candidates(5)

In [None]:
# Missingness flag derived from stds_hpv only.
# remainder='drop' because this transformer is one branch of a FeatureUnion:
# with 'passthrough' it would also emit every other column of the input
# unchanged, so the union would carry a second, un-imputed copy of each
# feature next to the imputed ones.
stds_indicator = ColumnTransformer(
    [("indicator", MissingIndicator(), ["stds_hpv"])],
    remainder='drop'
)

# Imputed features first, followed by the missingness flag.
simple_union = FeatureUnion(
    transformer_list=[
         ('features', SimpleImputer(strategy='median')),
         ('indicator', stds_indicator)]
)

# Same layout, with the project's mice_imputer in place of SimpleImputer.
mice_union = FeatureUnion(
    transformer_list=[
         ('features', mice_imputer(mean_match_scheme = mean_match)),
         ('indicator', stds_indicator)]
)

# Integer positions of the STD columns inside the model matrix `x`.
# The positions have to come from `x.columns`: the pipeline is fit on `x`,
# whose columns differ from `df` in both content and order, so positions taken
# from `df.columns` would select the wrong features for the PCA.
# Assumes both imputers return the columns of `x` in their original order —
# TODO confirm for mice_imputer.
std_cols = np.where(x.columns.str.startswith("stds"))[0]

# PCA on the STD columns; every other column of the union output is passed on.
pca_stds = ColumnTransformer(
    [("pca", PCA(n_components = 5),  std_cols)],
    remainder = "passthrough"
)


In [None]:
# Temporary directory backing the joblib cache handed to the pipeline.
cachedir = mkdtemp()
memory = Memory(location=cachedir, verbose=0)

# Imputation -> PCA on the STD columns -> classifier.
pipeline_steps = [
    ("imputer", simple_union),
    ("pca", pca_stds),
    ("classifier", clf),
]
pipe = Pipeline(steps = pipeline_steps, memory = memory)

In [None]:
# Distributions shared by both imputation branches (PCA size and classifier).
shared_space = {
    "pca__pca__n_components" : stats.randint(1, 8),
    "classifier__n_estimators" : stats.nbinom(n = 2, p = .0075, loc = 1),
    "classifier__max_depth" : stats.randint(1, 10),
    "classifier__learning_rate" : stats.beta(1.5, 9),
    "classifier__min_child_samples" : stats.randint(3, 75),
    "classifier__cat_smooth" : stats.uniform(0, 25),
}

# Settings that only exist on the mice_imputer branch.
mice_space = {
    "imputer__features__mice_iterations" : stats.randint(5, 20),
    "imputer__features__lgb_iterations" : stats.nbinom(n = 2, p = .0075, loc = 1),
    "imputer__features__lgb_learning_rate" : stats.beta(1.75, 8),
    "imputer__features__lgb_max_depth" : stats.randint(1, 10),
    "imputer__features__lgb_cat_smooth" : stats.uniform(0, 25),
    "imputer__features__lgb_feature_fraction_bynode" : stats.uniform(0, 1),
}

# One dict per imputation strategy; the search draws from one of them per
# candidate.
grid = [
    {
        "imputer" : [simple_union],
        "imputer__features__strategy" : ["mean", "median"],
        **shared_space,
    },
    {
        "imputer" : [mice_union],
        **shared_space,
        **mice_space,
    },
]

In [None]:
# Inner loop (hyper-parameter search): one shuffled, stratified 5-fold split.
inner_cv = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = 874841,
)

# Outer loop (performance estimate): stratified 5-fold, repeated 5 times.
outer_cv = RepeatedStratifiedKFold(
    n_splits = 5,
    n_repeats = 5,
    random_state = 878571,
)

In [None]:
# Run a full garbage-collection pass.
gc.collect()

In [None]:
# F-beta scorer with beta = 3.
# NOTE(review): f3_scorer is not passed to either call below; the inner search
# is tuned on plain "recall". Confirm that this is intended.
f3_scorer = make_scorer(fbeta_score, beta = 3)

# Inner loop: randomized search over `grid`, refit on the best candidate.
rcv = RandomizedSearchCV(
    pipe,
    grid,
    n_iter = 2000,
    scoring = "recall",
    cv = inner_cv,
    refit = True,
    return_train_score = True,
    n_jobs = 1,
    random_state = 97417,
)

# Metrics reported for every outer fold.
outer_metrics = ["average_precision", "balanced_accuracy", "f1", "precision", "recall"]

# Outer loop: the whole search is re-run inside each outer fold, and the
# fitted search objects are kept via return_estimator.
nested_scores = cross_validate(
    rcv,
    x,
    y.to_numpy().ravel(),
    scoring = outer_metrics,
    cv = outer_cv,
    return_estimator = True,
    n_jobs = 19,
    verbose = 999,
)


In [None]:
# Best-effort removal of the temporary joblib cache directory.
# ignore_errors=True keeps filesystem failures non-fatal, without a bare
# `except:` that would also hide unrelated problems such as a
# KeyboardInterrupt or a misspelled name.
rmtree(cachedir, ignore_errors=True)

gc.collect()

In [None]:
def save_obj(obj, filename):
    """Pickle ``obj`` to ``filename`` using the highest pickle protocol.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    filename : str or os.PathLike
        Destination file. A leading ``~`` is expanded to the user's home
        directory, because the built-in ``open`` would otherwise treat it as
        a literal directory name. An existing file is overwritten.

    Raises
    ------
    OSError
        If the destination cannot be opened for writing, for example because
        its parent directory does not exist.
    """
    import os  # local import: the notebook's import cell does not bring in os

    target = os.path.expanduser(filename)
    with open(target, 'wb') as outp:  # Overwrites any existing file.
        pickle.dump(obj, outp, pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the nested cross-validation results.
# NOTE(review): this path starts with "~"; confirm that save_obj expands it,
# since the built-in open() does not.
results_path = "~/gdrive/github/cervical_cancer/rcv.pkl"
save_obj(nested_scores, results_path)

In [None]:
#best_models = nested_scores['estimator']
# mn = nested_scores["test_score"].mean()
# st = nested_scores["test_score"].std()
# [mn - 1.96*st, mn + 1.96 * st]
#for i, model in enumerate(best_models):
#     #print(model.best_estimator_)
      #print(model.best_params_)
#     print(model.best_score_)