In [None]:
# schiller test --> iodine solution that stains abnormal cells on the cervix differently
# hinselmann --> colposcopy (examining the cervix with a scope)
# cytology --> pap smear
# biopsy --> biopsy 
# These indicate the RESULTS of these tests, assuming they were carried out. 1 means suspect screening, 0 means okay screening. 

# dx columns mean previous cervical diagnosis 
# stds_number is the sum over all std columns
# Can't figure out what stds_n_diagnosis is, though. 

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import classification_report, RocCurveDisplay, PrecisionRecallDisplay
from sklearn.impute import MissingIndicator, SimpleImputer
import matplotlib.pyplot as plt
import miceforest as mf
from miceforest import mean_match_default
import seaborn as sns
from lightgbm import LGBMClassifier
import inspect 

In [None]:
# Load the raw dataset from a CSV file in the current working directory.
df = pd.read_csv("./data.csv")

# Encode missings

In [None]:
# Replace every "?" placeholder (the missing-value marker this notebook assumes) with pd.NA.
df = df.replace({"?": pd.NA})

# Rename columns to be more manageable

In [None]:
# Show the original column names before they are renamed.
df.columns.values

In [None]:
# Build short snake_case column names by applying literal substitutions in order.
# Order matters: spaces and punctuation are turned into underscores first, and the
# final "__" -> "_" entry collapses the doubled underscores that this produces.
new_names = df.columns
to_rep = {
    "Number" : "n",
    "Contraceptives" : "bc",
    "Num" : "n",
    "-" : "_",
    "of" : "",
    " " : "_",
    "(" : "",
    ")" : "",
    "/" : "_",
    ":" : "_",
    "__" : "_"

}

for key, value in to_rep.items():
    # Every key is a plain substring, so match literally. "(" and ")" are not valid
    # regular expressions on their own; with regex = True recent pandas versions hand
    # them to `re` and fail with "missing ), unterminated subpattern".
    new_names = new_names.str.replace(key, value, regex = False)

new_names = new_names.str.lower()

df = df.set_axis(new_names, axis = 1)

df.columns.values


In [None]:
# Parse each column to a numeric dtype, then switch to pandas' nullable dtypes.
# The to_numeric pass is needed first: per the original note, convert_dtypes() on its
# own did not produce numeric columns after the "?" replacement.
# Applied column-wise (the default axis) rather than row-wise: one call per column
# instead of one per row, with the same resulting dtypes after convert_dtypes().
df = df.apply(pd.to_numeric).convert_dtypes()

# Verifies that the count of stds is the sum over all std columns. 

In [None]:
# Display every column whose name starts with "stds_".
df.loc[:, df.columns.str.startswith("stds_")]

In [None]:
# Row-wise sum of the individual "stds_" columns (excluding the time-since and count
# columns) compared against stds_number; rows where the comparison is missing are ignored.
all(
    (
        df.loc[:, df.columns.str.startswith("stds_")]
        .drop(
            columns = [
                "stds_time_since_first_diagnosis",
                "stds_time_since_last_diagnosis",
                "stds_n_diagnosis",
                "stds_number",
            ]
        )
        .sum(axis = 1)
        == df.stds_number
    ).dropna()
)

# Drop n_diagnosis col

In [None]:
# Remove the stds_n_diagnosis column from df in place.
df.drop(columns = ["stds_n_diagnosis"], inplace = True)

# Check missingness 

In [None]:
# Fraction of missing values per column, largest first.
df.isna().mean().sort_values(ascending = False)

# Drop time since std diagnoses

In [None]:
# Remove, in place, every column whose name starts with "stds_time".
df.drop(columns = df.columns[df.columns.str.startswith("stds_time")], inplace = True)

# Check for constant columns

In [None]:
# A column is "constant" when it holds exactly one distinct non-missing value.
const = df.nunique().eq(1)

if const.any():
    constant_cols = df.columns.values[const]
    print("Deleting constant columns: {}".format(constant_cols))
    df.drop(columns = constant_cols, inplace = True)

# Iud/smoking years are always >0 if you have an IUD/smoke.

In [None]:
# Rows flagged as having an IUD but with zero IUD years.
df[df.iud.eq(1) & df.iud_years.eq(0)]

In [None]:
# Rows flagged as smokers but with zero smoking years.
df[df.smokes.eq(1) & df.smokes_years.eq(0)]

# PCA on STD Columns

In [None]:
# Complete-case frame of the "stds_" columns, without the stds_number count column.
stds = (
    df.loc[:, df.columns.str.startswith("stds_")]
    .drop(columns = "stds_number")
    .dropna()
)

In [None]:
# Standardise the STD columns, then fit a PCA with all components on the scaled data.
scaler = StandardScaler()
stds_s = scaler.fit_transform(stds)

pc = PCA()
pc.fit(stds_s)

In [None]:
# Per-component explained-variance ratio, its running total, and 1-based component numbers.
eigs = pc.explained_variance_ratio_
eigs_cum = eigs.cumsum()
ind = list(range(1, len(eigs) + 1))
print(eigs_cum)

In [None]:
# Scree plot: per-component and cumulative explained-variance ratio.
# Labels and a legend are added so the two curves can be told apart when skimming.
fig, ax = plt.subplots()
ax.plot(ind, eigs, label = "Explained variance ratio")
ax.plot(ind, eigs_cum, label = "Cumulative explained variance ratio")
ax.set(
    xlabel = "Principal component",
    ylabel = "Proportion of variance",
    title = "PCA on STD columns",
)
ax.legend();

In [None]:
# Explained variance of the first three components.
pc.explained_variance_[:3]

In [None]:
# Loadings of the first three components: component vectors scaled by the square root
# of their explained variance, one row per original STD column.
loadings = pc.components_[:3].T * np.sqrt(pc.explained_variance_[:3])

loadmat = pd.DataFrame(
    loadings.round(4),
    index = stds.columns.values,
    columns = ['PC1', 'PC2', 'PC3'],
)
loadmat


In [None]:
# Feature frame: df without the columns listed below.
# NOTE(review): the cytology result column described in the notebook's header notes is
# not in this drop list, so (if present in df) it stays in x as a predictor — confirm
# that this is intended.
x = df.drop(
    columns = ["smokes", "hormonal_bc", "iud", "stds", "schiller", "biopsy", "hinselmann"]
)
# Copy the STD count to a name that survives the "stds*" drop on the next line.
x["n_stds"] = x["stds_number"]
x.drop(columns = x.columns[x.columns.str.startswith("stds")], inplace = True)

# Target: the biopsy column as a one-column int64 frame.
y = df[["biopsy"]].astype("int64")


In [None]:
# Inspect feature dtypes before casting.
x.dtypes

In [None]:
# Cast the pandas nullable Int64/Float64 columns to plain float.
nullable_numeric = x.select_dtypes(include=['Int64', 'Float64'])
x[nullable_numeric.columns.values] = nullable_numeric.astype('float')

# Treat every column whose name starts with "dx" as categorical.
dx_cols = x.columns.values[x.columns.str.startswith("dx")]
x[dx_cols] = x[dx_cols].astype("category")
x.dtypes

In [None]:
# Hold out a test set.
# random_state fixes the shuffle so a Restart & Run All reproduces the same split;
# stratify = y keeps the class proportions of the target equal in both parts, matching
# the stratified folds used for cross-validation later in the notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, random_state = 42)

In [None]:
# Print the target shapes of both splits, then preview the training features.
print(y_train.shape, y_test.shape)
x_train.head()

In [None]:
# Base LightGBM settings used for every variable that needs imputing. The commented
# entries are candidate tuning ranges that are currently switched off.
impute_grid_template = {
    # "boosting" : "gbdt",
    "objective" : "poisson"
    # "num_iterations" : (25, 1000),
    # "max_depth" : (1, 10),
    # "num_leaves" : (4, 25),
    # "min_data_in_leaf" : (1, 15),
    # "min_sum_hessian_in_leaf" : (0, .1),
    # "min_gain_to_split" : (0, .1),
    # "bagging_fraction" : (.1, 1),
    # "feature_fraction" : 1,
    # "feature_fraction_bynode" : (.5, 1),
    # "learning_rate" : (1e-5, .1),
    # "cat_smooth" : (0, 25)
}

# Columns of x that contain at least one missing value.
keys = x.columns.values[x.isna().any()]

# One settings dict per such column; columns with "dx" in their name get a binary objective.
impute_grid = {
    col: (
        {**impute_grid_template, "objective": "binary"}
        if "dx" in col
        else dict(impute_grid_template)
    )
    for col in keys
}


In [None]:
# Display the per-variable imputation settings.
impute_grid

In [None]:
# optimal_parameters, losses = impute_kernel.tune_parameters(
#   #variables = keys,
#   variable_parameters = impute_grid,
#   dataset = 0
# )

In [None]:
#imputed = impute_kernel.complete_data(dataset=0)[keys]

In [None]:
# miss = x_train[keys]

# fig, axes = plt.subplots(2,4,figsize=(12, 6))
# for i, ax in enumerate(axes.ravel()):
#     miss.plot.hist(column = miss.columns.values[i], ax = ax, bins = 30)

In [None]:
# Work on a copy of miceforest's default mean-matching scheme and set its number of
# mean-match candidates to 5.
mean_match = mean_match_default.copy()
mean_match.set_mean_match_candidates(5)

# Imputation kernel built on the training features with a single dataset.
# NOTE(review): impute_kernel is only referenced by commented-out cells in this part of
# the notebook — confirm whether it is still needed.
impute_kernel = mf.ImputationKernel(x_train, mean_match_scheme=mean_match, datasets=1)

In [None]:
import warnings
class mice_imputer():
    """
    Scikit-learn style transformer wrapping miceforest's ``ImputationKernel``.

    ``fit`` builds a kernel on the data it receives and runs ``mice`` on it;
    ``transform`` calls ``impute_new_data`` on whatever data it is given and returns
    the completed copy. The imputer can therefore be fitted on a training fold and
    applied to a validation fold inside an sklearn ``Pipeline`` / ``GridSearchCV``.

    Parameters
    ----------
    variable_parameters : dict or None
        Forwarded unchanged to ``ImputationKernel.mice(variable_parameters=...)``.
    **kwargs
        Each keyword is routed by name into one of three groups:

        * ``lgb_args``  - LightGBM parameters (names in ``_LGB_PARAM_NAMES``),
          forwarded to ``ImputationKernel.mice``;
        * ``inst_args`` - arguments of the ``ImputationKernel`` constructor;
        * ``mice_args`` - named arguments of ``ImputationKernel.mice``.

        Keywords matching none of the groups are ignored with a warning and are
        listed in ``self.invalid``.

    Notes
    -----
    ``get_params`` / ``set_params`` expose exactly the routed keywords plus
    ``variable_parameters``. ``set_params`` re-routes its values into the three
    groups, so parameters set by a grid search actually reach ``fit``.

    Defining this class replaces ``warnings.formatwarning`` process-wide with a
    compact one-line formatter.
    """

    # LightGBM parameter names recognised in **kwargs.
    _LGB_PARAM_NAMES = frozenset({
        "num_iterations", "learning_rate", "num_leaves",
        "max_depth", "min_data_in_leaf", "min_sum_hessian_in_leaf",
        "bagging_fraction", "colsample_bytree", "colsample_bynode",
        "lambda_l1", "lambda_l2", "min_split_gain", "cat_smooth"})

    def __init__(self, variable_parameters = None, **kwargs):
        self.all_kwargs = dict(kwargs)
        self.variable_parameters = variable_parameters
        # Argument names are looked up once here so set_params can re-route later
        # without inspecting miceforest again.
        self._inst_names = set(inspect.getfullargspec(mf.ImputationKernel).args)
        self._mice_names = set(inspect.getfullargspec(mf.ImputationKernel.mice).args)
        self.kern = []
        self.__route_kwargs()

    def __arg_intersect(self, kwargs_dict, right, right_fn = True):
        """Return the entries of ``kwargs_dict`` whose key is in ``right``.

        ``right`` is a callable whose argument names are used (``right_fn = True``)
        or an iterable of names (``right_fn = False``).
        """
        right = inspect.getfullargspec(right).args if right_fn else right
        inter = kwargs_dict.keys() & right
        out_dict = {key: kwargs_dict[key] for key in inter}

        return out_dict

    def __route_kwargs(self):
        """Rebuild ``lgb_args`` / ``inst_args`` / ``mice_args`` / ``invalid`` from ``all_kwargs``."""
        self.lgb_args = self.__arg_intersect(self.all_kwargs, self._LGB_PARAM_NAMES, right_fn = False)
        self.inst_args = self.__arg_intersect(self.all_kwargs, self._inst_names, right_fn = False)
        self.mice_args = self.__arg_intersect(self.all_kwargs, self._mice_names, right_fn = False)

        self.invalid = set(self.all_kwargs.keys()).difference(set(self.inst_args.keys()).union(set(self.lgb_args.keys()), set(self.mice_args.keys())))

        if len(self.invalid) > 0:
            warnings.warn("Invalid **kwargs will be ignored:{}".format(self.invalid))

    def __warn_clean(message, category, filename, lineno, file = None, line = None):
        # One-line warning format: "<file>:<line> <Category>: <message>".
        return ("%s:%s %s: %s\n") % (filename, lineno, category.__name__, message)

    # Executed once, when the class body runs: installs the formatter globally.
    warnings.formatwarning = __warn_clean

    def __merge_dict(self, *args):
        """Merge the given dicts left to right into a new dict."""
        base = dict()
        for i in args:
            base.update(i)
        return base

    def get_params(self, deep = True):
        """Return every routed keyword plus ``variable_parameters``.

        ``deep`` is accepted for sklearn compatibility and is not used.
        ``variable_parameters`` is included so that rebuilding the object from
        these parameters (as ``sklearn.base.clone`` does) keeps it.
        """
        params = self.__merge_dict(self.lgb_args, self.inst_args, self.mice_args)
        params["variable_parameters"] = self.variable_parameters
        return params

    def set_params(self, **parameters):
        """Update parameters and re-route them so the next ``fit`` uses them.

        ``variable_parameters`` is stored on its own attribute; every other name is
        merged into ``all_kwargs`` and routed exactly as in ``__init__`` (unknown
        names are ignored with a warning). Returns ``self``.
        """
        for parameter, value in parameters.items():
            if parameter == "variable_parameters":
                # Kept out of mice_args: fit passes it to mice() explicitly.
                self.variable_parameters = value
            else:
                self.all_kwargs[parameter] = value
        self.__route_kwargs()
        return self

    def fit(self, X, y = None):
        """Build an ``ImputationKernel`` on ``X`` and run ``mice`` on it. ``y`` is ignored."""
        # save_models defaults to 2 here; a caller-supplied value in inst_args takes
        # precedence instead of colliding as a duplicate keyword argument.
        kernel_args = {"save_models": 2, **self.inst_args}
        self.kern = mf.ImputationKernel(X, **kernel_args)
        self.kern.mice(variable_parameters = self.variable_parameters, **self.mice_args, **self.lgb_args)
        return self

    def transform(self, X, y = None):
        """Impute ``X`` with the fitted kernel and return the completed data. ``y`` is ignored."""
        return self.kern.impute_new_data(X, copy_data = True).complete_data(inplace = False)

    def fit_transform(self, X, y = None):
        """Fit on ``X`` and return ``X`` imputed by the freshly fitted kernel."""
        return self.fit(X).transform(X)


In [None]:
# 7-fold stratified CV. Shuffling is seeded so the fold assignment, and therefore the
# cross-validation scores, are reproducible across runs.
folds = StratifiedKFold(7, shuffle = True, random_state = 42)

In [None]:
# LightGBM classifier with the library's default hyperparameters.
clf = LGBMClassifier()

In [None]:
# Two-step pipeline: impute missing values with the mice_imputer wrapper, then classify.
# Because imputation is a pipeline step, it is re-fitted on the training part of every
# CV fold rather than on the full training set.
pipe = Pipeline([
    ("imputer", mice_imputer()),
    ("classifier", clf) 
])

In [None]:
# Grid-search space. The "imputer__" prefix addresses the pipeline step named "imputer",
# so each value is handed to that step's set_params as num_iterations.
grid = {"imputer__num_iterations" : [1, 2]}

In [None]:
# Exhaustive search over `grid` using the pipeline and the CV splitter defined above.
gcv = GridSearchCV(
    estimator = pipe,
    param_grid = grid, 
    scoring = "recall",          # candidates are ranked by recall
    refit = True,                # refit the best candidate on all the data passed to fit
    cv = folds,
    return_train_score = True,   # also record scores on the training folds
    n_jobs = 1                   # run the fits sequentially in this process
)

In [None]:
# Run the grid search; the one-column target frame is passed as a 1-D array.
gcv.fit(x_train, y_train.to_numpy().ravel())