In [20]:
import pandas as pd
import numpy as np
import sklearn 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import warnings
warnings.filterwarnings('ignore')
np.random.seed(32)
from sklearn.model_selection import train_test_split
from scipy.stats import chi2_contingency
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [107]:
# Load the pickled training split; the file is unpacked into a (features, target) pair.
# NOTE(review): pickle.load can execute arbitrary code, so only open files from a trusted source.
with open("train.pkl", mode="rb") as train_file:
    X_train, y_train = pickle.load(train_file)

In [108]:
# Quick sanity check of the loaded training features.
# NOTE: a notebook cell renders only its last expression, so the head() preview
# below is evaluated but not shown; the visible cell output is the row count.
X_train.head()
len(X_train)

49000

In [109]:
class SmokingImputer(BaseEstimator, TransformerMixin):
    """Replace 'No Info' entries of ``smoking_history`` by random sampling.

    ``fit`` records the empirical (normalised) frequency of the known
    categories; ``transform`` overwrites every 'No Info' value with a category
    drawn from that distribution. Sampling uses NumPy's global random state
    (``np.random.choice``), so results are reproducible after ``np.random.seed``.

    Parameters
    ----------
    include_never : bool, default=False
        If False (the historical behaviour), 'never' is excluded from the
        categories that may be drawn. If True, 'never' may be drawn as well.

    Attributes
    ----------
    distribution_without_never : pandas.Series or None
        Relative frequencies of all categories except 'No Info' and 'never'.
        ``None`` until ``fit`` has been called.
    distribution_with_never : pandas.Series or None
        Relative frequencies of all categories except 'No Info'.
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, include_never=False):
        # scikit-learn convention: store constructor arguments unmodified under
        # the same name so that get_params() / clone() keep working.
        self.include_never = include_never
        self.distribution_with_never = None
        self.distribution_without_never = None

    def fit(self, X, y=None):
        """Learn both sampling distributions from ``X['smoking_history']``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``smoking_history`` column.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        smoking = X['smoking_history']
        self.distribution_without_never = smoking[
            ~smoking.isin(['No Info', 'never'])
        ].value_counts(normalize=True)
        self.distribution_with_never = smoking[
            ~smoking.isin(['No Info'])
        ].value_counts(normalize=True)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'No Info' smoking values imputed.

        The input frame is left untouched; the imputation is applied to a copy.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``smoking_history`` column.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which every 'No Info' entry of ``smoking_history``
            has been replaced by a randomly drawn category.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit`` (subclass of AttributeError/ValueError).
        ValueError
            If values need imputing but the fitted distribution is empty.
        """
        if self.include_never:
            distribution = self.distribution_with_never
        else:
            distribution = self.distribution_without_never

        if distribution is None:
            # Local import: the file imports sklearn but not this exception class.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "SmokingImputer is not fitted yet; call fit() before transform()."
            )

        # Work on a copy so the caller's frame is never modified in place.
        X = X.copy()
        missing = X['smoking_history'] == 'No Info'
        n_missing = int(missing.sum())
        if n_missing == 0:
            return X

        if distribution.empty:
            raise ValueError(
                "Cannot impute 'No Info' values: no known smoking_history "
                "categories were observed during fit()."
            )

        X.loc[missing, 'smoking_history'] = np.random.choice(
            distribution.index,
            size=n_missing,
            p=distribution.values,
        )
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the name of the single output column."""
        return ['smoking_history']


In [110]:
# --- Column groups -----------------------------------------------------------
# Columns removed outright: 'year', 'location' and every column whose name
# starts with 'race'.
race_cols = [name for name in X_train.columns if name.startswith('race')]
drop_cols = ['year', 'location', *race_cols]

bin_features = ['hypertension', 'heart_disease']
num_features = ['age', 'bmi', 'hbA1c_level', 'blood_glucose_level']
# Smoking history and the notes column are not part of this list.
cat_features = ['gender']

# --- Encoders / imputer ------------------------------------------------------
# Fixed category lists; values outside them are ignored (handle_unknown='ignore').
gender_encoder = OneHotEncoder(
    categories=[['Male', 'Female']],
    drop=None,
    handle_unknown='ignore',
)
smoking_encoder = OneHotEncoder(
    categories=[['never', 'ever', 'current', 'not current', 'former']],
    drop=None,
    handle_unknown='ignore',
)

smoking_imputer = SmokingImputer()

# --- Stage 1: drop, scale, encode gender, impute smoking history -------------
# The smoking column is only imputed here; its one-hot encoding happens in
# stage 2, which picks the imputed column up by its stage-1 output name.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('drop', 'drop', drop_cols),
        ('scaler', StandardScaler(), num_features),
        ('genderEncoder', gender_encoder, cat_features),
        ('smokingImputer', smoking_imputer, ['smoking_history']),
    ],
)
# TODO: maybe merge both stages into a single pipeline someday

steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=steps)
X_train_transformed = pipeline.fit_transform(X_train)

# Rebuild a DataFrame with the stage-1 output names so that stage 2 can select
# the imputed column by name.
columns = preprocessor.get_feature_names_out()
X_train_transformed_df = pd.DataFrame(data=X_train_transformed, columns=columns)

# --- Stage 2: one-hot encode the imputed smoking history ---------------------
preprocessor2 = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('smokingEncoder', smoking_encoder, ['smokingImputer__smoking_history']),
    ],
)
pipeline2 = Pipeline(steps=[('preprocessor2', preprocessor2)])
# Note: X_train_transformed is rebound here to the stage-2 result.
X_train_transformed = pipeline2.fit_transform(X_train_transformed_df)
print(X_train_transformed[:5])

[[0.0 0.0 0.0 0.0 1.0 -1.5046162719441207 -1.1224795416193598
  0.25410474332526123 0.41847133475057363 0.0 1.0 0 0
  'Young patient, generally lower risk but needs lifestyle assessment. Healthy BMI range. Elevated blood glucose levels, potential diabetes concern. History of smoking, potential lung and vascular health impact. Consideration for metabolic syndrome and Type 2 diabetes.']
 [0.0 0.0 0.0 0.0 1.0 -0.6594133233661283 -0.7983636052038867
  -0.49593523546697077 0.05062294505772955 0.0 1.0 0 0
  'Healthy BMI range. Elevated blood glucose levels, potential diabetes concern. History of smoking, potential lung and vascular health impact. Consideration for metabolic syndrome and Type 2 diabetes.']
 [0.0 1.0 0.0 0.0 0.0 -1.771522466231908 -1.2913213317520718
  -1.433485208957261 0.5410874646481884 0.0 1.0 0 0
  'Young patient, generally lower risk but needs lifestyle assessment. Healthy BMI range. Elevated blood glucose levels, potential diabetes concern. History of smoking, potential