In [49]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
import pickle

In [51]:
dfs_dir = "dfs"
df_names = ["normal", "minmax", "log", "zscore"]

# One frame per entry of df_names, read from <dfs_dir>/df_<name>.csv and kept
# in the same order as df_names.
dfs = []
for name in df_names:
    csv_path = os.path.join(dfs_dir, f"df_{name}.csv")
    dfs.append(pd.read_csv(csv_path))

# One pickled encoder per frame, looked up by position (encoder_0.pkl, ...).
# NOTE: pickle.load can execute arbitrary code stored in the file, so these
# encoder files must come from a trusted source.
encoders = []
for i in range(len(dfs)):
    encoder_path = f"encoders/encoder_{i}.pkl"
    with open(encoder_path, 'rb') as file:
        encoder = pickle.load(file)
    encoders.append(encoder)


In [5]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The already-fitted estimator to evaluate.
    X_test, y_test :
        Held-out features and their true labels.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``specifity`` (spelling kept as-is because callers read this key)
        and ``confusion_matrix``.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2 (the unpacking below assumes a
        two-class problem).
    """
    predictions = model.predict(X_test)

    # Each scorer is called with its default settings; for multiclass targets
    # an explicit average (e.g. 'macro') would have to be passed instead.
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    metrics = {label: scorer(y_test, predictions) for label, scorer in scorers}

    conf_matrix = confusion_matrix(y_test, predictions)

    # First row holds the true-negative / false-positive counts; specificity
    # is the share of actual negatives that were predicted negative.
    (tn, fp), (_fn, _tp) = conf_matrix
    metrics['specifity'] = tn / (tn + fp)
    metrics['confusion_matrix'] = conf_matrix

    return metrics

In [39]:
def train_model(model, X, y, test_size = 0.3, param_grid = None, n_iter = 32, cv = 5, scoring = "f1", verbose =2, random_state = None):
    """Split the data, fit ``model`` (optionally with a Bayesian search) and report test metrics.

    Parameters
    ----------
    model : estimator
        Estimator to fit. It is fitted in place when ``param_grid`` is None,
        otherwise it is used as the base estimator of the search.
    X, y :
        Features and target; they are split here into train and test parts.
    test_size : float, default 0.3
        Passed to ``train_test_split``.
    param_grid : dict or None, default None
        Search space handed to ``BayesSearchCV`` as ``search_spaces``.
        When None no search is run and ``model`` is fitted directly.
    n_iter, cv, scoring, verbose :
        Forwarded to ``BayesSearchCV``; ignored when ``param_grid`` is None.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split`` and ``BayesSearchCV`` so a run
        can be reproduced. None keeps the previous unseeded behaviour. The
        estimator's own seed is not touched.

    Returns
    -------
    tuple
        ``(fitted_model, metrics)`` where ``metrics`` is the dict produced by
        ``evaluate_model`` on the test split. Each metric is also printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if param_grid is None:
        # No search space given: plain fit of the estimator that was passed in.
        model.fit(X_train, y_train)
        fitted_model = model
    else:
        bayes_search = BayesSearchCV(estimator=model, search_spaces=param_grid,
                                     n_iter=n_iter, cv=cv, n_jobs=-1, verbose=verbose,
                                     scoring=scoring, random_state=random_state)
        bayes_search.fit(X_train, y_train)
        fitted_model = bayes_search.best_estimator_

    # Evaluation and reporting are identical for both paths.
    metrics = evaluate_model(fitted_model, X_test, y_test)

    for metric, value in metrics.items():
        print(f"{metric}: {value}")

    return fitted_model, metrics
    

In [40]:
def find_best_features_random_forest(df, param_grid= None, scoring="f1", threshold=0.025):
    """Rank features with a random forest and keep those above an importance cutoff.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``health_ins``; every other column is
        used as a feature.
    param_grid : dict or None, default None
        Forwarded to ``train_model`` (None means no hyper-parameter search).
    scoring : str, default "f1"
        Forwarded to ``train_model``.
    threshold : float, default 0.025
        Minimum ``feature_importances_`` value a feature needs to be kept.

    Returns
    -------
    list of (str, float)
        ``(feature_name, importance)`` pairs, in the column order of ``df``,
        for the features whose importance is ``>= threshold``.
    """
    X = df.drop(columns=['health_ins'])
    y = df['health_ins']

    # train_model also returns the metrics dict; only the fitted forest is needed here.
    rfc_model, _ = train_model(model=RandomForestClassifier(), X=X, y=y,
                               param_grid=param_grid, scoring=scoring)

    # Pair every column with its importance score.
    pair_features_scores = dict(zip(X.columns, rfc_model.feature_importances_))

    final_features = [
        (feature, score)
        for feature, score in pair_features_scores.items()
        if score >= threshold
    ]

    return final_features

In [44]:
def find_best_features_pca(df, n_components):
    """Replace the feature columns of ``df`` with their principal components.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``health_ins``; every other column is
        fed to the PCA.
    n_components :
        Passed straight to ``PCA(n_components=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``PCA0 .. PCA<k-1>`` followed by ``health_ins``, indexed like
        ``df``.
    """
    X = df.drop(columns=['health_ins'])
    y = df['health_ins']

    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(X)

    # Name the columns from the transformed array itself, so the names always
    # match the number of components actually produced.
    pca_names = [f"PCA{i}" for i in range(principal_components.shape[1])]

    # Reuse the input index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign with y (and create NaN rows) whenever df
    # does not carry the default RangeIndex.
    pca_df = pd.DataFrame(data=principal_components, columns=pca_names, index=X.index)

    new_df = pd.concat([pca_df, y], axis=1)

    return new_df

In [45]:
def apply_smote(df, test_size=0.3, random_state=None):
    """Oversample ``df`` with SMOTE and return the resampled features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``health_ins``; every other column is
        treated as a feature.
    test_size : float, default 0.3
        Not used by this function; kept only so existing calls keep working.
    random_state : int or None, default None
        Seed passed to ``SMOTE`` so the synthetic samples can be reproduced.
        None keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_sm, y_sm)`` as returned by ``SMOTE.fit_resample``.

    Notes
    -----
    The whole frame is resampled. If the result is split into train and test
    parts afterwards, the test part can contain synthetic samples.
    """
    X = df.drop(columns=['health_ins'])
    y = df['health_ins']

    sm = SMOTE(random_state=random_state)
    X_sm, y_sm = sm.fit_resample(X, y)
    return X_sm, y_sm
    
    

# First Dataset


In [46]:
# Project the first frame (dfs[0]) onto 7 principal components; the
# health_ins column is carried over next to them.
new_df_0 = find_best_features_pca(df=dfs[0], n_components=7)
new_df_0.head()

Unnamed: 0,PCA0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,health_ins
0,-20633.575991,-2224.764775,167.31417,-0.4971,-2.463672,-0.025793,0.982364,True
1,-19433.590909,-2225.423261,-42.791875,2.50537,-2.427333,0.386913,-0.534436,True
2,-21633.587302,-2223.574189,-2.597009,-0.507951,0.45308,-1.324238,-0.482524,True
3,-4863.588251,-2238.384712,76.212905,-1.501036,-1.108795,-0.768835,-0.668115,True
4,-3633.596981,-2239.236848,-43.901306,-1.500479,-0.550156,1.058972,0.785114,True


## Apply SMOTE

In [47]:
# Oversample the whole PCA frame with SMOTE.
# NOTE(review): this runs before train_model's internal train/test split, so
# the held-out rows can include synthetic samples and the reported scores are
# likely optimistic. Consider resampling the training split only.
X_sm, y_sm = apply_smote(df=new_df_0)

## Train forest

In [48]:
rfc_model = RandomForestClassifier(n_estimators=100)

# train_model returns a (fitted_model, metrics) pair; unpack it so rfc_model
# stays an estimator instead of being rebound to the whole tuple.
rfc_model, rfc_metrics = train_model(model = rfc_model, X = X_sm, y = y_sm)


ok
accuracy: 0.9243083618277899
precision: 0.9151358736906027
recall: 0.9355890113301257
f1_score: 0.9252494244052187
specifity: 0.9129961089494163
confusion_matrix: [[17598  1677]
 [ 1245 18084]]


## Kaggle

In [56]:
# Load the Kaggle frame and give it a health_ins column filled with NaN.
kaggle_df = pd.read_csv("dfs_kaggle/df_normal.csv").assign(health_ins=np.nan)
kaggle_df.head()

Unnamed: 0,is_employed,income,num_vehicles,age,code_column,rooms,recent_move_b,gas_payment,gas_bill,sex_Female,...,state_of_res_South Dakota,state_of_res_Tennessee,state_of_res_Texas,state_of_res_Utah,state_of_res_Virginia,state_of_res_Washington,state_of_res_West Virginia,state_of_res_Wisconsin,state_of_res_Wyoming,health_ins
0,False,28900.0,1.0,0.05911,653,6,False,0.0,20.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,False,40000.0,1.0,0.030914,404,5,True,0.0,40.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,False,203000.0,3.0,0.131355,1291,2,False,0.0,80.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,False,0.0,1.0,1.0,8962,2,False,0.0,30.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,False,40000.0,4.0,0.218322,2059,1,False,0.0,150.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
