In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Source data: CSV download of the Kaggle dataset
# https://www.kaggle.com/datasets/jacquelineguzman/claims-of-medical-billing/
file_cvs = "claims.csv"
# Load the comma-separated, UTF-8 encoded file into a DataFrame
df = pd.read_csv(file_cvs, sep=",", encoding="utf-8")

In [3]:
# Name of the target column (CLAIM); every other column is a feature
target = "CLAIM"
columns = df.columns
x_columns = columns.drop([target])

In [4]:
# Downcast 64-bit columns to 32-bit (float64 -> float32, int64 -> int32) to use less RAM.
# np.int32 is spelled out because plain `int` maps to a platform-dependent width
# (64-bit on most systems), which would leave the integer columns un-shrunk.
# NOTE(review): assumes every non-float column holds integers that fit in int32;
# out-of-range values would wrap silently -- confirm against the data.
float_columns = ['QUANTITY_PRODUCT_SERVICE', 'SALES_PRICE', 'INVOICED_PRICE']
int_columns = columns.drop(float_columns)
# One astype call with a per-column mapping converts the whole frame in a single pass.
dtype_map = {
    **dict.fromkeys(float_columns, np.float32),
    **dict.fromkeys(int_columns, np.int32),
}
df = df.astype(dtype_map)

In [5]:
# Min-max scale the feature columns to the [0, 1] range.
# The target column is kept out of the scaler: it holds class labels, and rescaling
# labels could turn them into fractional values that are no longer valid classes.
# NOTE(review): the scaler is fitted on all rows, before the train/validation split
# done later in this notebook, so validation rows influence the fitted min/max.
scaler = MinMaxScaler()
scaled_features = pd.DataFrame(
    scaler.fit_transform(df[x_columns]), columns=x_columns, index=df.index
)
# Put the untouched target back at its original position so the column order still matches `columns`.
scaled_features.insert(columns.get_loc(target), target, df[target])
df = scaled_features

In [6]:
# Separate the label column (ValY) from the remaining feature columns (ValX)
ValY = df[target]
ValX = df.drop(target, axis="columns")

In [7]:
# Hold out 15% of the rows for validation; rows are shuffled with a fixed seed
X_train, X_validation, Y_train, Y_validation = train_test_split(
    ValX,
    ValY,
    test_size=0.15,
    shuffle=True,
    random_state=1,
)
# Show the shapes of both splits as the cell output
X_train.shape, X_validation.shape

((5188776, 53), (915667, 53))

In [8]:
# Resample the minority class with the SMOTE strategy (over-sampling).
# Only the training split is resampled; the validation split keeps its original
# class distribution so evaluation reflects the real data.
sm = SMOTE(sampling_strategy='minority', random_state=42)
# Apply the sampler: constructing SMOTE alone has no effect on the data, so
# fit_resample is required for the models below to train on the resampled set.
# NOTE(review): this can be slow and memory-hungry on a training set of this size.
X_train, Y_train = sm.fit_resample(X_train, Y_train)

In [None]:
# Stacked ensemble: a small random forest and a k-NN classifier as base learners,
# combined by a logistic-regression meta-learner.
# random_state is fixed on the forest so repeated runs build the same trees.
estimators = [('rf', RandomForestClassifier(n_estimators=3, max_depth=10, random_state=42)),
              ('knn', KNeighborsClassifier(n_neighbors=5))]
meta_estimator = LogisticRegression()
stacking = StackingClassifier(estimators=estimators, final_estimator=meta_estimator)

In [None]:
# Train the stacked ensemble on the training split
stacking.fit(X=X_train, y=Y_train)

In [None]:
# Standalone random forest classifier (5 trees, depth capped at 10).
# random_state is fixed so the fitted trees are reproducible across runs.
RFC_model = RandomForestClassifier(n_estimators=5, max_depth=10, random_state=42)
RFC_model.fit(X_train, Y_train)

In [None]:
# Self-contained stacking example on synthetic data.
# NOTE: this cell rebinds X_train, estimators, meta_estimator and stacking,
# replacing the objects of the same names created in earlier cells.

# Base learners plus the meta-model that combines their predictions
estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=5)),
]
meta_estimator = LogisticRegression()
stacking = StackingClassifier(estimators=estimators, final_estimator=meta_estimator)

# Generate a synthetic two-class problem: 1000 samples, 10 features
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=42)
# Keep 30% of the samples aside for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit on the training part, then predict and score on the held-out part
stacking.fit(X_train, y_train)
y_pred = stacking.predict(X_test)
accuracy = stacking.score(X_test, y_test)
print("Accuracy:", accuracy)