In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
file_path = '/content/Health_Plan_Prior_Authorization_Data.csv'
data = pd.read_csv(file_path)

# Verify that the 'Approval rate' column exists
if 'Approval rate' not in data.columns:
    raise KeyError("The 'Approval rate' column is missing from the dataset.")

# Define the target variable
target = 'Approval rate'

# Identify categorical and numeric columns
categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = data.select_dtypes(include=['number']).columns.tolist()

# Assuming 'Index' is not a feature and just a row identifier, we remove it from numeric_cols
if 'Index' in numeric_cols:
    numeric_cols.remove('Index')

# Fill missing values for numeric columns with median
for col in numeric_cols:
    if data[col].isnull().any():
        data[col].fillna(data[col].median(), inplace=True)

# For categorical columns, fill missing values with the mode (most frequent value)
for col in categorical_cols:
    if data[col].isnull().any():
        data[col].fillna(data[col].mode()[0], inplace=True)

# Create dummy variables for categorical columns
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data[target] = data[target].apply(lambda x: 1 if x > 0.5 else 0)

# Drop the 'Index' column if it is present
data = data.drop(columns=['Index'], errors='ignore')  # Use errors='ignore' to avoid KeyError if the column doesn't exist

# Split the data
X = data.drop(columns=[target])
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(solver='saga', max_iter=1000, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
accuracy = accuracy_score(y_test, pipeline.predict(X_test))
print(f"Accuracy on the test set: {accuracy}")

# Save the trained model
joblib.dump(pipeline, 'insurance_claim_predictor_model.pkl')

# Save the feature names
joblib.dump(X_train.columns.tolist(), 'feature_names.pkl')

# Calculate and save the feature importances (coefficients) for logistic regression
coefficients = pipeline.named_steps['classifier'].coef_[0]
feature_importances = pd.Series(coefficients, index=X_train.columns)
joblib.dump(feature_importances, 'feature_importances.pkl')

Accuracy on the test set: 0.9057337220602527


['feature_importances.pkl']

In [2]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.33.0-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.0b1-py2.py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.0-py3-none-manylinux2014_x86_64.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4

In [3]:
! wget -q -O - ipv4.icanhazip.com

35.226.71.158


In [None]:

! streamlit run app.py & npx localtunnel --port 8501

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col] = 0
  input_df[col]