In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

dataset = pd.read_excel("data.xlsx")

X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

X.columns = [
    "Age","gender","city","state","year","previous_claims",
    "vehicle_type","Customer_tenure","Policy_source","cp_code",
    "Occupation_Type","DIY_vs_CC_behavior","renewal_count","Claim_status"
]

# Binary encoding
binary_cols = ["gender","vehicle_type","DIY_vs_CC_behavior"]

binary_encoders = {}

for col in binary_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    binary_encoders[col] = le


#for col in binary_cols:
 #   X[col] = LabelEncoder().fit_transform(X[col])

# Embedding columns
embed_cols = ["city", "state", "Occupation_Type", "Policy_source"]

from sklearn.preprocessing import OrdinalEncoder

embed_encoders = {}

for col in embed_cols:
    oe = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=-1
    )
    X[[col]] = oe.fit_transform(X[[col]])
    embed_encoders[col] = oe

# Numeric scaling
numeric_cols = [
    "Age","year","previous_claims","Customer_tenure",
    "cp_code","Claim_status","renewal_count"
]

scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

y = LabelEncoder().fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [22]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Embedding, Dense, Flatten,
    Concatenate, Dropout, BatchNormalization
)
from tensorflow.keras.models import Model


In [23]:
inputs = []
embeddings = []

# Embeddings
for col in embed_cols:
    vocab_size = X[col].nunique() + 1
    embed_dim = min(50, vocab_size // 2)

    inp = Input(shape=(1,), name=f"{col}_input")
    emb = Embedding(vocab_size, embed_dim, name=f"{col}_emb")(inp)
    emb = Flatten()(emb)

    inputs.append(inp)
    embeddings.append(emb)

# Numeric & binary inputs
num_input = Input(shape=(len(numeric_cols) + len(binary_cols),), name="num_input")
inputs.append(num_input)

x = Concatenate()(embeddings + [num_input])

# Deep layers
x = Dense(128, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)

x = Dense(64, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)

x = Dense(32, activation="relu")(x)

output = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inputs, outputs=output)


In [24]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall")
    ]
)


In [25]:
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

class_weights = dict(enumerate(class_weights))


In [26]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_auc",
        patience=5,
        restore_best_weights=True,
        mode="max"
    )
]

history = model.fit(
    [
        X_train[col] for col in embed_cols
    ] + [X_train[numeric_cols + binary_cols].values],
    y_train,
    validation_split=0.1,
    epochs=50,
    batch_size=256,
    class_weight=class_weights,
    callbacks=callbacks
)


Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 28ms/step - auc: 0.8457 - loss: 0.4522 - precision: 0.6545 - recall: 0.7777 - val_auc: 0.9998 - val_loss: 0.2911 - val_precision: 0.9914 - val_recall: 0.9858
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - auc: 0.9982 - loss: 0.0497 - precision: 0.9848 - recall: 0.9892 - val_auc: 1.0000 - val_loss: 0.1486 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - auc: 0.9999 - loss: 0.0169 - precision: 0.9929 - recall: 0.9969 - val_auc: 1.0000 - val_loss: 0.0816 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - auc: 0.9999 - loss: 0.0116 - precision: 0.9929 - recall: 0.9973 - val_auc: 1.0000 - val_loss: 0.0441 - val_precision: 1.0000 - val_recall: 1.0000
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [37]:

raw_input = {
    "Age": 63,
    "gender": "M",
    "city": "Ranbirsinghpora",
    "state": "JAMMU & KASHMIR",
    "year": 2022,
    "previous_claims": 0,
    "vehicle_type": "N",
    "Customer_tenure": 1,
    "Policy_source": "Dealer",
    "cp_code": 21869,
    "Occupation_Type": "Self Employeed",
    "DIY_vs_CC_behavior": 0,
    "Claim_status": 0,
    "renewal_count": 2
}

import pandas as pd
import numpy as np

df = pd.DataFrame([raw_input])

# Binary encoding
# Standardize binary text values first
#for col, mapping in STANDARDIZE_MAP.items():
 #   df[col] = df[col].map(mapping)

# Encode binary columns
for col in binary_cols:
    df[col] = binary_encoders[col].transform(df[col])

# Encode embedding columns (SAFE for unseen)
for col in embed_cols:
    df[[col]] = embed_encoders[col].transform(df[[col]])

df[embed_cols] = df[embed_cols] + 1

# Scale numeric columns
df[numeric_cols] = scaler.transform(df[numeric_cols])

# Prepare model inputs
model_inputs = (
    [df[col].values for col in embed_cols] +
    [df[numeric_cols + binary_cols].values]
)

probability = model.predict(model_inputs)[0][0]

print("Predicted Probability:", probability)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 245ms/step
Predicted Probability: 0.6002482
