In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Testrenewals.csv", usecols=[
    "Gender", "Age", "Marital Status", "RTO Name", "Vehicle Type", "Vechile Age", 
    "IC Name", "Is Online", "Customer Type", "City Name", "PIN", "Premium Amount", 
    "State Name", "Policy Number", "Model Name", "Policy Source", "Frame Number", "Renewed", "Policy Type"
], low_memory=False)

X = df.drop("Renewed", axis=1)
y = df["Renewed"]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [4]:
num_cols = ["Age", "Premium Amount"]  # Add more if needed
cat_cols = [
    "Gender", "Marital Status", "RTO Name", "Vehicle Type", "Vechile Age", 
    "IC Name", "Is Online", "Customer Type", "City Name", "PIN", "State Name", 
    "Policy Number", "Model Name", "Policy Source", "Frame Number", "Policy Type"
]


In [5]:
num_imputer = SimpleImputer(strategy="mean")
X_train_num = pd.DataFrame(num_imputer.fit_transform(X_train[num_cols]), columns=num_cols)
X_test_num = pd.DataFrame(num_imputer.transform(X_test[num_cols]), columns=num_cols)

In [6]:
cat_imputer = SimpleImputer(strategy="most_frequent")
X_train_cat = pd.DataFrame(cat_imputer.fit_transform(X_train[cat_cols]), columns=cat_cols)
X_test_cat = pd.DataFrame(cat_imputer.transform(X_test[cat_cols]), columns=cat_cols)

In [7]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat_enc = pd.DataFrame(
    encoder.fit_transform(X_train_cat),
    columns=encoder.get_feature_names_out(cat_cols)
)
X_test_cat_enc = pd.DataFrame(
    encoder.transform(X_test_cat),
    columns=encoder.get_feature_names_out(cat_cols)
)

In [8]:
X_train_full = pd.concat([X_train_num.reset_index(drop=True), X_train_cat_enc.reset_index(drop=True)], axis=1)
X_test_full = pd.concat([X_test_num.reset_index(drop=True), X_test_cat_enc.reset_index(drop=True)], axis=1)

In [9]:
scaler = StandardScaler()
X_train_full[num_cols] = scaler.fit_transform(X_train_full[num_cols])
X_test_full[num_cols] = scaler.transform(X_test_full[num_cols])

In [10]:
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos
model = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42
)

In [11]:
model.fit(X_train_full, y_train)
y_pred = model.predict(X_test_full)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[264 190]
 [211 335]]
              precision    recall  f1-score   support

           0       0.56      0.58      0.57       454
           1       0.64      0.61      0.63       546

    accuracy                           0.60      1000
   macro avg       0.60      0.60      0.60      1000
weighted avg       0.60      0.60      0.60      1000



In [12]:
# After model training
feature_names = X_train_full.columns.tolist()

# Save them
import joblib
joblib.dump(feature_names, "xgb_feature_names.pkl")

# Load and reuse later
loaded_feature_names = joblib.load("xgb_feature_names.pkl")


In [22]:
# Load new data
df = pd.read_csv("C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Testing.csv")

# --- Imputation ---

num_cols = ["Age", "Premium Amount"]  # Add more if needed
cat_cols = [
    "Gender", "Marital Status", "RTO Name", "Vehicle Type", "Vechile Age", 
    "IC Name", "Is Online", "Customer Type", "City Name","PIN", "State Name", 
    "Policy Number", "Model Name", "Policy Source", "Frame Number", "Policy Type"
]

# Filter only the columns used during training
all_required_columns = num_cols + cat_cols
new_df = df[all_required_columns]

# Impute missing values
new_num = pd.DataFrame(num_imputer.transform(new_df[num_cols]), columns=num_cols)
new_cat = pd.DataFrame(cat_imputer.transform(new_df[cat_cols]), columns=cat_cols)

# Encode
new_cat_enc = pd.DataFrame(
    encoder.transform(new_cat),
    columns=encoder.get_feature_names_out(cat_cols)
)
# Predict probabilities
predicted_probs = model.predict_proba(new_full)[:, 1]
# Add to original DataFrame
new_df["Renewal_Probability"] = predicted_probs


# Separate features
new_num = pd.DataFrame(num_imputer.transform(new_df[num_cols]), columns=num_cols)
new_cat = pd.DataFrame(cat_imputer.transform(new_df[cat_cols]), columns=cat_cols)

# --- Encoding ---
new_cat_enc = pd.DataFrame(
    encoder.transform(new_cat),
    columns=encoder.get_feature_names_out(cat_cols)
)

# --- Combine numeric and encoded categorical ---
new_full = pd.concat([new_num.reset_index(drop=True), new_cat_enc.reset_index(drop=True)], axis=1)

# --- Scale numeric features ---
new_full[num_cols] = scaler.transform(new_full[num_cols])

# --- Predict ---
predictions = model.predict(new_full)
new_df["Predicted_Renewal"] = predictions

# See result
print(new_df.head())


   Age  Premium Amount Gender Marital Status     RTO Name Vehicle Type  \
0   48            1387      M              M  GARIYABANDH            O   
1   39            1391      F              M       Raipur            O   
2   47            1517      M              M       Raipur            O   
3   53            1302      M              M       Raipur            O   
4   35            1519      F              M       Raipur            O   

   Vechile Age                             IC Name Is Online Customer Type  \
0            7     United India Insurance Co. Ltd.         N             I   
1           10     United India Insurance Co. Ltd.         N             I   
2            8     United India Insurance Co. Ltd.         N             I   
3           12  Universal  Sompo general insurance         N             I   
4            7  Universal  Sompo general insurance         N             I   

     City Name     PIN   State Name         Policy Number  \
0  GARIYABANDH  492001  C

In [23]:
new_df.to_csv("C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Predicted.csv", index=False)


In [26]:
# Re-import libraries after code execution environment reset
import pandas as pd
import numpy as np

# Simulating the filtered DataFrame (would normally come from model predictions)
np.random.seed(42)
n_samples = 1000
new_df = pd.DataFrame({
    "Age": np.random.randint(18, 70, size=n_samples),
    "Premium Amount": np.random.randint(5000, 20000, size=n_samples),
    "Renewal_Probability": np.random.uniform(0.5, 1.0, size=n_samples)  # only above 50%
})

# Add Predicted_Renewal column (1 if prob >= 0.5)
new_df["Predicted_Renewal"] = (new_df["Renewal_Probability"] >= 0.5).astype(int)

# Filter those with Renewal_Probability >= 0.5
df_filtered = new_df[new_df["Renewal_Probability"] >= 0.5].copy()

# Define 5 clusters: 50-60, 60-70, 70-80, 80-90, 90-100
bins = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
labels = ["50-60%", "60-70%", "70-80%", "80-90%", "90-100%"]
df_filtered["Renewal_Cluster"] = pd.cut(df_filtered["Renewal_Probability"], bins=bins, labels=labels, include_lowest=True)

# Save to Excel 
df_filtered.to_csv("C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Clustered Renewals.csv", index=False)



In [25]:
import pandas as pd

# Define bins and labels
bins = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
labels = ["50-60%", "60-70%", "70-80%", "80-90%", "90-100%"]

# Filter rows with probability > 0.5
filtered_df = new_df[new_df["Renewal_Probability"] > 0.5].copy()

# Assign clusters
filtered_df['Renewal_Cluster'] = pd.cut(
    filtered_df['Renewal_Probability'],
    bins=bins,
    labels=labels,
    include_lowest=False
)

# Select desired columns
clustered_df = filtered_df[["Policy Number", "Renewal_Probability", "Renewal_Cluster"]]

output_path


KeyError: "['Policy Number'] not in index"

In [21]:
# Save to Excel
clustered_df.to_csv(
    "C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Clusters.csv",
    index=False
)

NameError: name 'clustered_df' is not defined

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Filter people with probability > 0.5
high_prob_df = new_df[new_df["Renewal_Probability"] > 0.5].copy()
high_prob_features = new_full.loc[high_prob_df.index]  # Match corresponding features

# Step 2: Perform clustering on the filtered feature set
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(high_prob_features)

# Step 3: Add cluster labels to the filtered DataFrame
high_prob_df["Cluster"] = clusters

# Step 4: View cluster counts
print(high_prob_df["Cluster"].value_counts())

# Step 5: Optional visualization
plt.figure(figsize=(10, 6))
sns.countplot(data=high_prob_df, x="Cluster", palette="Set2")
plt.title("Distribution of People Likely to Renew (Clustered)")
plt.xlabel("Cluster")
plt.ylabel("Number of Customers")
plt.show()


In [None]:
importances = model.feature_importances_
sorted_idx = np.argsort(importances)[::-1]

# Limit to top N most important features
top_n = 20
top_features = sorted_idx[:top_n]

plt.figure(figsize=(10, 6))
plt.barh(range(top_n), importances[top_features][::-1], align='center')
plt.yticks(range(top_n), X_train_full.columns[top_features][::-1])
plt.xlabel("Feature Importance")
plt.title("Top 20 XGBoost Feature Importances")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import numpy as np

# --- Load Data ---
new_df = pd.read_csv("C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Testing.csv", usecols=[
    "Gender", "Age", "Marital Status", "RTO Name", "Vehicle Type", "Vechile Age", 
    "IC Name", "Is Online", "Customer Type", "City Name", "PIN", "Premium Amount", 
    "State Name", "Policy Number", "Model Name", "Policy Source", "Frame Number", "Renewed", "Policy Type"
], low_memory=False)

# --- Impute Missing Values ---
new_num = pd.DataFrame(num_imputer.transform(new_df[num_cols]), columns=num_cols)
new_cat = pd.DataFrame(cat_imputer.transform(new_df[cat_cols]), columns=cat_cols)

# --- Encode Categorical Features ---
new_cat_enc = pd.DataFrame(
    encoder.transform(new_cat),
    columns=encoder.get_feature_names_out(cat_cols)
)

# --- Combine & Scale ---
new_full = pd.concat([new_num.reset_index(drop=True), new_cat_enc.reset_index(drop=True)], axis=1)
new_full[num_cols] = scaler.transform(new_full[num_cols])

# --- Predict Probabilities ---
# XGBClassifier.predict_proba gives a 2D array: [[P(class 0), P(class 1)], ...]
probabilities = model.predict_proba(new_full)

# --- Add Predicted Probability of Renewal ---
new_df["Renewal_Probability"] = probabilities[:, 1]  # Probability of class '1' (renewed)
new_df["Renewal_Probability"] = probabilities[:, 0]  # Probability of class '1' (renewed)


# --- Optional: Round for readability ---
new_df["Renewal_Probability"] = new_df["Renewal_Probability"].round(4)

# --- View results ---
print(new_df[["Renewal_Probability"]].head())


In [None]:
import matplotlib.pyplot as plt
import xgboost as xgb

xgb.plot_importance(model,max_num_features=20)
plt.show()

In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans

# Step 2: Load the dataset
df = pd.read_csv("C:/Users/aayushi.chowla/OneDrive - Hero Corporate Service Private Limited/Desktop/Renewal_data/Testing.csv")  # Replace with your actual file name

# Step 3: Clean + Select features (exclude ID-like columns)
features = [
    'Age']

df = df[features + ['Renewed']].dropna()

# Step 4: Encode Categorical
cat_cols = df.select_dtypes(include='object').columns.tolist()
num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_cols.remove('Renewed')  # target not included in clustering

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = pd.DataFrame(encoder.fit_transform(df[cat_cols]),
                           columns=encoder.get_feature_names_out(cat_cols))

# Step 5: Scale Numeric
scaler = StandardScaler()
scaled_num = pd.DataFrame(scaler.fit_transform(df[num_cols]), columns=num_cols)

# Step 6: Combine all features
X = pd.concat([scaled_num.reset_index(drop=True), encoded_cat.reset_index(drop=True)], axis=1)

# Step 7: Apply KMeans
kmeans = KMeans(n_clusters=2, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

# Step 8: Visualize Average Feature Values by Cluster (bar plots)
# We'll use numeric + one-hot-encoded column averages
cluster_profiles = pd.DataFrame(X)
cluster_profiles['Cluster'] = df['Cluster']
avg_features = cluster_profiles.groupby('Cluster').mean().T

# Plot top 15 most different features by variance between clusters
top_diff_features = avg_features.var(axis=1).sort_values(ascending=False).head(15).index

avg_features = avg_features.loc[top_diff_features]

# Plotting
plt.figure(figsize=(14, 8))
avg_features.plot(kind='bar', figsize=(15, 6))
plt.title("Top Feature Differences by Cluster")
plt.ylabel("Mean Value (Scaled)")
plt.xticks(rotation=90)
plt.grid(axis='y')
plt.tight_layout()
plt.legend(title="Cluster")
plt.show()
