In [None]:
# Step 2: Set up Kaggle API credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 3: Install the Kaggle library (if not already installed)
!pip install kaggle

# Step 4: Download the Customer Life Time Value dataset
!kaggle datasets download shibumohapatra/customer-life-time-value

# Step 5: Extract the downloaded zip file
import zipfile
with zipfile.ZipFile("customer-life-time-value.zip", 'r') as zip_ref:
    zip_ref.extractall("customer-life-time-value")

# Step 6: List the extracted files to confirm 'train_BRCpofr.csv' is present
!ls customer-life-time-value

# Step 7: Load the 'train_BRCpofr.csv' file into a Pandas DataFrame
import pandas as pd
ds = pd.read_csv("customer-life-time-value/train_BRCpofr.csv")


In [None]:
# Dimensions of the loaded frame as (rows, columns).
ds.shape

In [None]:
# Scratch arithmetic: evaluates (1.6 - 1.8) / 1.6, i.e. about -0.125.
# NOTE(review): not tied to any dataset value in the visible code — consider removing.
(1.6-1.8)/1.6

In [None]:
# Ten randomly selected rows; no random_state is passed, so the rows differ per run.
ds.sample(10)

In [None]:
# Per-column dtype and non-null count summary.
ds.info()

In [None]:
# Summary statistics using pandas' default describe() settings.
ds.describe()

In [None]:
# Column labels, for reference when selecting columns in the cells below.
ds.columns

In [None]:
# Bar chart of the number of rows per distinct `gender` value.
ds['gender'].value_counts().plot(kind='bar')

In [None]:
# Bar chart of the number of rows per distinct `area` value.
ds['area'].value_counts().plot(kind='bar')

In [None]:
# Row counts per `qualification` value.
ds['qualification'].value_counts()

In [None]:
# Row counts per `income` bracket.
ds['income'].value_counts()

In [None]:
# Row counts per `marital_status` value.
ds['marital_status'].value_counts()

In [None]:
# Bar chart of the number of rows per distinct `vintage` value.
ds['vintage'].value_counts().plot(kind='bar')

In [None]:
# Kernel density estimate of `claim_amount`.
ds['claim_amount'].plot(kind='kde')

In [None]:
# Shape of the subset whose `claim_amount` is exactly 0 (first element = number of such rows).
ds[ds['claim_amount']==0].shape

In [None]:
# Mean claim amount over rows with a non-zero claim only.
ds[ds['claim_amount']!=0]['claim_amount'].mean()

In [None]:
# Median claim amount over rows with a non-zero claim only.
ds[ds['claim_amount']!=0]['claim_amount'].median()

In [None]:
# Row counts per `num_policies` value.
ds['num_policies'].value_counts()

In [None]:
# Row counts per `policy` value.
ds['policy'].value_counts()

In [None]:
# Pie chart of `type_of_policy` shares; each slice is labelled with its percentage to one decimal.
ds['type_of_policy'].value_counts().plot(kind='pie',autopct='%1.1f%%')

In [None]:
# Summary statistics of `cltv`.
ds['cltv'].describe()

In [None]:
# Kernel density estimate of `cltv`.
ds['cltv'].plot(kind='kde')

In [None]:
# Column labels again, for reference.
ds.columns

In [None]:
# Goal: convert the `income` column from a range label into a single number.
# Check how income relates to the other columns in order to estimate a representative value within each range.
# 1. num_policies appears to be directly correlated with income.
# 2. The 5L-10L bracket holds about 60% of all males versus about 57% of all females,
#    whereas the bracket for income below 2 LPA holds about 26% of females versus about 16% of males.

In [None]:
# Counts of each `num_policies` value among rows whose income is 'More than 10L'.
ds[ds['income']=='More than 10L'][['num_policies']].value_counts()

In [None]:
# Thirty random (income, num_policies) pairs for eyeballing; unseeded, so they differ per run.
ds[['income','num_policies']].sample(30)

In [None]:
# Group the frame by `gender`; reused by the following cells.
gender = ds.groupby('gender')

In [None]:
# Row counts per income bracket within each gender.
gender['income'].value_counts()

In [None]:
# Shape of the Female subset (first element = number of Female rows).
(ds[ds['gender']=='Female']).shape

In [None]:
# Hand-entered sum of four counts; evaluates to 38895.
# NOTE(review): presumably the Female income-bracket counts copied from an earlier
# output — these literals go stale if the data changes; confirm.
22220 + 9493 + 6144 + 1038

In [None]:
(gender['income'].value_counts()['Female']/38895)

In [None]:
(gender['income'].value_counts()['Male'])/50497

In [None]:
(gender['income'].value_counts()['Female']/38865).plot(color='Black',label='Female')
(gender['income'].value_counts()['Male']/50497).plot(color='Red',label='Male')


In [None]:
# Qualification
# Author's finding: not conclusive.
# Row counts per income bracket within each `qualification` value.
quali= ds.groupby('qualification')
quali['income'].value_counts()

In [None]:
# Row counts per income bracket within each `marital_status` value.
# NOTE(review): this rebinds `quali`, which held the qualification grouping in the
# previous cell, to a marital_status grouping.
quali= ds.groupby('marital_status')
quali['income'].value_counts()

In [None]:
# marital_status
def income_per(ds,var):
  """Return the income-bracket distribution within each value of ``var``.

  Parameters
  ----------
  ds : pandas.DataFrame
      Frame containing an ``income`` column and the column named by ``var``.
  var : str
      Name of the grouping column (e.g. ``'marital_status'``, ``'area'``).

  Returns
  -------
  pandas.DataFrame
      One row per income bracket and one column per distinct ``var`` value,
      under a top column level named ``'distribution_percent'``. Each cell is
      the fraction (0-1) of that ``var`` group's rows that fall in the income
      bracket; combinations that never occur are NaN.

  Notes
  -----
  The table is built with a single groupby instead of concatenating one-row
  frames inside a nested loop. The axis names now describe their contents:
  the index is named ``'income'`` and the inner column level is named after
  ``var``. Previously they were labelled ``'status'`` and ``'income'``, which
  was swapped relative to what they held.
  """
  # Rows per (var value, income bracket), pivoted to income x var.
  counts = ds.groupby([var, 'income']).size().unstack(var)
  # Denominator: total rows in each var group.
  group_sizes = ds[var].value_counts().reindex(counts.columns)
  shares = counts.div(group_sizes, axis='columns')
  shares.index.name = 'income'
  shares.columns = pd.MultiIndex.from_product(
      [['distribution_percent'], shares.columns], names=[None, var]
  )
  return shares
# Income distribution within each `marital_status` value.
income_per(ds,'marital_status')
# Leftover from an earlier draft, kept commented out:
#income_percentage.set_index('status').pivot(columns='income')

In [None]:
# Area
# Income distribution within each `area` value.
income_per(ds,'area')

In [None]:
# Cross-check with a plain groupby: row counts per income bracket within each area.
area=ds.groupby('area')
t = area['income'].value_counts()
t

In [None]:
# Rural income-bracket counts divided by their own total, i.e. shares within Rural.
t['Rural']/t['Rural'].values.sum()

In [None]:
# Urban income-bracket counts divided by their own total, i.e. shares within Urban.
t['Urban']/t['Urban'].values.sum()

In [None]:
# Column labels, for reference.
ds.columns


In [None]:
# Income distribution within each `policy` value.
income_per(ds,'policy')

In [None]:
# type_of_policy
# Author's finding: quite a decent relationship.
income_per(ds,'type_of_policy')

In [None]:
# Mean `cltv` per income bracket.
inc = ds.groupby('income')
inc['cltv'].mean()

In [None]:
# Column labels, for reference.
ds.columns

In [None]:
# Summary statistics of `claim_amount` before outlier removal.
ds['claim_amount'].describe()

In [None]:
# Three-sigma bounds around the mean of `claim_amount`.
# NOTE(review): if claim_amount is non-negative with many zeros, the lower bound is
# probably negative and therefore never filters anything — check the printed values.
sigma = ds['claim_amount'].std()
mean = ds['claim_amount'].mean()
upperlimit = mean  + 3 * (sigma)
lowerlimit = mean - 3*(sigma)
print(upperlimit,lowerlimit)

In [None]:
# Ratio of standard deviation to mean of `claim_amount`.
sigma/mean

In [None]:
# Keep only rows whose claim_amount lies strictly between the two bounds.
# This rebinds `ds`: every cell above ran on the unfiltered frame, every cell below
# sees the filtered one.
ds = ds[((ds['claim_amount']>lowerlimit) & (ds['claim_amount'] < upperlimit))]

In [None]:
# basic training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler,LabelEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_absolute_error

In [None]:
X=ds.iloc[:,1:-1]
y=ds.iloc[:,-1]

In [None]:
X_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.8)

In [None]:
# Distinct `type_of_policy` values, used to write the category order below.
ds['type_of_policy'].unique()

In [None]:
# Standardising scaler for the numeric columns.
scaling= StandardScaler()

In [None]:
# Encoders for the categorical columns. Each OrdinalEncoder maps its categories to
# 0, 1, 2, ... in the order listed.
# `ohe` and `ohe_policy` appear only in commented-out ColumnTransformer entries in
# the visible code.
ohe = OneHotEncoder(drop='first',sparse_output=False)
ord_quali = OrdinalEncoder(categories=[['Others','High School','Bachelor']])
ord_inco = OrdinalEncoder(categories=[['<=2L','2L-5L','5L-10L','More than 10L']])
ord_num_policies = OrdinalEncoder(categories=[['1','More than 1']])
ohe_policy = OneHotEncoder(drop='first')
# NOTE(review): the order as written is Silver < Platinum < Gold — confirm that
# ranking Platinum below Gold is intended.
ord_type_policy = OrdinalEncoder(categories=[['Silver','Platinum','Gold']])

In [None]:
# NOTE(review): scikit-learn documents LabelEncoder for encoding target labels
# (1-D y), not feature columns.
label_encoder = LabelEncoder()

In [None]:
# First three rows, to check column names and raw values before encoding.
ds.head(3)

In [None]:
# Column-wise preprocessing: ordered categoricals use the explicit category orders
# defined above, nominal categoricals are integer-coded, numeric columns are
# standardised.
#
# The nominal columns use OrdinalEncoder rather than LabelEncoder: ColumnTransformer
# calls fit_transform(X, y) on every transformer, while LabelEncoder.fit_transform
# accepts a single 1-D argument, so fitting raised a TypeError. A default
# OrdinalEncoder gives the same one-integer-per-category coding for a 2-D feature
# column and keeps one output column per input column.
tr = ColumnTransformer(
    transformers=[
        ('ord_quali', ord_quali, ['qualification']),
        ('ord_inco', ord_inco, ['income']),
        ('ord_num_policies', ord_num_policies, ['num_policies']),
        ('label_gender', OrdinalEncoder(), ['gender']),
        ('label_area', OrdinalEncoder(), ['area']),
        ('label_marital_status', OrdinalEncoder(), ['marital_status']),
        ('label_policy', OrdinalEncoder(), ['policy']),
        ('ord_type_policy', ord_type_policy, ['type_of_policy']),
        ('scaling', scaling, ['vintage','claim_amount']),
    ],
    # Columns not listed above are passed through unchanged.
    remainder='passthrough'
)

In [None]:
# Display the training features before transformation.
X_train

In [None]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the held-out split.
X_train_tr= tr.fit_transform(X_train)
x_test_tr = tr.transform(x_test)


# Feature Importances

In [None]:
import numpy as np

In [None]:
# Transform the full feature frame for the feature-importance model.
# Note: this refits `tr` on all of X, replacing the fit on X_train; the
# X_train_tr / x_test_tr arrays computed earlier are not recomputed.
X_trans = tr.fit_transform(X)
# log(1 + y) transform of the target.
y_trans = np.log1p(y)

# RandomForest Feature importances

In [None]:
# Fit a random forest on the full transformed data and log-transformed target,
# for inspecting feature importances.
# No random_state is passed, so the importances vary between runs.
rf = RandomForestRegressor()
rf.fit(X_trans,y_trans)


In [None]:
# Number of importances, i.e. number of transformed feature columns.
rf.feature_importances_.shape

In [None]:
# Number of original feature columns, for comparison with the count above.
X.columns.shape

In [None]:
rf.feature_importances_
rf_importances = pd.DataFrame({'Features':X.columns,'rf_importances':rf.feature_importances_})
rf_importances

In [None]:
rf.feature_importances_

In [None]:
# Baseline: ordinary least squares fitted on the transformed training split,
# scored with R² on the held-out split.
lr = LinearRegression().fit(X_train_tr, y_train)
y_pred = lr.predict(x_test_tr)
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# R² on the training split, to compare against the held-out score.
# r2_score needs both the true and the predicted values; the bare r2_score()
# call raised a TypeError.
y_pred_lr_tr= lr.predict(X_train_tr)
r2_score(y_train,y_pred_lr_tr)

In [None]:
# random forest
rf = RandomForestRegressor()

In [None]:
rf.fit(X_train_tr,y_train)

In [None]:
#y_pred = rf.predict(x_test_tr)
y_pred = rf.predict(x_test_tr)

In [None]:
r2_score(y_test,y_pred)

In [None]:
# Gradient boosting
Gr = GradientBoostingRegressor()

In [None]:
Gr.fit(X_train_tr,y_train)

In [None]:
y_pred_gr = Gr.predict(x_test_tr)

In [None]:
r2_score(y_test,y_pred_gr)