# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from lifelines import WeibullAFTFitter, LogNormalAFTFitter, LogLogisticAFTFitter, ExponentialFitter
from lifelines.utils import k_fold_cross_validation
import seaborn as sns
import warnings

KeyboardInterrupt: 

In [None]:
warnings.filterwarnings("ignore")

# Reading and transforming the data

In [None]:
# Parse the raw Telco file once. `data` is transformed for modelling in the
# next cell; `clvdata` is an independent copy that keeps the original columns
# and later receives the CLV column.
data = pd.read_csv('telco.csv')
clvdata = data.copy()
data.head()

In [None]:
# Categorical columns that get one-hot encoded (first level dropped).
cols = ['region', 'retire', 'marital', 'ed', 'gender', 'voice', 'internet', 'custcat', 'churn', 'forward']

# Drop the identifier, encode the categoricals, and give the encoded churn
# indicator the plain name `churn` used by the cells below.
data = (
    data
    .drop(columns=['ID'])
    .pipe(pd.get_dummies, columns=cols, drop_first=True)
    .rename(columns={'churn_Yes': 'churn'})
)
data.head()

# Fitting AFT Models

In [None]:
# One unfitted estimator per candidate AFT distribution.
weibull_model, log_norm_model, log_logistic_model = (
    WeibullAFTFitter(),
    LogNormalAFTFitter(),
    LogLogisticAFTFitter(),
)


### Weibull 

In [None]:
# Fit the Weibull AFT model with `tenure` as the duration and `churn` as the event flag.
weibull = weibull_model.fit(
    data,
    duration_col='tenure',
    event_col='churn',
)
# Predicted survival curves for the rows of `data`, transposed
# (presumably one row per customer, one column per time point — confirm).
weibull_prediction = weibull.predict_survival_function(data).transpose()
# Column-wise mean of the transposed predictions; plotted in the comparison cell.
weibull_prediction_avg = weibull_prediction.mean(axis=0)
weibull.print_summary()

### Log Normal

In [None]:
# Fit the log-normal AFT model with `tenure` as the duration and `churn` as the event flag.
log_norm = log_norm_model.fit(
    data,
    duration_col='tenure',
    event_col='churn',
)
# Predicted survival curves for the rows of `data`, transposed
# (presumably one row per customer, one column per time point — confirm).
log_norm_prediction = log_norm.predict_survival_function(data).transpose()
# Column-wise mean of the transposed predictions; plotted in the comparison cell.
log_norm_prediction_avg = log_norm_prediction.mean(axis=0)
log_norm.print_summary()

### Log Logistic

In [None]:
# Fit the log-logistic AFT model with `tenure` as the duration and `churn` as the event flag.
log_logistic = log_logistic_model.fit(
    data,
    duration_col='tenure',
    event_col='churn',
)
# Predicted survival curves for the rows of `data`, transposed
# (presumably one row per customer, one column per time point — confirm).
log_logistic_prediction = log_logistic.predict_survival_function(data).transpose()
# Column-wise mean of the transposed predictions; plotted in the comparison cell.
log_logistic_prediction_avg = log_logistic_prediction.mean(axis=0)
log_logistic.print_summary()

### Compare the 3 models

In [None]:
# Overlay the average predicted survival curve of each fitted model.
plt.figure(figsize=(15, 9))
plt.plot(weibull_prediction_avg, label='Weibull')
plt.plot(log_norm_prediction_avg, label='LogNorm')
plt.plot(log_logistic_prediction_avg, label='LogLogistic')
plt.xlabel('Tenure')
plt.ylabel('Average predicted survival probability')
plt.title('Average survival curves by AFT model')
# Without an explicit legend the `label=` values above are never drawn,
# leaving the three curves indistinguishable.
plt.legend()
plt.show()

The Akaike Information Criterion (AIC) helps assess how well a model fits the data. It takes into account both the goodness of fit of the model and the number of parameters used in the model; lower values are better.

In [None]:

# AIC of each fitted model, keyed by the label reported for the winner.
scores = {
    'Log-normal': log_norm.AIC_,
    'Log-logistic': log_logistic.AIC_,
    'Weibull': weibull.AIC_,
}

print(f"Log-Normal AIC: {scores['Log-normal']}")
print(f"Log-Logistic AIC: {scores['Log-logistic']}")
print(f"Weibull AIC: {scores['Weibull']}")

# The smallest AIC identifies the preferred model.
best_model = min(scores, key=scores.get)
print(f'\nThe best model based on AIC scores is: \033[1m{best_model}\033[0m')

In this case, it is the Log-Normal model, which has an AIC score of 2954.0240102517128. Lower AIC scores indicate a better balance of model fit and complexity, making the Log-Normal model the preferable choice among the three based on these scores.

Model complexity, described by the number of parameters the model has, is another important factor. The Log-Normal and Log-Logistic models both have three parameters, while the Weibull model has two, so the Weibull model is the least complex of the three.



> However, I will continue with the Log-Normal model.

# Plot

### Keep significant features

Keeping only the features that are significant at the 99% level. Here is the list:

In [None]:
# Columns kept for the reduced model: the selected covariates plus the
# duration (`tenure`) and event (`churn`) columns needed for fitting.
significant_columns = [
    "address",
    "age",
    "internet_Yes",
    "marital_Unmarried",
    "tenure",
    "churn",
    "custcat_E-service",
    "custcat_Plus service",
    "custcat_Total service",
]

In [None]:

# Take an explicit copy of the selected columns: a later cell adds a column to
# this frame, and an independent copy keeps that from being an assignment on a
# slice of `data`.
significant_data = data[significant_columns].copy()
significant_data.head()

In [None]:
# Refit the log-normal AFT model on the reduced feature set. This reuses the
# same `log_norm_model` instance as the full-feature fit and rebinds the
# `log_norm*` names, so the earlier log-normal results are replaced from here on.
log_norm = log_norm_model.fit(
    significant_data,
    duration_col='tenure',
    event_col='churn',
)
log_norm_prediction = log_norm.predict_survival_function(significant_data).transpose()
log_norm_prediction_avg = log_norm_prediction.mean(axis=0)
log_norm.print_summary()

# CLV

In [None]:
# Work on an independent copy so the discounting below leaves
# `log_norm_prediction` untouched.
clv_data = log_norm_prediction.copy(deep=True)

In [None]:
margin = 1000  # multiplier applied to each row's summed, discounted values
r = 0.1        # discount rate; divided by 12 where it is applied
# 1-based period numbers, one per column of `clv_data`.
sequence = range(1, clv_data.shape[1] + 1)

In [None]:
# Discount each period's column: the first period is left undiscounted
# (exponent 0) and every later period is divided by one more factor of
# (1 + r/12). NOTE: this updates `clv_data` in place, so running the cell
# twice discounts twice.
period_factor = 1 + r / 12
for exponent, period in enumerate(sequence):
    clv_data.loc[:, period] = clv_data.loc[:, period] / (period_factor ** exponent)

In [None]:
# Sum only the per-period columns. Dropping any existing "CLV" column first
# keeps a re-run of this cell from folding the previous total into the new one.
period_values = clv_data.drop(columns="CLV", errors="ignore")
clv_data["CLV"] = margin * period_values.sum(axis=1)
clv_data.head()

In [None]:
# Attach the computed CLV to the untransformed customer table (aligned on index).
clvdata["CLV"] = clv_data["CLV"]
clvdata.head()

We can see the probability of a person remaining our customer.

# Visualizing CLV vs. Features

In [None]:
# CLV density, split by customer category.
sns.displot(clvdata, x='CLV', hue='custcat', kind='kde')

In [None]:
# CLV density, split by education level.
sns.displot(clvdata, x='CLV', hue='ed', kind='kde')

In [None]:
# CLV density, split by gender.
sns.displot(clvdata, x='CLV', hue='gender', kind='kde')

In [None]:
# CLV density, split by retirement status.
sns.displot(clvdata, x='CLV', hue='retire', kind='kde')

In [None]:
# CLV density, split by region.
sns.displot(clvdata, x='CLV', hue='region', kind='kde')

In [None]:
# CLV density, split by marital status.
sns.displot(clvdata, x='CLV', hue='marital', kind='kde')

In [None]:
# CLV density, split by internet service.
sns.displot(clvdata, x='CLV', hue='internet', kind='kde')

In [None]:
# CLV density, split by call forwarding.
# Uses `clvdata` like every sibling plot; the previous `rclvdata` is not
# defined anywhere and raised NameError.
sns.displot(data=clvdata, kind='kde', x='CLV', hue='forward')

In [None]:
# CLV density, split by voice service.
sns.displot(clvdata, x='CLV', hue='voice', kind='kde')

In [None]:
# CLV density, split by churn outcome.
sns.displot(clvdata, x='CLV', hue='churn', kind='kde')

In [None]:
# Mean CLV for each customer segmentation of interest, printed in order.
segmentations = [
    ["gender", "marital", "region"],
    ["voice", "retire", "ed"],
    "forward",
    "internet",
    ["marital", "retire"],
    ["region", "retire"],
    ["custcat", "voice"],
    "retire",
    ["ed", "marital", "retire"],
]
for keys in segmentations:
    print(clvdata.groupby(keys)[["CLV"]].mean())

# Conclusions

Analyzing the provided data reveals interesting insights into customer segments based on various demographic factors and their corresponding Customer Lifetime Value (CLV). The data is segmented by gender, marital status, region, voice service usage, retirement status, education, internet service, and customer category.

- Marital Status, Gender, and Region: Married females, especially in Zone 2, show a higher CLV compared to unmarried females across all zones. For males, married ones in Zone 1 show the highest CLV. This suggests that married individuals, particularly in certain regions, are more valuable customers.

- Retirement Status and Education: Retired individuals who did not complete high school show high CLVs. This could be due to their stable lifestyle and possibly lower inclination to change service providers.

- Voice Service and Education: Customers without voice services who have not completed high school or are retired with some college education have higher CLVs. This indicates a segment less inclined towards additional services but loyal to the basic offerings.

- Internet Service: Customers without internet services have a significantly higher CLV than those with internet. This might point to issues with the internet service leading to lower satisfaction and CLV.

- Retirement Impact: Non-retired individuals have a lower CLV compared to retired ones, indicating that retirees form a more loyal and valuable customer base.

- Education, Marital Status, and Retirement: Segments based on education, marital status, and retirement status show varied CLVs. Notably, married individuals who did not complete high school and are retired have the highest CLV, suggesting a niche but highly valuable segment.

## Retention budget

The retention rate and cost per customer below are arbitrary, illustrative values.

In [None]:
# Add each customer's CLV (aligned on index) by building a new frame rather
# than assigning a column into a frame that was derived by slicing `data`.
significant_data = significant_data.assign(CLV=clv_data["CLV"])

In [None]:
# Rows whose churn flag is 0, and the total CLV across those rows.
is_retained = significant_data['churn'] == 0
retained_customers = significant_data[is_retained]
retained_clv = retained_customers['CLV'].sum()

In [None]:
retention_rate = 0.8      # assumed share of customers targeted for retention
cost_per_customer = 5000  # assumed retention spend per targeted customer
# Total spend: number of rows in `significant_data` x rate x unit cost.
n_customers = len(significant_data)
retention_cost = n_customers * retention_rate * cost_per_customer

In [None]:
# Total CLV of the non-churned customers minus the assumed retention spend.
annual_budget = retained_clv - retention_cost
annual_budget

# Scaling the features

I will scale the data using MinMaxScaler and try the same experiment to see the differences.