In [48]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
# import xgboost as xgb
# import shap
import os
import random


from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_community.vectorstores import Chroma


# Load variables from a local .env file (if present) and read the OpenAI key from the environment.
# os.getenv returns None when the variable is missing, so a missing key only surfaces later,
# when the OpenAI client is first used.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): this binds a plain Python variable and nothing in this notebook reads it.
# If it is meant to name the LangSmith project it presumably has to be exported via
# os.environ["LANGCHAIN_PROJECT"] - confirm.
LANGCHAIN_PROJECT="SHAP-LLM-Telco-Local-Explanations"


In [2]:
import sys
# Put the parent directory first on the module search path so the project modules imported
# below can be found. The relative entry is resolved against the current working directory,
# so the notebook is expected to be run from its own folder.
sys.path.insert(0, '../')

# Project-local helpers: model training / SHAP analysis classes and data/text utilities.
from xgb_process import shap_summary,xgb_model
from utils import generate_text_summary,create_context,process_data

  from .autonotebook import tqdm as notebook_tqdm


#### Load the Telco Churn Data 

1. Process and Clean Data
2. Create data summary file
3. Create the XGBoost Model
4. Create global SHAP explanation summary file and table 

In [3]:
## Load the raw Telco churn table and normalise the column names to lower case.
# The path is assembled with os.path.join so it resolves on POSIX as well as on Windows;
# the previous "..\\data\\..." literal only worked on Windows.
df0=pd.read_csv(os.path.join("..", "data", "telco_churn_data.csv"),index_col=False)
df0.columns = [x.lower() for x in df0.columns]

# Feature groups passed to the cleaning step, the model and the SHAP analyzer.
cat_cols = ['gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice',
            'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
            'contract', 'paperlessbilling', 'paymentmethod']
num_cols = ['tenure', 'monthlycharges', 'totalcharges']
target = 'churn'
# NOTE(review): the columns were lower-cased above, so 'customerID' will not match a
# 'customerid' column by name. Left unchanged because process_data is not visible here -
# confirm how it uses id_feats.
id_feats=['customerID']


# Clean/prepare the frame with the project helper; df0 is rebound to the processed result.
df0=process_data(df=df0,cat_cols=cat_cols,num_cols=num_cols,id_feats=id_feats,target=target)




All required columns are in the DataFrame.


In [None]:
## Build the plain-text dataset summary and write it to the documents folder,
## where it is later loaded as one of the RAG source documents.
data_summary = generate_text_summary(
    df=df0,
    cat_feats=list(map(str.lower, cat_cols)),
    num_feats=list(map(str.lower, num_cols)),
)
with open("..//documents//data_summary.txt", 'w') as data_summary_file:
    data_summary_file.write(data_summary)

In [80]:
## Create the XGBoost model
# train_model() returns the fitted model together with the train/test matrices and frames.
model_instance = xgb_model.XGBoostModel(df=df0, cat_features=cat_cols, num_features=num_cols, target=target, mode='dev')
model,dtrain,X_train,dtest,X_test=model_instance.train_model()

# Score the test matrix and attach the prediction to the test frame.
# The index is reset so that positional row numbers can be used as proxy customer ids below.
predictions = model.predict(dtest)
X_test['predicted_churn']=predictions
X_test=X_test.reset_index(drop=True)

# The analyzer is built on the TEST split (passed through its X_train/dtrain parameters).
# NOTE(review): X_test already carries the extra 'predicted_churn' column at this point;
# presumably the analyzer only reads cat_features/num_features - confirm.
analyzer=shap_summary.ShapAnalyzer(model=model,
                                X_train=X_test,
                                dtrain=dtest,
                                cat_features=cat_cols,
                                num_features=num_cols)

## Get the individual SHAP contributions
# Column labels come from X_train, which has not been given the extra prediction/id columns.
# NOTE(review): this assumes the SHAP matrix has one column per X_train column, in the same order - confirm.
shap_data = pd.DataFrame(analyzer.get_shap_value(), columns=X_train.columns)

### Add a proxy ID (the positional row index) to the test data and to the test SHAP data for the local explanations
X_test['customer_id']=X_test.index
shap_data['customer_id']=X_test.index

print(f"The test data with model prediction is {X_test.shape}")
print(f"The local shap explantion fopr test data is {shap_data.shape}")

## Get the global summary of the SHAP values
result_df = analyzer.analyze_shap_values()
summary_df = analyzer.summarize_shap_df()
# Convert object columns to category and pass numeric columns through pd.to_numeric.
# NOTE(review): summary_df is not used again in the visible notebook.
for col in summary_df.columns:
    if summary_df[col].dtype == 'object':
        summary_df[col] = summary_df[col].astype('category')
    elif pd.api.types.is_numeric_dtype(summary_df[col]):
        summary_df[col] = pd.to_numeric(summary_df[col])




## Create the SHAP summary text and save it as a text file (later loaded as a RAG document)
# NOTE(review): the message below is printed before the file is actually written.
print("The Local Feature Contribution Summary is Saved to documents folder")
summary_text = analyzer.summarize_shap_text()
with open("..//documents//shap_summary.txt", 'w') as f:
    f.write("SHAP contribution:\n")
    f.write(summary_text)



# Last expression of the cell: display the global table ordered by importance rank.
print("The Global Feature Contribution Summary is:")
(result_df.sort_values(by=['Importance Rank'],ascending=True))

ROC AUC Score: 0.84
The test data with model prediction is (1409, 21)
The local shap explantion fopr test data is (1409, 20)
The Local Feature Contribution Summary is Saved to documents folder
The Global Feature Contribution Summary is:


Unnamed: 0,Group,SHAP Value,Adjusted Probability,Probability Change (%),Feature,Feature Importance,Importance Rank
36,Two year,-1.973576,0.041839,-19.726410,contract,1.052163,1
35,One year,-0.651780,0.140713,-9.839052,contract,1.052163,1
34,Month-to-month,0.768994,0.404053,16.494917,contract,1.052163,1
43,"(-0.001, 2.0]",0.658306,0.377706,13.860287,tenure,0.624847,2
44,"(2.0, 5.0]",0.375170,0.313796,7.469271,tenure,0.624847,2
...,...,...,...,...,...,...,...
22,No,0.012901,0.241458,0.235492,deviceprotection,0.047952,18
24,Yes,0.032170,0.245005,0.590184,deviceprotection,0.047952,18
23,No internet service,-0.029519,0.233774,-0.532909,deviceprotection,0.047952,18
9,Yes,-0.020926,0.235317,-0.378630,phoneservice,0.031241,19


In [41]:
## Preview the scored test frame (features plus the predicted_churn and customer_id columns added above)
X_test.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,...,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,tenure,monthlycharges,totalcharges,predicted_churn,customer_id
0,Female,0,Yes,No,No,No phone service,DSL,No,No,No,...,No,No,Month-to-month,Yes,Electronic check,1,24.8,24.8,0.690529,0
1,Male,0,No,No,Yes,Yes,No,No internet service,No internet service,No internet service,...,No internet service,No internet service,Month-to-month,Yes,Bank transfer (automatic),41,25.25,996.45,0.12523,1
2,Female,0,Yes,Yes,Yes,No,No,No internet service,No internet service,No internet service,...,No internet service,No internet service,Two year,No,Mailed check,52,19.35,1031.7,0.004447,2
3,Female,0,No,No,Yes,No,Fiber optic,No,No,Yes,...,No,No,Month-to-month,No,Electronic check,1,76.35,76.35,0.895309,3
4,Male,0,No,No,Yes,No,DSL,No,No,No,...,No,No,Two year,No,Bank transfer (automatic),67,50.55,3260.1,0.005606,4


In [42]:
## Preview the per-customer SHAP contributions (one column per feature plus the proxy customer_id)
shap_data.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,tenure,monthlycharges,totalcharges,customer_id
0,-0.102232,-0.105849,0.155218,0.032582,0.088018,0.069093,-0.295211,0.241861,0.110617,-0.032341,0.223736,-0.036183,-0.078387,0.918752,0.093678,0.146934,0.503,-1.172744,1.199655,0
1,-0.067199,-0.073062,0.110045,0.067358,-0.044077,0.140747,-0.175746,-0.263869,-0.010545,-0.023098,-0.310132,0.011213,0.001208,0.572561,-0.019287,0.12999,-0.477642,-0.6068,0.252129,1
2,-0.087921,-0.03904,0.059961,-0.150591,-0.013555,-0.156847,-0.090246,-0.134053,-0.270844,-0.042321,-0.102994,-0.054719,-0.051765,-2.118956,-0.262128,-0.195892,-0.693687,-0.20823,0.36035,2
3,0.051693,-0.053829,-0.009532,-0.016093,-0.011935,-0.129384,0.243259,0.194123,0.135595,0.077127,0.123931,-0.077947,-0.070839,1.129284,-0.32857,0.139885,0.601609,-0.000473,1.305848,3
4,0.006635,-0.03839,0.105773,-0.073348,-0.016043,0.118434,-0.018866,0.279936,-0.046831,0.090431,-0.048842,-0.280024,0.14427,-1.876663,-0.218388,-0.239664,-1.026271,-0.485574,-0.397264,4


#### Create Vector Database using Chroma DB 

 Load the text documents (including the summary files created above) into the vector store for RAG.

In [83]:
## Paths of the text documents that make up the RAG knowledge base
text = [
    "..//documents//data_dictionary.txt",
    "..//documents//data_summary.txt",
    "..//documents//shap_summary.txt",
    "..//documents//shap_explanation.txt"
]

# Load every file (each load() yields a list of documents) and flatten into a single list.
docs = [TextLoader(path).load() for path in text]
docs_list = []
for file_docs in docs:
    docs_list.extend(file_docs)

# Split into token-based chunks of 500 with an overlap of 50 before embedding.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500,
    chunk_overlap=50,
)
documents = text_splitter.split_documents(docs_list)

# Embed the chunks and add them to the Chroma collection.
vectorstore = Chroma.from_documents(
    collection_name="churn-rag-chroma-1",
    embedding=OpenAIEmbeddings(),
    documents=documents,
)

Create the GPT chain, which takes a prompt, uses simple RAG, and then answers the question based on the retrieved context.

In [40]:
## Build the RAG chain: retrieve context -> fill the prompt -> chat model -> plain string

# Prompt with two slots: the retrieved context and the user's question.
template = """
You are an assistant to customer service agent. Answer the question based on the context below to help the agent.

Context: {context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
parser = StrOutputParser()

# NOTE(review): this rebinds `vectorstore` (previously the Chroma store) to an in-memory
# DocArray index built from the same `documents`, so the chunks are embedded a second time.
embeddings = OpenAIEmbeddings()
vectorstore = DocArrayInMemorySearch.from_documents(documents, embeddings)

# NOTE(review): `model` previously referred to the trained XGBoost model; after this cell
# it refers to the chat model.
# Alternative model: ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-4-turbo")
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")

# The input string is sent to the retriever (as the query) and passed through unchanged as the question.
retrieval_inputs = {"context": vectorstore.as_retriever(), "question": RunnablePassthrough()}
chain = retrieval_inputs | prompt | model | parser

In [85]:
## Ask the RAG chain a global feature-importance question
print(chain.invoke("""What are the most important features in the model?
             """))

The most important features in the model, based on the mean absolute SHAP values and their importance ranks, are:
1. Contract
2. Tenure
3. Monthlycharges
4. Totalcharges
5. Onlinesecurity


In [88]:
## Ask the RAG chain which features push the churn prediction up the most
print(chain.invoke("""What are the features that increases churn the most?
             """))

Based on the information provided in the documents, the features that increase the probability of class 1 (likely indicating churn) the most are:
1. MonthlyCharges
2. TotalCharges

Specifically, when the values of these features fall within certain ranges, they have a significant impact on increasing the probability of churn. For example, for MonthlyCharges within (102.75, 118.75], the probability of churn increases by 15.17%, and for TotalCharges within (18.898999999999997, 79.35], the probability of churn increases by 18.09%.


In [89]:
## Ask the RAG chain a descriptive question about category distributions in the dataset
print(chain.invoke("""What are different contract types and internet service types and their distribution in the dataset?
             """))

Based on the provided context, the different contract types and internet service types in the dataset are as follows:

Contract Types:
1. Month-to-month: 3875
2. Two year: 1695
3. One year: 1473

Internet Service Types:
1. Fiber optic: 3096
2. DSL: 2421
3. No: 1526

These values represent the distribution of each contract and internet service type in the dataset.


In [97]:
## Out-of-scope question: checks how the chain responds when the retrieved context cannot answer it
print(chain.invoke("""Why are Arsenal fans so annoying?"""))

I'm sorry, but I cannot provide an answer to that question as it is unrelated to the context provided. If you have any other questions or need assistance related to the context given, please feel free to ask.


Customized prompt to explain each customer's information

In [92]:
def create_explanation_prompt(data,shap_data,n=None,id_col='customer_id'):
    """Build the LLM prompt asking for a plain-language explanation of one customer's churn prediction.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per customer. Must contain a ``predicted_churn`` column; every other
        column except ``id_col`` is rendered as a customer feature.
    shap_data : pandas.DataFrame
        Per-customer SHAP contributions. Rows are matched to ``data`` by position,
        so row ``n`` of both frames must describe the same customer.
    n : int, optional
        Positional row of the customer to explain. When omitted, a row is drawn at
        random from ``shap_data``.
    id_col : str or None, optional
        Identifier column to leave out of both the feature list and the SHAP list.
        It is an identifier, not a model feature; rendering it among the SHAP
        contributions presents the id number as if it were a feature contribution.
        Ignored when the column is absent; pass ``None`` to keep every column.

    Returns
    -------
    str
        The fully formatted prompt text.
    """

    if n is None:
        n = random.randint(0, shap_data.shape[0] - 1)

    # Columns that are not model features and must not be presented as such.
    id_cols = [] if id_col is None else [id_col]

    predicted_churn=data['predicted_churn'].values[n]
    customer = data.drop(columns=['predicted_churn'] + id_cols, errors='ignore').iloc[n]
    shap = shap_data.drop(columns=id_cols, errors='ignore').iloc[n]

    # Create a formatted string where each column name is followed by its value
    customer_string = ', '.join([f"{col}={value}" for col, value in customer.items()])
    shap_string = ', '.join([f"{col}={value}" for col, value in shap.items()])

    prompt = f"""
            YOU ARE AN ASSISTANT TO A CUSTOMER SERVICE AGENT.

            Here are the details for a customer the agent is assisting:
            - Customer Features: {customer_string}
            - Model's Predicted Churn Probability: {predicted_churn}
            - Individual SHAP Contributions for Each Feature: {shap_string}
            Based on this information, explain to the agent in non-technical terms:
            1. Provide summary of who the customer is from user context features and his current status of action context features.
                - A short concise summary in under 100 words.
            2. What is the customer's predicted churn status with probability? Use the following scale:
            - Under 0.1: Less likely
            - 0.1 to 0.5: Likely
            - 0.5 to 0.7: Very likely
            - Above 0.7: Highly likely
            Answer in 10 words.
            3. Identify the top 3 reasons for the customer's potential churn. Provide a brief explanation (within 50 words for each) of why these 
                features significantly influence the churn prediction.
            4. Suggest the top 3 actions the agent can take to reduce the likelihood of churn, based on the feature impacts. Each suggestion should include:
            - A concise recommendation (20 words)
            - Provide an estimated churn reduction made by this action. Make this estimate from the shap value for each feature.
            - Justify why and how this action helps reduce churn by the % you have mentioned, based solely on the data provided and stats.
            Remember:
            - The magnitude of a SHAP value indicates the strength of a feature's influence on the prediction.
            - **Positive SHAP values increase the likelihood of churn; negative values decrease it.**
            - **Positive SHAP values increase the likelihood of churn; negative values decrease it.**
            - Recommendations should be strictly based on the information provided in the SHAP contributions , customer features and patterns from decision tree stats.
            """

    
    return prompt
    
def _split_response_sections(response, n_sections=4):
    """Split an LLM reply into its numbered top-level sections ("1.", "2.", ...).

    The markers are searched for in order at the start of a line, so blank lines or
    nested bullets inside a section do not cut it short. Each returned section keeps
    its own marker; any text before the first marker is dropped. If the markers
    cannot all be found, fall back to splitting on blank lines: the first
    ``n_sections - 1`` paragraphs fill the leading slots (padded with '' when the
    reply is short) and everything that remains goes into the last slot.

    Always returns a list of exactly ``n_sections`` stripped strings.
    """
    import re

    starts = []
    cursor = 0
    for number in range(1, n_sections + 1):
        # The lookahead keeps a line such as "1.5% ..." from being read as marker "1.".
        marker = re.compile(rf"^[ \t]*{number}\.(?=\s|$)", re.MULTILINE)
        match = marker.search(response, cursor)
        if match is None:
            break
        starts.append(match.start())
        cursor = match.end()

    if len(starts) == n_sections:
        bounds = starts + [len(response)]
        return [response[bounds[i]:bounds[i + 1]].strip() for i in range(n_sections)]

    paragraphs = [p.strip() for p in response.split('\n\n') if p.strip()]
    head = paragraphs[:n_sections - 1]
    head += [''] * (n_sections - 1 - len(head))
    tail = '\n\n'.join(paragraphs[n_sections - 1:])
    return head + [tail]


def agent_response(N=49):
    """Ask the chain to explain test customer ``N`` and print the four answer sections.

    ``N`` is the positional row in ``X_test`` / ``shap_data``. The reply is split into
    its numbered sections rather than by fixed paragraph position, so a short reply no
    longer raises IndexError and blank lines inside the reasons or recommendations no
    longer truncate the printed text. Returns None; the result is printed.
    """

    response=chain.invoke(create_explanation_prompt(data=X_test,shap_data=shap_data,n=N))

    (who_is_customer,
     churn_probability_explanation,
     reasons_for_churn,
     action_recommendations) = _split_response_sections(response)

    print(f"""Who is the customer?\n{who_is_customer}\n\n""")
    print(f"""Likelihood for Churn\n{churn_probability_explanation}\n\n""")
    print(f"""Probable Reasons for Churn\n{reasons_for_churn}\n\n""")
    print(f"""Recommended Actions to Prevent Churn\n{action_recommendations}\n\n""")

In [94]:
## Explain the test customer at positional row 49
agent_response(N=49)

Who is the customer?
1. The customer is a male without a partner or dependents, using DSL internet with online backup. He has tech support but no streaming services, with a one-year contract and paperless billing.


Likelihood for Churn
2. Likely


Probable Reasons for Churn
3. 
   - Lack of a partner: Partnerships often lead to increased customer satisfaction and loyalty.
   - One-year contract: Longer contracts suggest commitment, so a one-year term could indicate potential dissatisfaction.
   - No tech support: Lack of technical assistance can result in frustration and dissatisfaction with the service.


Recommended Actions to Prevent Churn
4. 
   - Offer a partnership promotion: Recommend a partnership deal to increase customer satisfaction and loyalty. Estimated churn reduction: 0.0231. By offering a partnership promotion, the customer may feel more valued and less likely to churn.
   - Provide tech support incentives: Incentivize tech support usage to improve customer experience.

In [95]:
## Explain the test customer at positional row 0
agent_response(N=0)

Who is the customer?
1. The customer is a female without dependents, has a partner, no phone service, DSL internet, and is on a month-to-month contract with paperless billing and electronic check payment. She has been with the company for 1 month.


Likelihood for Churn
2. Very likely


Probable Reasons for Churn
3. 
   - Monthly charges: High monthly charges increase churn as customers may find it expensive.
   - Total charges: Higher total charges could indicate dissatisfaction with the service over time.
   - Contract: Month-to-month contracts are more prone to churn due to their flexible nature.


Recommended Actions to Prevent Churn
4. 
   - Offer a discount on monthly charges: Estimated churn reduction of 1.17. Lower charges may retain price-sensitive customers.
   - Incentivize a longer contract: Estimated churn reduction of 0.92. Longer contracts provide stability and commitment.
   - Provide online security: Estimated churn reduction of 0.24. Security features enhance customer

In [96]:
## Explain the test customer at positional row 22
agent_response(N=22)

Who is the customer?
1. The customer is a male without senior citizenship, with a partner and dependents. He has phone and internet services, uses fiber optic, online backup, but lacks device protection and tech support. He is on a month-to-month contract with paperless billing and pays through automatic bank transfer, having been with the company for 46 months.


Likelihood for Churn
2. Likely


Probable Reasons for Churn
3. 
   - Total Charges: The high total charges negatively impact churn prediction, suggesting dissatisfaction due to cost.
   - Tenure: The longer tenure decreases churn likelihood as loyal customers are less likely to leave.
   - Contract: Month-to-month contract increases churn probability as customers have more flexibility to switch providers.


Recommended Actions to Prevent Churn
4. 
   - Offer a loyalty discount: Estimated churn reduction of 0.939%.
     Loyalty discounts can reduce total charges, addressing dissatisfaction and encouraging customer retention.
 

In [193]:

## Run the LLM explanation for the first N_CUSTOMERS test customers and collect the
## parsed answers in a DataFrame (one row per customer).

# Cap on the number of LLM calls; never larger than the test set itself.
N_CUSTOMERS = min(100, len(X_test))

records = []
for i in range(N_CUSTOMERS):

    print(f"Processing customer {i}")
    # X_test is the scored test frame; the name `data` previously used here is never
    # defined in this notebook and fails on a fresh kernel.
    response=chain.invoke(create_explanation_prompt(data=X_test,shap_data=shap_data,n=i))

    # Split the response into main parts using double newlines as delimiters.
    # Pad to four slots so a short reply yields empty strings instead of an IndexError,
    # and keep any trailing paragraphs with the recommendations so no text is dropped.
    parts = [part.strip() for part in response.split('\n\n')]
    parts += [''] * (4 - len(parts))

    records.append({
        'Who is Customer': parts[0],
        'Churn Probability Explanation': parts[1],
        'Reasons for Churn': parts[2],
        'Action Recommendations': '\n\n'.join(parts[3:]).strip(),
        # The proxy id column on X_test is 'customer_id'; there is no 'id' column.
        'customer_id': X_test['customer_id'].iloc[i],
    })

# Build the frame once from the collected records instead of concatenating inside the loop.
llm_shap = pd.DataFrame(
    records,
    columns=['Who is Customer', 'Churn Probability Explanation', 'Reasons for Churn',
             'Action Recommendations', 'customer_id'],
)



Processing customer 0
Processing customer 1
Processing customer 2
Processing customer 3
Processing customer 4
Processing customer 5
Processing customer 6
Processing customer 7
Processing customer 8
Processing customer 9
Processing customer 10
Processing customer 11
Processing customer 12
Processing customer 13
Processing customer 14
Processing customer 15
Processing customer 16
Processing customer 17
Processing customer 18
Processing customer 19
Processing customer 20
Processing customer 21
Processing customer 22
Processing customer 23
Processing customer 24
Processing customer 25
Processing customer 26
Processing customer 27
Processing customer 28
Processing customer 29
Processing customer 30
Processing customer 31
Processing customer 32
Processing customer 33
Processing customer 34
Processing customer 35
Processing customer 36
Processing customer 37
Processing customer 38
Processing customer 39
Processing customer 40
Processing customer 41
Processing customer 42
Processing customer 4

In [194]:
## Display the collected LLM explanations (one row per processed customer)
llm_shap

Unnamed: 0,Who is Customer,Churn Probability Explanation,Reasons for Churn,Action Recommendations,customer_id
0,1. The customer is a new subscriber with month...,2. Very likely to churn with a probability of ...,3. Top 3 reasons for potential churn:\n - Hi...,4. Suggestions to reduce churn:\n - Offer a ...,0
1,"1. The customer is a non-senior, single, tech-...",2. Likely,3. Top 3 reasons for potential churn:\n - Mo...,4. Suggestions to reduce churn:\n - Offer di...,1
2,"1. Summary: The customer is a long-tenured, lo...",2. Likely: The customer is predicted to churn ...,3. Top 3 reasons for potential churn:\n - Lo...,4. Actions to reduce churn:\n - Offer intern...,2
3,1. The customer is a new user with Fiber optic...,2. Highly likely to churn with a 0.895 probabi...,3. Top 3 reasons for potential churn:\n - To...,4. Top 3 actions to reduce churn:\n - Offer ...,3
4,1. The customer is a non-senior citizen who do...,2. The customer is less likely to churn with a...,3. Top 3 reasons for potential churn:\n - Tw...,4. Top 3 actions to reduce churn:\n - Offer ...,4
...,...,...,...,...,...
95,"1. The customer is a long-term, loyal user wit...",2. Very likely to stay with a churn probabilit...,3. Top 3 reasons for potential churn:\n - La...,4. Top 3 actions to reduce churn:\n - Offer ...,95
96,1. The customer is a male without senior citiz...,2. Likely churn status with a probability of 0...,3. Top 3 reasons for potential churn:\n - Te...,4. Actions to reduce churn:\n - Encourage lo...,96
97,1. The customer is a new user with low monthly...,2. Likely,3. \n - High total charges: Indicates potent...,4. \n - Promote online payment methods: Enco...,97
98,1. The customer is a senior citizen with Fiber...,2. Highly likely to churn with a probability o...,3. Top 3 reasons for potential churn:\n - Mo...,4. Top 3 actions to reduce churn:\n - Offer ...,98


In [None]:
# llm_shap.to_csv('..//results//telco//llm_output.csv',index=False)
# X_test.head(100).to_csv('..//results//telco//llm_input_data.csv',index=False)
# shap_data.head(100).to_csv('..//results//telco//llm_input_shap_data.csv',index=False)