In [81]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
import streamlit as st
from scipy import stats
from scipy.stats import ttest_ind

# Set the browser-tab title for the app. Only the title is configured here;
# no layout option is passed.
# NOTE(review): Streamlit has traditionally required set_page_config to run
# before any other st.* command -- keep it ahead of the CSS injection below.
st.set_page_config(page_title="A/B Test Demo for Group 7")

# Custom CSS for styling (bold bordeaux headers and system default text and background).
# The stylesheet is injected as a raw <style> element through st.markdown, which
# is why unsafe_allow_html=True is passed.
# NOTE(review): the `.stButton` rule presumably matches Streamlit's wrapper
# element around a button rather than the <button> itself -- confirm the
# colours actually reach the button before relying on this rule.
st.markdown("""
    <style>
        /* Set all headers to bold and bordeaux color */
        h1, h2, h3, h4, h5, h6 {
            color: #800000;  /* Bordeaux color */
            font-weight: bold;  /* Make headers bold */
        }
        /* Ensure text follows system default color */
        body {
            background-color: transparent;  /* Let the system decide the background */
            color: inherit;  /* Text color follows the system default */
        }
        .stText {
            color: inherit;  /* Let text follow system color */
        }
        .stMarkdown {
            color: inherit;  /* Let markdown text follow system color */
        }
        /* Set the background for the button to bordeaux with black text */
        .stButton {
            color: black;
            background-color: #800000;  /* Bordeaux button */
        }
    </style>
""", unsafe_allow_html=True)

# Function to show unique values in categorical columns
def show_unique_values_in_categorical_columns(df):
    """Render a page listing the distinct values of each categorical column.

    Columns whose dtype is ``object`` or ``category`` are treated as
    categorical. When there are none, a warning is shown instead.

    Args:
        df: DataFrame to inspect.
    """
    st.title("Unique Values in Categorical Columns")

    # Names of every object/category column, in DataFrame order
    names = list(df.select_dtypes(include=['object', 'category']).columns)

    if len(names) == 0:
        st.warning("No categorical columns found in the file.")
        return

    st.subheader("Unique Values in Categorical Columns:")
    for name in names:
        # Compute the distinct values before writing anything for this column
        distinct = df[name].unique()
        st.write(f"Column: {name}")
        st.write(f"Unique values: {distinct}")

# Function to show basic statistics of numeric columns
def show_basic_statistics(df):
    """Render summary statistics for the numeric columns of *df*.

    The ``describe()`` table is transposed so that each row is a column of
    the input. A warning is shown when the numeric selection is empty.

    Args:
        df: DataFrame to summarise.
    """
    numbers_only = df.select_dtypes(include=['number'])

    if not numbers_only.empty:
        st.subheader("Basic Statistics for Numeric Columns:")
        # One row per input column reads better than one column per input column
        summary = numbers_only.describe().transpose()
        st.write(summary)
        return

    st.warning("No numeric columns found in the file.")

# Function to show demographics analysis
def show_demographics_analysis(df):
    """Render gender counts, age-bracket counts and age summary statistics.

    The gender section is shown only when a ``gender`` column exists. The age
    section uses ``clnt_age`` when present and falls back to ``age``; when
    neither exists a warning is shown.

    The age brackets are computed on a temporary Series, so *df* itself is
    not modified.

    Args:
        df: DataFrame holding the demographic columns.
    """
    st.title("Demographics Analysis")

    if 'gender' in df.columns:
        st.subheader("Gender Distribution:")
        st.write(df['gender'].value_counts())

    age_column = 'clnt_age' if 'clnt_age' in df.columns else 'age'

    if age_column not in df.columns:
        st.warning("No 'age' or 'clnt_age' column found in the file.")
        return

    st.subheader("Age Distribution:")
    # Left-closed bins so the brackets match their labels:
    # [0, 30) "Under 30", [30, 40) "30-39", [40, 50) "40-49", [50, inf) "50 and above".
    bins = [0, 30, 40, 50, float('inf')]
    labels = ['Under 30', '30-39', '40-49', '50 and above']
    # Keep the brackets local instead of writing a new column into the caller's
    # DataFrame; the rename preserves the 'age_group' label in the output.
    age_group = pd.cut(
        df[age_column], bins=bins, labels=labels, right=False
    ).rename('age_group')

    st.write(age_group.value_counts())

    st.subheader("Basic Statistics for Age:")
    st.write(df[age_column].describe())

# Shared implementation behind the visit_id and client_id completion-rate tables
def _completion_rate_by_id(group, id_column):
    """Build the per-step completion-rate table, counting unique *id_column* values.

    Args:
        group: Event rows with a ``process_step`` column and *id_column*.
        id_column: Column whose distinct values are counted.

    Returns:
        DataFrame with one row per non-``'start'`` process step and the
        columns ``process_step``, ``completed_visits``, ``started_visits``
        and ``completion_rate`` (a percentage). ``completion_rate`` is NaN
        when no identifier has a ``'start'`` row.
    """
    # Distinct identifiers that have at least one 'start' row
    started = group.loc[group['process_step'] == 'start', id_column].nunique()

    # Distinct identifiers seen at each step other than 'start'
    rates = (
        group[group['process_step'] != 'start']
        .groupby('process_step')[id_column]
        .nunique()
        .reset_index(name='completed_visits')
    )

    # The number of starts is the same for every step, so store it as a constant column
    rates['started_visits'] = started

    # Dividing by zero starts would report an infinite rate; report NaN instead
    denominator = started if started else float('nan')
    rates['completion_rate'] = (rates['completed_visits'] / denominator) * 100

    return rates


# Function to calculate completion rate based on visit_id
def calculate_within_visit_completion_rate_by_visit(group):
    """Return per-step completion rates counted over unique ``visit_id`` values.

    Args:
        group: Event rows with ``process_step`` and ``visit_id`` columns.

    Returns:
        DataFrame with columns ``process_step``, ``completed_visits``,
        ``started_visits`` and ``completion_rate`` (percent of started visits
        that reached the step; NaN when no visit started).
    """
    return _completion_rate_by_id(group, 'visit_id')


# Function to calculate completion rate based on client_id
def calculate_within_visit_completion_rate_by_client(group):
    """Return per-step completion rates counted over unique ``client_id`` values.

    The output column names are the same as in the visit-based variant
    (``completed_visits`` / ``started_visits``), but the numbers count
    distinct clients.

    Args:
        group: Event rows with ``process_step`` and ``client_id`` columns.

    Returns:
        DataFrame with columns ``process_step``, ``completed_visits``,
        ``started_visits`` and ``completion_rate`` (percent of clients with a
        start that reached the step; NaN when no client started).
    """
    return _completion_rate_by_id(group, 'client_id')

# Add page for completion rate calculation based on visit_id and client_id
def show_completion_rate_page(df):
    """Render per-step completion-rate tables for the Control and Test groups.

    Rows are split on the ``variation`` column (``'Control'`` / ``'Test'``).
    Each group is summarised twice: once counting unique ``visit_id`` values
    and once counting unique ``client_id`` values.

    Args:
        df: Event-level DataFrame with ``variation``, ``process_step``,
            ``visit_id`` and ``client_id`` columns.
    """
    st.title("Completion Rates Based on visit_id and client_id")

    # Split the rows by experiment arm
    control_rows = df[df['variation'] == 'Control']
    test_rows = df[df['variation'] == 'Test']

    # Build all four tables before anything below the title is rendered
    visit_control = calculate_within_visit_completion_rate_by_visit(control_rows)
    visit_test = calculate_within_visit_completion_rate_by_visit(test_rows)
    client_control = calculate_within_visit_completion_rate_by_client(control_rows)
    client_test = calculate_within_visit_completion_rate_by_client(test_rows)

    # (section heading, [(caption, table), ...]) in display order
    sections = [
        (
            "Completion Rate Based on visit_id",
            [
                ("Control Group Completion Rate (visit_id):", visit_control),
                ("Test Group Completion Rate (visit_id):", visit_test),
            ],
        ),
        (
            "Completion Rate Based on client_id",
            [
                ("Control Group Completion Rate (client_id):", client_control),
                ("Test Group Completion Rate (client_id):", client_test),
            ],
        ),
    ]

    for heading, tables in sections:
        st.subheader(heading)
        for caption, table in tables:
            st.write(caption)
            st.write(table)

# Function to perform a two-proportion z-test
def two_proportion_z_test(p1, p2, n1, n2):
    """Run a two-sided, pooled two-proportion z-test.

    Args:
        p1: Observed proportion in the first sample (0..1).
        p2: Observed proportion in the second sample (0..1).
        n1: Size of the first sample.
        n2: Size of the second sample.

    Returns:
        Tuple ``(z, p_value)``. Both are NaN when the statistic is undefined:
        a non-positive sample size, or a zero standard error (pooled
        proportion of exactly 0 or 1).
    """
    nan = float('nan')

    # Without observations in both samples there is nothing to compare
    if n1 <= 0 or n2 <= 0:
        return nan, nan

    # Pooled proportion under the null hypothesis p1 == p2
    pooled = (p1 * n1 + p2 * n2) / (n1 + n2)

    # Standard error of the difference in proportions
    std_error = (pooled * (1 - pooled) * (1 / n1 + 1 / n2)) ** 0.5

    # A pooled proportion of 0 or 1 gives zero variance, so z would be 0/0
    if std_error == 0:
        return nan, nan

    z = (p1 - p2) / std_error

    # Two-tailed p-value; the survival function keeps precision in the far
    # tail, where 1 - cdf(...) rounds to zero.
    p_value = float(2 * stats.norm.sf(abs(z)))

    return z, p_value

# Updated hypothesis testing function
def show_hypothesis_testing_page(df):
    """Render hypothesis tests comparing the Control and Test groups.

    Three sections are shown:

    1. A two-proportion z-test of the completion rate for each of the steps
       ``confirm``, ``step_1``, ``step_2`` and ``step_3``.
    2. A two-sample t-test (equal variances assumed) on ``clnt_tenure_yr``.
    3. A two-sample t-test (equal variances assumed) on ``clnt_age``.

    The per-step rates are taken from the ``completion_rate`` and
    ``started_visits`` columns when *df* already carries them. Otherwise they
    are derived from the event rows with
    ``calculate_within_visit_completion_rate_by_visit``, which is the function
    in this file that produces those two columns.

    Args:
        df: DataFrame with ``variation``, ``process_step``, ``client_id``,
            ``clnt_tenure_yr`` and ``clnt_age`` columns, plus either the
            pre-computed rate columns or ``visit_id``.
    """
    st.title("Hypothesis Testing for Completion Rates")

    # Separate the control and test groups
    control_group = df[df['variation'] == 'Control']
    test_group = df[df['variation'] == 'Test']

    # Use pre-computed rates when present; otherwise build them from the events
    if {'completion_rate', 'started_visits'}.issubset(df.columns):
        control_rates, test_rates = control_group, test_group
    else:
        control_rates = calculate_within_visit_completion_rate_by_visit(control_group)
        test_rates = calculate_within_visit_completion_rate_by_visit(test_group)

    steps = ['confirm', 'step_1', 'step_2', 'step_3']

    # Displaying the results for each step
    for step in steps:
        st.subheader(f"Step: {step}")

        control_row = control_rates[control_rates['process_step'] == step]
        test_row = test_rates[test_rates['process_step'] == step]

        # A step missing from either group cannot be compared; skip it
        # instead of failing on an empty lookup.
        if control_row.empty or test_row.empty:
            st.warning(f"No completion data for step: {step} in one or both groups.")
            continue

        # Completion rates are stored as percentages; the z-test needs proportions
        p_control = control_row['completion_rate'].values[0] / 100
        p_test = test_row['completion_rate'].values[0] / 100

        control_total = control_row['started_visits'].values[0]
        test_total = test_row['started_visits'].values[0]

        # Perform the two-proportion z-test
        z_stat, p_value = two_proportion_z_test(p_control, p_test, control_total, test_total)

        # Displaying the results
        st.write(f"Z-statistic: {z_stat:.4f}")
        st.write(f"P-value: {p_value:.4f}")

        # Hypothesis testing interpretation
        if p_value < 0.05:
            st.write(f"**Reject the null hypothesis**: There is a significant difference in completion rates between control and test group for step: {step}.")
        else:
            st.write(f"**Fail to reject the null hypothesis**: There is no significant difference in completion rates between control and test group for step: {step}.")
        st.write("\n")

    # ----------------------------------------
    # Additional Hypothesis Testing: Tenure
    st.subheader("Hypothesis Test: Tenure")

    # One row per client so clients with many events are not over-weighted
    control_unique = control_group.drop_duplicates(subset='client_id')
    test_unique = test_group.drop_duplicates(subset='client_id')

    # Drop missing values: pandas' mean() skips them, but the t-test would
    # otherwise return NaN.
    control_tenure = control_unique['clnt_tenure_yr'].dropna()
    test_tenure = test_unique['clnt_tenure_yr'].dropna()

    # Two-sample t-test for tenure, assuming equal variance
    t_stat_tenure, p_value_tenure = stats.ttest_ind(control_tenure, test_tenure, equal_var=True)

    # Display the results
    st.write(f"Average Tenure in Control group: {control_tenure.mean():.2f} years")
    st.write(f"Average Tenure in Test group: {test_tenure.mean():.2f} years")
    st.write(f"T-statistic: {t_stat_tenure:.4f}")
    st.write(f"P-value: {p_value_tenure:.4f}")

    # Is there a significant difference in tenure between the two groups?
    if p_value_tenure < 0.05:
        st.write("**Reject the null hypothesis**: The average tenure is significantly different between the Test and Control groups.")
    else:
        st.write("**Fail to reject the null hypothesis**: The average tenure is not significantly different between the Test and Control groups.")

    st.write("\n")

    # ----------------------------------------
    # Additional Hypothesis Testing: Age
    st.subheader("Hypothesis Test: Age")

    control_age = control_unique['clnt_age'].dropna()
    test_age = test_unique['clnt_age'].dropna()

    # Two-sample t-test for age, assuming equal variance
    t_stat_age, p_value_age = stats.ttest_ind(control_age, test_age, equal_var=True)

    # Display the results
    st.write(f"Control Group Mean Age: {control_age.mean():.2f} years")
    st.write(f"Test Group Mean Age: {test_age.mean():.2f} years")
    st.write(f"T-statistic: {t_stat_age:.4f}")
    st.write(f"P-value: {p_value_age:.4f}")

    # Is there a significant difference in age between the two groups?
    if p_value_age < 0.05:
        st.write("**Reject the null hypothesis**: The average age is different between the Test and Control groups.")
    else:
        st.write("**Fail to reject the null hypothesis**: The average age is not significantly different between the Test and Control groups.")

    st.write("\n")

# Function to show completion rate with a cost-effectiveness threshold
def show_cost_effectiveness_page(df, cost_effectiveness_threshold=5):
    """Render the completion-rate comparison against a cost-effectiveness threshold.

    The page shows the mean per-step completion rate of the Control and Test
    groups, a two-sided two-sample t-test on those rates, and whether the
    Test-minus-Control difference reaches *cost_effectiveness_threshold*.

    The rates are read from a ``completion_rate`` column when *df* already
    has one. Otherwise they are derived from the event rows with
    ``calculate_within_visit_completion_rate_by_visit``, which is the function
    in this file that produces that column.

    Args:
        df: DataFrame with ``variation`` and either a pre-computed
            ``completion_rate`` column or the event-level ``process_step``
            and ``visit_id`` columns.
        cost_effectiveness_threshold: Minimum increase in completion rate
            that justifies the new design. Defaults to 5.
    """
    st.title("Completion Rate with a Cost-Effectiveness Threshold")

    # Separate the control and test groups
    control_group = df[df['variation'] == 'Control']
    test_group = df[df['variation'] == 'Test']

    # Use pre-computed rates when present; otherwise build them from the events
    if 'completion_rate' in df.columns:
        control_rates = control_group['completion_rate']
        test_rates = test_group['completion_rate']
    else:
        control_rates = calculate_within_visit_completion_rate_by_visit(control_group)['completion_rate']
        test_rates = calculate_within_visit_completion_rate_by_visit(test_group)['completion_rate']

    # Average completion rate of each group across all steps
    control_mean = control_rates.mean()
    test_mean = test_rates.mean()

    # Two-sample t-test: are the completion rates significantly different?
    t_stat, p_value = ttest_ind(control_rates, test_rates, alternative='two-sided')

    # Display the completion rate analysis and t-test results
    st.subheader("Average Completion Rates")
    st.write(f"Control Group Average Completion Rate: {control_mean:.2f}%")
    st.write(f"Test Group Average Completion Rate: {test_mean:.2f}%")

    st.subheader("T-Test Results")
    st.write(f"T-statistic: {t_stat:.4f}")
    st.write(f"P-value: {p_value:.4f}")

    # Hypothesis test interpretation
    alpha = 0.05  # Significance level
    if p_value < alpha:
        st.write("**Reject the null hypothesis**: The completion rates are significantly different between the Test and Control groups.")
    else:
        st.write("**Fail to reject the null hypothesis**: The completion rates are not significantly different between the Test and Control groups.")

    # Difference between the two group means
    completion_rate_increase = test_mean - control_mean
    st.subheader("Cost-Effectiveness Analysis")
    st.write(f"Completion Rate Increase: {completion_rate_increase:.2f}%")

    # Compare against the threshold; the message quotes the value actually used
    if completion_rate_increase >= cost_effectiveness_threshold:
        st.write(f"The completion rate increase meets the {cost_effectiveness_threshold}% threshold, justifying the cost of the new design.")
    else:
        st.write(f"The completion rate increase does not meet the {cost_effectiveness_threshold}% threshold. The new design may not justify its cost.")

# Sidebar navigation. Every page handled by the page logic further down has to
# be listed here, otherwise it can never be selected.
app_mode = st.sidebar.selectbox(
    "Choose a page",
    [
        "Home",
        "About",
        "Completion Rate Analysis",
        "Hypothesis Testing",
        "Cost-Effectiveness Analysis",
        "Unique Values in Categorical Columns",
        "Basic Statistics",
        "Demographics Analysis",
        "Contact",
    ],
)

# Handle file upload only on the Home page
if app_mode == "Home":
    st.write("This is the home page of the A/B Test demo.")

    # File uploader widget (only on Home page)
    uploaded_file = st.file_uploader("Upload a CSV file for analysis", type="csv")

    if uploaded_file is not None:
        try:
            # Read the uploaded file and keep it in session state so the other
            # pages can use it after the user navigates away from Home.
            df = pd.read_csv(uploaded_file)
            st.session_state.df = df
            st.write(f"Data loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")
        except Exception as e:
            # UI boundary: report any read/parse failure to the user instead of crashing
            st.error(f"Error reading the file: {e}")
    elif 'df' in st.session_state:
        st.write(f"Data already uploaded with {st.session_state.df.shape[0]} rows and {st.session_state.df.shape[1]} columns.")
    else:
        st.warning("Please upload a CSV file for analysis.")

# Retrieve the uploaded DataFrame for the other pages; None means nothing has
# been uploaded in this session yet.
df = st.session_state.df if 'df' in st.session_state else None

# Page logic (with file upload check)
def _render_data_page(render_page, data):
    """Call ``render_page(data)``, or ask for an upload when *data* is None."""
    if data is not None:
        render_page(data)
    else:
        st.error("Please upload a file on the Home page first.")


if app_mode == "Basic Statistics":
    _render_data_page(show_basic_statistics, df)

elif app_mode == "Demographics Analysis":
    _render_data_page(show_demographics_analysis, df)

elif app_mode == "Completion Rate Analysis":
    _render_data_page(show_completion_rate_page, df)

elif app_mode == "Hypothesis Testing":
    _render_data_page(show_hypothesis_testing_page, df)

elif app_mode == "Cost-Effectiveness Analysis":
    _render_data_page(show_cost_effectiveness_page, df)

# This page is offered in the sidebar but had no handler in this chain.
# NOTE(review): if a later branch already handles this page, keep only one.
elif app_mode == "Unique Values in Categorical Columns":
    _render_data_page(show_unique_values_in_categorical_columns, df)

elif app_mode == "About":
    st.subheader("The Digital Challenge")
    st.write("""
        The digital world is evolving, and so are Vanguard’s clients. Vanguard believed that a more intuitive and modern User Interface (UI), 
        coupled with timely in-context prompts (cues, messages, hints, or instructions provided to users directly within the context of their 
        current task or action), could make the online process smoother for clients. The critical question was: 
        Would these changes encourage more clients to complete the process?
    """)