In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families.family import NegativeBinomial
import tkinter as tk
from tkinter import filedialog
from scipy import stats
from docx import Document
from docx.shared import Inches
from io import BytesIO
import statsmodels.discrete.discrete_model as discrete
from statsmodels.regression.mixed_linear_model import MixedLM

In [None]:
# Multilevel Analysis
# Multilevel mixed effects model with all variables

# Heterogeneity check: chart the mean capped length of stay per state.
# The encoded state column is preferred; the raw 'us_state' column is the fallback.
if any(col in df_clean_nb.columns for col in ('us_state_enc', 'us_state')):
    state_var = 'us_state_enc' if 'us_state_enc' in df_clean_nb.columns else 'us_state'
    state_means = (
        df_clean_nb
        .groupby(state_var)['los_capped']
        .mean()
        .sort_values()
    )

    # Render the bar chart into an in-memory PNG so it can be embedded in the report.
    los_by_state_img = BytesIO()
    plt.figure(figsize=(12, 8))
    state_means.plot.bar()
    plt.title('Mean Length of Stay by State')
    plt.ylabel('Average Length of Stay')
    plt.xlabel('State')
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.savefig(los_by_state_img, format='png')
    plt.close()
    los_by_state_img.seek(0)  # rewind so python-docx reads the image from the start

    # Add state analysis to document
    doc.add_paragraph('\n')
    doc.add_heading('State Analysis', level=2)
    doc.add_picture(los_by_state_img, width=Inches(6))
    doc.add_paragraph('Figure 4: Mean Length of Stay by State')

# Approximated Multilevel Model
doc.add_paragraph('\n')
doc.add_heading('Approximated Multilevel Model', level=1)
doc.add_paragraph('Using MixedLM to approximate a multilevel model with random effects for states.')

# Debugging: Check df_clean_nb and the condition
print("Columns in df_clean_nb:", df_clean_nb.columns.tolist())
print("Checking if 'us_state_enc' is in df_clean_nb.columns:", 'us_state_enc' in df_clean_nb.columns)

# The random-intercept model needs the encoded state column as its grouping variable.
if 'us_state_enc' in df_clean_nb.columns:

    # Complete-case data: drop every row with a missing value in any model variable,
    # then reset the index so y, X and groups stay positionally aligned.
    model_vars = ['los_capped'] + continuous_vars + categorical_model_vars
    df_clean_model = df_clean_nb[model_vars].dropna()
    df_clean_model = df_clean_model.reset_index(drop=True)

    # Snapshot taken before diagnostic columns are appended further down.
    df_clean_model_old = df_clean_model.copy()

    y = df_clean_model['los_capped']

    # Fixed-effects design matrix: continuous predictors first ...
    X_vars = [var for var in continuous_vars if var in df_clean_model.columns]
    X = df_clean_model[X_vars].copy()

    # ... then one-hot encoded categoricals (first level dropped as the reference).
    for var in categorical_model_vars:
        if var in df_clean_model.columns and var != 'us_state_enc':  # Exclude the grouping variable
            dummies = pd.get_dummies(df_clean_model[var], prefix=var, drop_first=True)
            X = pd.concat([X, dummies], axis=1)

    # Add intercept
    X = sm.add_constant(X)

    # get_dummies can yield bool/uint8 columns; the fit needs a purely numeric matrix.
    X = X.astype(float)
    print("X dtypes after converting to float:")
    print(X.dtypes)

    # Define groups for random effects
    groups = df_clean_model['us_state_enc']

    print(f"Length of df_clean_model: {len(df_clean_model)}")
    print(f"Length of y: {len(y)}")
    print(f"Number of rows in X: {X.shape[0]}")
    print(f"Length of groups: {len(groups)}")

    if len(y) != X.shape[0] or len(y) != len(groups):
        print('Length mismatch between y, X, and groups. Check data preparation.')
        print(f"y length:{len(y)}")
        print(f"X rows: {X.shape[0]}")
        print(f"groups length: {len(groups)}")
        # Check for NaN values in X
        print("NaN counts in X columns:")
        print(X.isnull().sum())
        raise ValueError("Lengths of y, X, and groups do not match!")

    # Fit mixed effects model (random intercept per state; statsmodels defaults to REML)
    mixed_model = MixedLM(y, X, groups)
    try:
        mixed_results = mixed_model.fit()
        mixed_summary = str(mixed_results.summary())
        print("\nApproximated Multilevel Model Results (with logged values):")
        print(mixed_summary)

        # Add to document
        doc.add_paragraph('Model Summary:')
        summary_paragraph = doc.add_paragraph()
        summary_run = summary_paragraph.add_run(mixed_summary)
        summary_run.font.name = 'Courier New'  # Use monospace font

        # Variance components and intraclass correlation coefficient (ICC).
        # NOTE: this rebinds `state_var`, which earlier held the state column name.
        state_var = mixed_results.cov_re.iloc[0, 0]
        residual_var = mixed_results.scale
        icc = state_var / (state_var + residual_var)

        doc.add_paragraph('\nVariance Components:')
        vc_table = doc.add_table(rows=3, cols=2)
        vc_table.style = 'Table Grid'
        vc_table.cell(0, 0).text = 'Component'
        vc_table.cell(0, 1).text = 'Estimate'
        vc_table.cell(1, 0).text = 'State Random Effect Variance'
        vc_table.cell(1, 1).text = f"{state_var:.4f}"
        vc_table.cell(2, 0).text = 'Residual Variance'
        vc_table.cell(2, 1).text = f"{residual_var:.4f}"

        # Exponentiated coefficients.
        # NOTE(review): the outcome here is the untransformed los_capped, so these are
        # not incidence rate ratios in the usual log-link sense; the table also
        # includes the 'Group Var' row from params. Confirm this is what is intended.
        print("\nIncident Rate Ratios (IRR) Mixed Model:")
        irr_mixed = np.exp(mixed_results.params)
        irr_conf_mixed = np.exp(mixed_results.conf_int())
        irr_df_mixed = pd.DataFrame({'IRR': irr_mixed, 'Lower CI': irr_conf_mixed[0], 'Upper CI': irr_conf_mixed[1],
                                     'P-value': mixed_results.pvalues})
        print(irr_df_mixed)

        # Add IRR table to document
        doc.add_paragraph('\n')
        doc.add_heading('Incident Rate Ratios (IRR) Mixed Model', level=2)
        irr_table_mixed = doc.add_table(rows=len(irr_df_mixed)+1, cols=5)
        irr_table_mixed.style = 'Table Grid'
        for col_idx, header in enumerate(['Variable', 'IRR', 'Lower CI', 'Upper CI', 'P-value']):
            irr_table_mixed.cell(0, col_idx).text = header

        for i, (var, row) in enumerate(irr_df_mixed.iterrows(), 1):
            irr_table_mixed.cell(i, 0).text = str(var)
            irr_table_mixed.cell(i, 1).text = f"{row['IRR']:.4f}"
            irr_table_mixed.cell(i, 2).text = f"{row['Lower CI']:.4f}"
            irr_table_mixed.cell(i, 3).text = f"{row['Upper CI']:.4f}"
            irr_table_mixed.cell(i, 4).text = f"{row['P-value']:.4f}"

        doc.add_paragraph(f'\nIntraclass Correlation Coefficient (ICC): {icc:.4f}')
        doc.add_paragraph('The ICC represents the proportion of the total variance in length of stay ' +
                            'that is attributable to differences between states.')

        # Post-estimation diagnostics: residuals and fitted values
        df_clean_model['fitted'] = mixed_results.fittedvalues
        df_clean_model['residuals'] = mixed_results.resid

        # Plot residuals vs fitted values
        plt.figure(figsize=(10, 6))
        plt.scatter(df_clean_model['fitted'], df_clean_model['residuals'], alpha=0.5)
        plt.axhline(y=0, color='r', linestyle='-')
        plt.xlabel('Fitted Values')
        plt.ylabel('Residuals')
        plt.title('Residuals vs Fitted Values')
        plt.tight_layout()
        residuals_vs_fitted_img = BytesIO()
        plt.savefig(residuals_vs_fitted_img, format='png')
        residuals_vs_fitted_img.seek(0)
        plt.close()

        # Add to document
        # NOTE(review): the state bar chart is also captioned 'Figure 4'; the figure
        # numbering across the report should be reconciled.
        doc.add_paragraph('\n')
        doc.add_heading('Post-Estimation Diagnostics', level=2)
        doc.add_picture(residuals_vs_fitted_img, width=Inches(6))
        doc.add_paragraph('Figure 4: Residuals vs Fitted Values')

        # Q-Q plot for normality
        plt.figure(figsize=(10, 6))
        stats.probplot(df_clean_model['residuals'], dist="norm", plot=plt)
        plt.title('Q-Q Plot of Residuals')
        plt.tight_layout()
        qq_plot_img = BytesIO()
        plt.savefig(qq_plot_img, format='png')
        qq_plot_img.seek(0)
        plt.close()

        # Add to document
        doc.add_picture(qq_plot_img, width=Inches(6))
        doc.add_paragraph('Figure 5: Q-Q Plot of Residuals')

        # Likelihood-ratio test for the state random intercept.
        # Both models must use the same rows and the same fixed effects, and both
        # log-likelihoods must be ML (REML likelihoods are not comparable with OLS),
        # so the OLS baseline is fit on (y, X) and the mixed model is refit with reml=False.
        ols_model = sm.OLS(y, X)
        ols_results = ols_model.fit()
        mixed_results_ml = MixedLM(y, X, groups).fit(reml=False)

        # Clamp at zero: numerical noise can make the statistic marginally negative.
        lr_stat = max(0.0, -2 * (ols_results.llf - mixed_results_ml.llf))
        # df=1 for one random effect; the variance is tested on its boundary, so this
        # chi-square(1) p-value is conservative.
        p_value = stats.chi2.sf(lr_stat, df=1)
        doc.add_paragraph(f'\nLikelihood Ratio Test for Random Effects: Statistic = {lr_stat:.2f}, P-value = {p_value:.4f}')

        # Pseudo-R² (McFadden's R² approximation).
        # The null and full models differ in their fixed effects, so ML fits are used
        # for both sides of the ratio.
        null_model = smf.mixedlm("los_capped ~ 1", df_clean_model, groups=df_clean_model['us_state_enc'])
        null_results = null_model.fit(reml=False)
        pseudo_r2 = 1 - (mixed_results_ml.llf / null_results.llf)
        doc.add_paragraph(f'\nPseudo-R² (McFadden): {pseudo_r2:.4f}')

        # Variance inflation factors.
        # The auxiliary regressions are run against the full design matrix, intercept
        # included; only the intercept's own row is left out of the report.
        from statsmodels.stats.outliers_influence import variance_inflation_factor
        X_continuous = X[[col for col in X.columns if col != 'const']]  # predictors reported below
        vif_design = X.values
        vif_data = pd.DataFrame()
        vif_data["Variable"] = X_continuous.columns
        vif_data["VIF"] = [
            variance_inflation_factor(vif_design, col_idx)
            for col_idx, col in enumerate(X.columns)
            if col != 'const'
        ]
        print("VIF for continuous and dummy variables:")
        print(vif_data)
        doc.add_paragraph('\nVariance Inflation Factor (VIF) for Continuous and Categorical Variables:')
        vif_table = doc.add_table(rows=len(vif_data)+1, cols=2)
        vif_table.style = 'Table Grid'
        vif_table.cell(0, 0).text = 'Variable'
        vif_table.cell(0, 1).text = 'VIF'
        for i, (var, vif) in enumerate(zip(vif_data["Variable"], vif_data["VIF"]), 1):
            vif_table.cell(i, 0).text = str(var)
            vif_table.cell(i, 1).text = f"{vif:.4f}"
        doc.add_paragraph('VIF values above 10 indicate potential multicollinearity issues.')
    except ValueError as ve:
        error_msg = f"ValueError in mixed model fitting: {str(ve)}"
        print(error_msg)
        doc.add_paragraph(error_msg)
        doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                            "insufficient variation in the grouping variable or other model specification issues.")
    except RuntimeError as rte:
        # Bound as `rte`, not `re`: the bound name is deleted when the handler ends,
        # which would unbind a module-level `re` import if the notebook has one.
        error_msg = f"RuntimeError in mixed model fitting: {str(rte)}"
        print(error_msg)
        doc.add_paragraph(error_msg)
        doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                            "insufficient variation in the grouping variable or other model specification issues.")
    except Exception as e:
        # Any failure in the reporting steps above is also recorded in the document
        # rather than aborting the notebook.
        error_msg = f"Error fitting mixed model: {str(e)}"
        print(error_msg)
        doc.add_paragraph(error_msg)
        doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                            "insufficient variation in the grouping variable or other model specification issues.")
    
   

In [None]:
# Second mixed effects model: a simplified version of the full model.
# It reuses y, X and groups built for the full model above.

# Start from the full fixed-effects design matrix and drop the listed predictors
# to reduce multicollinearity; errors='ignore' tolerates columns that are absent.
X_simplified = X.copy()
X_simplified = X_simplified.drop(columns=['import_from_slu_log', 'immigrant_population_log', 'state_unemployment'], errors='ignore')

# Fit mixed effects model with fewer variables (statsmodels defaults to REML)
mixed_model_simple = MixedLM(y, X_simplified, groups)
try:
    mixed_results_simple = mixed_model_simple.fit()
    mixed_summary_simple = str(mixed_results_simple.summary())
    print("\nApproximated Multilevel Model Results (with logged values):")
    print(mixed_summary_simple)

    # Add to document
    doc.add_paragraph('Model Summary with fewer variables:')
    summary_paragraph_simple = doc.add_paragraph()
    summary_run_simple = summary_paragraph_simple.add_run(mixed_summary_simple)
    summary_run_simple.font.name = 'Courier New'  # Use monospace font

    # Variance components and intraclass correlation coefficient (ICC)
    state_var_simple = mixed_results_simple.cov_re.iloc[0, 0]
    residual_var_simple = mixed_results_simple.scale
    icc_simple = state_var_simple / (state_var_simple + residual_var_simple)

    doc.add_paragraph('\nVariance Components with fewer variables:')
    vc_table = doc.add_table(rows=3, cols=2)
    vc_table.style = 'Table Grid'
    vc_table.cell(0, 0).text = 'Component'
    vc_table.cell(0, 1).text = 'Estimate'
    vc_table.cell(1, 0).text = 'State Random Effect Variance'
    vc_table.cell(1, 1).text = f"{state_var_simple:.4f}"
    vc_table.cell(2, 0).text = 'Residual Variance'
    vc_table.cell(2, 1).text = f"{residual_var_simple:.4f}"

    # Exponentiated coefficients.
    # NOTE(review): the outcome is the untransformed los_capped, so these are not
    # incidence rate ratios in the usual log-link sense; confirm the label.
    print("\nIncident Rate Ratios (IRR) Mixed Model:")
    irr_mixed_simple = np.exp(mixed_results_simple.params)
    irr_conf_mixed_simple = np.exp(mixed_results_simple.conf_int())
    irr_df_mixed_simple = pd.DataFrame({'IRR': irr_mixed_simple, 'Lower CI': irr_conf_mixed_simple[0], 'Upper CI': irr_conf_mixed_simple[1],
                                        'P-value': mixed_results_simple.pvalues})
    print(irr_df_mixed_simple)

    # Add IRR table to document
    doc.add_paragraph('\n')
    doc.add_heading('Incident Rate Ratios (IRR) Mixed Model', level=2)
    irr_table_mixed = doc.add_table(rows=len(irr_df_mixed_simple)+1, cols=5)
    irr_table_mixed.style = 'Table Grid'
    for col_idx, header in enumerate(['Variable', 'IRR', 'Lower CI', 'Upper CI', 'P-value']):
        irr_table_mixed.cell(0, col_idx).text = header

    for i, (var, row) in enumerate(irr_df_mixed_simple.iterrows(), 1):
        irr_table_mixed.cell(i, 0).text = str(var)
        irr_table_mixed.cell(i, 1).text = f"{row['IRR']:.4f}"
        irr_table_mixed.cell(i, 2).text = f"{row['Lower CI']:.4f}"
        irr_table_mixed.cell(i, 3).text = f"{row['Upper CI']:.4f}"
        irr_table_mixed.cell(i, 4).text = f"{row['P-value']:.4f}"

    # Report this model's own ICC (previously the full model's `icc` was printed here).
    doc.add_paragraph(f'\nIntraclass Correlation Coefficient (ICC): {icc_simple:.4f}')
    doc.add_paragraph('The ICC represents the proportion of the total variance in length of stay ' +
                        'that is attributable to differences between states.')

    # Post-estimation diagnostics: residuals and fitted values
    df_clean_model['fitted_simple'] = mixed_results_simple.fittedvalues
    df_clean_model['residuals_simple'] = mixed_results_simple.resid

    # Plot residuals vs fitted values
    plt.figure(figsize=(10, 6))
    plt.scatter(df_clean_model['fitted_simple'], df_clean_model['residuals_simple'], alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Fitted Values')
    plt.tight_layout()
    residuals_vs_fitted_img_simple = BytesIO()
    plt.savefig(residuals_vs_fitted_img_simple, format='png')
    residuals_vs_fitted_img_simple.seek(0)
    plt.close()

    # Add to document
    doc.add_paragraph('\n')
    doc.add_heading('Post-Estimation Diagnostics', level=2)
    doc.add_picture(residuals_vs_fitted_img_simple, width=Inches(6))
    doc.add_paragraph('Figure 5: Residuals vs Fitted Values for Simpler Model')

    # Q-Q plot for normality
    plt.figure(figsize=(10, 6))
    stats.probplot(df_clean_model['residuals_simple'], dist="norm", plot=plt)
    plt.title('Q-Q Plot of Residuals Simpler Model')
    plt.tight_layout()
    qq_plot_img_simple = BytesIO()
    plt.savefig(qq_plot_img_simple, format='png')
    qq_plot_img_simple.seek(0)
    plt.close()

    # Add to document
    doc.add_picture(qq_plot_img_simple, width=Inches(6))
    doc.add_paragraph('Figure 6: Q-Q Plot of Residuals with fewer variables')

    # Likelihood-ratio test for the state random intercept.
    # The baseline must share this model's rows and fixed effects, and both
    # log-likelihoods must be ML, so OLS is fit on (y, X_simplified) and the mixed
    # model is refit with reml=False.
    ols_model = sm.OLS(y, X_simplified)
    ols_results = ols_model.fit()
    mixed_results_simple_ml = MixedLM(y, X_simplified, groups).fit(reml=False)

    # Clamp at zero: numerical noise can make the statistic marginally negative.
    lr_stat_simple = max(0.0, -2 * (ols_results.llf - mixed_results_simple_ml.llf))
    # df=1 for one random effect; the variance is tested on its boundary, so this
    # chi-square(1) p-value is conservative.
    p_value_simple = stats.chi2.sf(lr_stat_simple, df=1)
    doc.add_paragraph(f'\nLikelihood Ratio Test for Random Effects with Simple Model: Statistic = {lr_stat_simple:.2f}, P-value = {p_value_simple:.4f}')

    # Pseudo-R² (McFadden's R² approximation).
    # The null and fitted models differ in their fixed effects, so ML fits are used
    # for both sides of the ratio.
    null_model_simple = smf.mixedlm("los_capped ~ 1", df_clean_model, groups=df_clean_model['us_state_enc'])
    null_results_simple = null_model_simple.fit(reml=False)
    pseudo_r2_simple = 1 - (mixed_results_simple_ml.llf / null_results_simple.llf)
    doc.add_paragraph(f'\nPseudo-R² (McFadden) fewer variables: {pseudo_r2_simple:.4f}')

    # Variance inflation factors.
    # The auxiliary regressions are run against the full design matrix, intercept
    # included; only the intercept's own row is left out of the report.
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    X_continuous_simple = X_simplified[[col for col in X_simplified.columns if col != 'const']]  # predictors reported below
    vif_design_simple = X_simplified.values
    vif_data_simple = pd.DataFrame()
    vif_data_simple["Variable"] = X_continuous_simple.columns
    vif_data_simple["VIF"] = [
        variance_inflation_factor(vif_design_simple, col_idx)
        for col_idx, col in enumerate(X_simplified.columns)
        if col != 'const'
    ]
    print("VIF for continuous and dummy variables with fewer variables:")
    print(vif_data_simple)
    doc.add_paragraph('\nVariance Inflation Factor (VIF) for Continuous and Categorical Variables with fewer variables:')
    vif_table = doc.add_table(rows=len(vif_data_simple)+1, cols=2)
    vif_table.style = 'Table Grid'
    vif_table.cell(0, 0).text = 'Variable'
    vif_table.cell(0, 1).text = 'VIF'
    for i, (var, vif) in enumerate(zip(vif_data_simple["Variable"], vif_data_simple["VIF"]), 1):
        vif_table.cell(i, 0).text = str(var)
        vif_table.cell(i, 1).text = f"{vif:.4f}"
    doc.add_paragraph('VIF values above 10 indicate potential multicollinearity issues.')
except ValueError as ve:
    # Only ValueError is handled for this model; any other exception propagates.
    error_msg = f"ValueError in mixed model fitting: {str(ve)}"
    print(error_msg)
    doc.add_paragraph(error_msg)
    doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                        "insufficient variation in the grouping variable or other model specification issues.")

# Second mixed effects model ends here



In [None]:
# Third mixed effects model, with a log-transformed outcome.
# The log transformation is intended to address heteroscedasticity and improve model fit.

# Step 1: log-transform length of stay where it is positive; non-positive values map to 0.
# Substituting 1 before taking the log gives the same result as
# np.where(los > 0, np.log(los), 0) without evaluating log on non-positive values
# (which emitted runtime warnings). Note that a stay of exactly 1 also maps to 0.
df_clean_model['log_los_capped'] = np.log(
    df_clean_model['los_capped'].where(df_clean_model['los_capped'] > 0, 1)
)

# New response and fixed-effects design matrix for the log model
y_log = df_clean_model['log_los_capped']
X_log = X.copy()

# Centred polynomial terms up to 3rd degree for age and log distance.
# These are powers of the mean-centred variable, not orthogonal polynomials;
# the '_orth' suffix is kept only because the column names are used below.
X_log['age_orth1'] = X_log['age'] - X_log['age'].mean()
X_log['age_orth2'] = X_log['age_orth1']**2
X_log['age_orth3'] = X_log['age_orth1']**3
X_log['log_distance_orth1'] = X_log['distance_miles_log'] - X_log['distance_miles_log'].mean()
X_log['log_distance_orth2'] = X_log['log_distance_orth1']**2
X_log['log_distance_orth3'] = X_log['log_distance_orth1']**3

# Interaction terms
# purpose_simple_2.0 (Events) with the summer month (season_enc_7)
X_log['purpose_2_X_month_7'] = X_log['purpose_simple_2.0'] * X_log['season_enc_7']

# Events in December
X_log['purpose_2_X_month_12'] = X_log['purpose_simple_2.0'] * X_log['season_enc_12']

# Purpose 5.0 (Other) with month 8 (which has the most negative coefficient)
X_log['purpose_5_X_month_8'] = X_log['purpose_simple_5.0'] * X_log['season_enc_8']

X_log['age_X_purpose2'] = X_log['age_orth1'] * X_log['purpose_simple_2.0']
X_log['age_X_purpose5'] = X_log['age_orth1'] * X_log['purpose_simple_5.0']
X_log['distance_X_purpose2'] = X_log['log_distance_orth1'] * X_log['purpose_simple_2.0']
X_log['distance_X_purpose5'] = X_log['log_distance_orth1'] * X_log['purpose_simple_5.0']

# VIF and regression results show that some interaction terms are not significant and
# other variables are not needed. Candidate columns to drop (currently all retained):
#   'age_orth1', 'log_state_unemployment', 'distance_X_purpose5', 'age_X_purpose5',
#   'log_import_from_slu', 'employment_status_enc_1', 'employment_status_enc_2'

# Fit mixed effects model with log-transformed outcome (statsmodels defaults to REML)
mixed_model_log = MixedLM(y_log, X_log, groups)
try:
    mixed_results_log = mixed_model_log.fit()
    mixed_summary_log = str(mixed_results_log.summary())
    print("\nApproximated Multilevel Model Results With Log Transformations:")
    print(mixed_summary_log)

    # Add to document
    doc.add_paragraph('Model Summary With Log Transformations:')
    summary_paragraph_log = doc.add_paragraph()
    summary_run_log = summary_paragraph_log.add_run(mixed_summary_log)
    summary_run_log.font.name = 'Courier New'  # Use monospace font

    # Variance components and intraclass correlation coefficient (ICC)
    state_var_log = mixed_results_log.cov_re.iloc[0, 0]
    residual_var_log = mixed_results_log.scale
    icc_log = state_var_log / (state_var_log + residual_var_log)

    doc.add_paragraph('\nVariance Components With Log Transformations:')
    vc_table = doc.add_table(rows=3, cols=2)
    vc_table.style = 'Table Grid'
    vc_table.cell(0, 0).text = 'Component'
    vc_table.cell(0, 1).text = 'Estimate'
    vc_table.cell(1, 0).text = 'State Random Effect Variance'
    vc_table.cell(1, 1).text = f"{state_var_log:.4f}"
    vc_table.cell(2, 0).text = 'Residual Variance'
    vc_table.cell(2, 1).text = f"{residual_var_log:.4f}"

    # Exponentiated coefficients: with a log outcome these act as multiplicative
    # effects on length of stay (reported under the 'IRR' label).
    print("\nIncident Rate Ratios (IRR) Mixed Model With Log Transformations:")
    irr_mixed_log = np.exp(mixed_results_log.params)
    irr_conf_mixed_log = np.exp(mixed_results_log.conf_int())
    irr_df_mixed_log = pd.DataFrame({'IRR': irr_mixed_log, 'Lower CI': irr_conf_mixed_log[0], 'Upper CI': irr_conf_mixed_log[1],
                                     'P-value': mixed_results_log.pvalues})
    print(irr_df_mixed_log)

    # Add IRR table to document
    doc.add_paragraph('\n')
    doc.add_heading('Incident Rate Ratios (IRR) Mixed Model With Log Transformations', level=2)
    irr_table_mixed = doc.add_table(rows=len(irr_df_mixed_log)+1, cols=5)
    irr_table_mixed.style = 'Table Grid'
    for col_idx, header in enumerate(['Variable', 'IRR', 'Lower CI', 'Upper CI', 'P-value']):
        irr_table_mixed.cell(0, col_idx).text = header

    for i, (var, row) in enumerate(irr_df_mixed_log.iterrows(), 1):
        irr_table_mixed.cell(i, 0).text = str(var)
        irr_table_mixed.cell(i, 1).text = f"{row['IRR']:.4f}"
        irr_table_mixed.cell(i, 2).text = f"{row['Lower CI']:.4f}"
        irr_table_mixed.cell(i, 3).text = f"{row['Upper CI']:.4f}"
        irr_table_mixed.cell(i, 4).text = f"{row['P-value']:.4f}"

    # Report this model's own ICC (previously the full model's `icc` was printed here).
    doc.add_paragraph(f'\nIntraclass Correlation Coefficient (ICC): {icc_log:.4f}')
    doc.add_paragraph('The ICC represents the proportion of the total variance in length of stay ' +
                        'that is attributable to differences between states.')

    # Post-estimation diagnostics: residuals and fitted values
    df_clean_model['fitted_log'] = mixed_results_log.fittedvalues
    df_clean_model['residuals_log'] = mixed_results_log.resid

    # Plot residuals vs fitted values
    plt.figure(figsize=(10, 6))
    plt.scatter(df_clean_model['fitted_log'], df_clean_model['residuals_log'], alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel('Fitted Values')
    plt.ylabel('Residuals')
    plt.title('Residuals vs Fitted Values')
    plt.tight_layout()
    residuals_vs_fitted_img_log = BytesIO()
    plt.savefig(residuals_vs_fitted_img_log, format='png')
    residuals_vs_fitted_img_log.seek(0)
    plt.close()

    # Add to document
    doc.add_paragraph('\n')
    doc.add_heading('Post-Estimation Diagnostics', level=2)
    doc.add_picture(residuals_vs_fitted_img_log, width=Inches(6))
    doc.add_paragraph('Figure 8: Residuals vs Fitted Values With Log Transformations')

    # Q-Q plot for normality
    plt.figure(figsize=(10, 6))
    stats.probplot(df_clean_model['residuals_log'], dist="norm", plot=plt)
    plt.title('Q-Q Plot of Residuals With Log Transformations')
    plt.tight_layout()
    qq_plot_img_log = BytesIO()
    plt.savefig(qq_plot_img_log, format='png')
    qq_plot_img_log.seek(0)
    plt.close()

    # Add to document
    doc.add_picture(qq_plot_img_log, width=Inches(6))
    doc.add_paragraph('Figure 9: Q-Q Plot of Residuals With Log Transformations')

    # Likelihood-ratio test for the state random intercept.
    # The baseline must model the same log outcome with the same fixed effects, and
    # both log-likelihoods must be ML, so OLS is fit on (y_log, X_log) and the mixed
    # model is refit with reml=False.
    ols_model = sm.OLS(y_log, X_log)
    ols_results = ols_model.fit()
    mixed_results_log_ml = MixedLM(y_log, X_log, groups).fit(reml=False)

    # Clamp at zero: numerical noise can make the statistic marginally negative.
    lr_stat_log = max(0.0, -2 * (ols_results.llf - mixed_results_log_ml.llf))
    # df=1 for one random effect; the variance is tested on its boundary, so this
    # chi-square(1) p-value is conservative.
    p_value_log = stats.chi2.sf(lr_stat_log, df=1)
    doc.add_paragraph(f'\nLikelihood Ratio Test for Random Effects with log Model: Statistic = {lr_stat_log:.2f}, P-value = {p_value_log:.4f}')

    # Pseudo-R² (McFadden's R² approximation).
    # The null and fitted models differ in their fixed effects, so ML fits are used
    # for both sides of the ratio.
    null_model_log = smf.mixedlm("log_los_capped ~ 1", df_clean_model, groups=df_clean_model['us_state_enc'])
    null_results_log = null_model_log.fit(reml=False)
    pseudo_r2_log = 1 - (mixed_results_log_ml.llf / null_results_log.llf)
    doc.add_paragraph(f'\nPseudo-R² (McFadden) With Log Transformations: {pseudo_r2_log:.4f}')

    # Variance inflation factors.
    # The auxiliary regressions are run against the full design matrix, intercept
    # included; only the intercept's own row is left out of the report.
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    X_continuous_log = X_log[[col for col in X_log.columns if col != 'const']]  # predictors reported below
    vif_design_log = X_log.values
    vif_data_log = pd.DataFrame()
    vif_data_log["Variable"] = X_continuous_log.columns
    vif_data_log["VIF"] = [
        variance_inflation_factor(vif_design_log, col_idx)
        for col_idx, col in enumerate(X_log.columns)
        if col != 'const'
    ]
    print("VIF for continuous and dummy variables With Log Transformations:")
    print(vif_data_log)
    doc.add_paragraph('\nVariance Inflation Factor (VIF) for Continuous and Categorical Variables With Log Transformations:')
    vif_table = doc.add_table(rows=len(vif_data_log)+1, cols=2)
    vif_table.style = 'Table Grid'
    vif_table.cell(0, 0).text = 'Variable'
    vif_table.cell(0, 1).text = 'VIF'
    for i, (var, vif) in enumerate(zip(vif_data_log["Variable"], vif_data_log["VIF"]), 1):
        vif_table.cell(i, 0).text = str(var)
        vif_table.cell(i, 1).text = f"{vif:.4f}"
    doc.add_paragraph('VIF values above 10 indicate potential multicollinearity issues.')
except ValueError as ve:
    error_msg = f"ValueError in mixed model fitting: {str(ve)}"
    print(error_msg)
    doc.add_paragraph(error_msg)
    doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                        "insufficient variation in the grouping variable or other model specification issues.")
except RuntimeError as rte:
    # Bound as `rte`, not `re`: the bound name is deleted when the handler ends,
    # which would unbind a module-level `re` import if the notebook has one.
    error_msg = f"RuntimeError in mixed model fitting: {str(rte)}"
    print(error_msg)
    doc.add_paragraph(error_msg)
    doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                        "insufficient variation in the grouping variable or other model specification issues.")
except Exception as e:
    # Last handler: a second, identical `except Exception` clause that followed
    # this one could never run and has been removed.
    error_msg = f"Error fitting mixed model: {str(e)}"
    print(error_msg)
    doc.add_paragraph(error_msg)
    doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                        "insufficient variation in the grouping variable or other model specification issues.")

# Log transform model ends here



