In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families.family import NegativeBinomial
import tkinter as tk
from tkinter import filedialog
from scipy import stats
from docx import Document
from docx.shared import Inches
from io import BytesIO
import statsmodels.discrete.discrete_model as discrete
from statsmodels.regression.mixed_linear_model import MixedLM

def select_file(title, file_types, save=False):
    """Allow user to select a file"""
    root = tk.Tk()
    root.withdraw()
    root.attributes('-topmost', True)
    
    try:
        if save:
            file_path = filedialog.asksaveasfilename(
                title=title,
                filetypes=file_types,
                defaultextension=file_types[0][1]
            )
        else:
            file_path = filedialog.askopenfilename(
                title=title,
                filetypes=file_types
            )
    finally:
        root.destroy()
    
    return file_path if file_path else None

# Allow user to select input file
print("Please select the input Excel file...")
file_path = select_file(
    "Select Excel Data File", 
    [("Excel files", "*.xlsx *.xls"), ("All files", "*.*")]
)

if not file_path:
    print("No file selected. Exiting.")
    exit()

# Import the data
print(f"Loading data from: {file_path}")
df = pd.read_excel(file_path, sheet_name="Sheet")

# Setup
pd.set_option('display.max_columns', None)

# Encode categorical variables if not already encoded
categorical_vars = ['sex', 'marital_status', 'employment_status', 'purpose', 'accomd_type', 'us_state']
encoded_vars = {}

for var in categorical_vars:
    if var in df.columns:
        # Check if variable is already numeric
        if not pd.api.types.is_numeric_dtype(df[var]):
            new_var = f"{var}_enc"
            df[new_var] = pd.Categorical(df[var]).codes
            encoded_vars[var] = new_var
        else:
            encoded_vars[var] = var

# Set the truncation point for los (assuming truncation at 0)
df['los_trunc'] = df['los'].copy()
df.loc[df['los_trunc'] <= 0, 'los_trunc'] = np.nan

# Check for missing data
print("\nMissing data summary:")
missing_data_summary = df.isnull().sum()
print(missing_data_summary)

print("\nMissing data patterns:")
missing_patterns = df.isnull().sum(axis=1)
missing_patterns_counts = missing_patterns.value_counts().sort_index()
print(missing_patterns_counts)

# Visualize los distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['los_trunc'], discrete=True)
plt.title('Histogram of Length of Stay')
plt.tight_layout()
los_hist_img = BytesIO()
plt.savefig(los_hist_img, format='png')
los_hist_img.seek(0)
plt.close()

# Summarize los by purpose
purpose_stats = None
if 'purpose_enc' in df.columns:
    print("\nLength of stay by purpose:")
    purpose_stats = df.groupby('purpose_enc')['los_trunc'].agg(['count', 'mean', 'median', 'min', 'max', 'std'])
    print(purpose_stats)

# Detailed summary of los_trunc
print("\nDetailed summary of length of stay:")
los_describe = df['los_trunc'].describe(percentiles=[.25, .5, .75, .90, .95, .99])
print(los_describe)

# Cleaning process
# Step 1: Drop missing datapoints for key variables
key_vars = ['los', 'immigrant_population', 'import_from_slu', 'age', 
            encoded_vars.get('sex', 'sex_enc'), 
            encoded_vars.get('marital_status', 'marital_status_enc'), 
            encoded_vars.get('employment_status', 'employment_status_enc'), 
            'distance_miles', 
            encoded_vars.get('purpose', 'purpose_enc'), 
            encoded_vars.get('accomd_type', 'accomd_type_enc'), 
            'month_travel', 'state_percapita_income', 'state_unemployment']

# Count missing values per row for key variables
df['missing'] = df[key_vars].isnull().sum(axis=1)
print("\nNumber of missing values per observation:")
missing_values_count = df['missing'].value_counts().sort_index()
print(missing_values_count)

# Drop observations with missing values in key variables
df_clean = df[df['missing'] == 0].drop('missing', axis=1)
print(f"\nRemaining observations after dropping missing values: {len(df_clean)}")

# Step 2: Drop outliers in length of stay
los_p95 = np.percentile(df_clean['los_trunc'].dropna(), 95)
df_clean['los_capped'] = df_clean['los_trunc'].copy()
df_clean.loc[df_clean['los_capped'] > los_p95, 'los_capped'] = los_p95

df_clean = df_clean[df_clean['los_trunc'] <= los_p95]
print(f"After filtering to 95th percentile, remaining observations: {len(df_clean)}")

# Remove any instance of los_capped that is less than 2. Prior to this, over 1000 persons had stays less than 1 including honeymooners. This appears to be a data entry error.
df_clean = df_clean[df_clean['los_capped'] >= 2]

# Visualize the capped los distribution
plt.figure(figsize=(10, 6))
sns.histplot(df_clean['los_capped'], discrete=True)
plt.title('Histogram of Capped Length of Stay')
plt.tight_layout()
los_capped_img = BytesIO()
plt.savefig(los_capped_img, format='png')
los_capped_img.seek(0)
plt.close()

# Step 3: Clean up the purpose of trip column
# Create a new simplified purpose variable
purpose_mapping = {
    1: 1,  # BUSINESS/MEETING -> Business
    2: 1,  # CONVENTION -> Business
    3: 1,  # CREW -> Business
    5: 2,  # EVENT -> Events
    6: 2,  # EVENTS -> Events
    7: 4,  # HONEYMOON -> Pleasure
    8: 5,  # INTRANSIT PASSEN -> Other
    9: 5,  # OTHER -> Other
    10: 4, # PLEASURE/HOLIDAY -> Pleasure
    11: 5, # RESIDENT -> Other
    12: 2, # SAINT LUCIA CARN -> Events
    13: 2, # SAINT LUCIA JAZZ -> Events
    14: 5, # SPORTS -> Other
    15: 5, # STUDY -> Other
    16: 5, # VISITING FRIENDS -> Other
    17: 3, # WEDDING -> Wedding
    18: 4, # pLEASURE/HOLIDAY -> Pleasure
    4: 5,  # CRICKET -> Other
}

purpose_labels = {
    1: "Business",
    2: "Events",
    3: "Wedding",
    4: "Pleasure",
    5: "Other"
}

# Add the simplified purpose variable
purpose_enc_col = encoded_vars.get('purpose', 'purpose_enc')
df_clean['purpose_simple'] = df_clean[purpose_enc_col].map(purpose_mapping)

# Check the new variable
print("\nPurpose simple distribution:")
purpose_counts = df_clean['purpose_simple'].value_counts().sort_index()
purpose_distribution = []
for code, count in purpose_counts.items():
    purpose_line = f"{code} ({purpose_labels.get(code, 'Unknown')}): {count}"
    purpose_distribution.append(purpose_line)
    print(purpose_line)

# Create a Word document for output
doc = Document()
doc.add_heading('Multilevel Truncated Negative Binomial Regression for Length of Stay Analysis', 0)
doc.add_heading('Data Preparation and Cleaning', level=1)

# Add missing data information
doc.add_paragraph('Missing Data Summary:')
missing_table = doc.add_table(rows=len(missing_data_summary)+1, cols=2)
missing_table.style = 'Table Grid'
missing_table.cell(0, 0).text = 'Variable'
missing_table.cell(0, 1).text = 'Missing Count'
for i, (var, count) in enumerate(missing_data_summary.items(), 1):
    missing_table.cell(i, 0).text = str(var)
    missing_table.cell(i, 1).text = str(count)

doc.add_paragraph('\nMissing Data Patterns:')
patterns_table = doc.add_table(rows=len(missing_patterns_counts)+1, cols=2)
patterns_table.style = 'Table Grid'
patterns_table.cell(0, 0).text = 'Number of Missing Variables'
patterns_table.cell(0, 1).text = 'Count'
for i, (pattern, count) in enumerate(missing_patterns_counts.items(), 1):
    patterns_table.cell(i, 0).text = str(pattern)
    patterns_table.cell(i, 1).text = str(count)

# Add Length of Stay histogram
doc.add_paragraph('\n')
doc.add_heading('Length of Stay Distribution', level=2)
doc.add_picture(los_hist_img, width=Inches(6))
doc.add_paragraph('Figure 1: Histogram of Length of Stay (Before Capping)')

# Add Capped LOS histogram
doc.add_paragraph('\n')
doc.add_heading('Capped Length of Stay Distribution', level=2)
doc.add_picture(los_capped_img, width=Inches(6))
doc.add_paragraph('Figure 2: Histogram of Length of Stay (After Capping at 95th Percentile)')

# Add LOS summary statistics
doc.add_paragraph('\n')
doc.add_heading('Length of Stay Summary Statistics 95% Capped', level=2)
los_stats_table = doc.add_table(rows=len(los_describe)+1, cols=2)
los_stats_table.style = 'Table Grid'
los_stats_table.cell(0, 0).text = 'Statistic'
los_stats_table.cell(0, 1).text = 'Value'
for i, (stat, value) in enumerate(los_describe.items(), 1):
    los_stats_table.cell(i, 0).text = str(stat)
    los_stats_table.cell(i, 1).text = f"{value:.4f}" if isinstance(value, (int, float)) else str(value)

# Add Purpose distribution
doc.add_paragraph('\n')
doc.add_heading('Purpose of Visit Distribution', level=2)
purpose_table = doc.add_table(rows=len(purpose_distribution)+1, cols=1)
purpose_table.style = 'Table Grid'
purpose_table.cell(0, 0).text = 'Purpose Category'
for i, purpose_text in enumerate(purpose_distribution, 1):
    purpose_table.cell(i, 0).text = purpose_text

# Fit simple negative binomial regression with continuous variables correctly specified
print("\nFitting simple negative binomial regression model...")
doc.add_paragraph('\n')
doc.add_heading('Negative Binomial Regression Model', level=1)

# Define continuous variables and create proper formula
continuous_vars = ['immigrant_population', 'import_from_slu', 'age', 'distance_miles', 
                   'state_percapita_income', 'state_unemployment']

# Make sure all continuous variables are properly formatted as numeric
for var in continuous_vars:
    if var in df_clean.columns:
        df_clean[var] = pd.to_numeric(df_clean[var], errors='coerce')

# Create formula with continuous variables properly treated
formula_parts = []
for var in continuous_vars:
    if var in df_clean.columns:
        formula_parts.append(var)

# Add categorical variables with proper C() notation
categorical_model_vars = ['sex_enc', 'marital_status_enc', 'employment_status_enc', 
                         'purpose_simple', 'accomd_type_enc', 'month_travel', 'us_state_enc']

for var in categorical_model_vars:
    if var in df_clean.columns:
        # Use the encoded variable name or the original if available
        var_to_use = var
        formula_parts.append(f"C({var_to_use})")

# Combine into final formula
formula = 'los_capped ~ ' + ' + '.join(formula_parts)
print(f"Model formula: {formula}")

# Add formula to document
doc.add_paragraph(f"Model formula: {formula}")

# Drop rows with missing values in formula variables
##formula_vars = ['los_capped'] + continuous_vars + categorical_model_vars
formula_vars = ['los_capped'] + continuous_vars + categorical_model_vars
df_clean_nb = df_clean[formula_vars].dropna()
df_clean_nb = df_clean_nb.reset_index(drop=True)
print(f"Number of rows in df_clean_nb after dropping missing values: {len(df_clean_nb)}")


# Fit negative binomial model
nb_model = smf.glm(formula=formula, 
                  data=df_clean_nb, 
                  family=sm.families.NegativeBinomial(link=sm.families.links.log()))

try:
    nb_results = nb_model.fit()
    print("\nNegative Binomial Regression Results:")
    summary_text = str(nb_results.summary())
    print(summary_text)
    
    # Add model summary to document
    #doc.add_paragraph('\nModel Summary:')
    summary_paragraph_neg = doc.add_paragraph()
    summary_run_neg = summary_paragraph_neg.add_run(summary_text)
    summary_run_neg.font.name = 'Courier New'  # Use monospace font
    #for line in summary_text.split('\n'):
        #doc.add_paragraph(line)
    
    # Convert coefficients to incident rate ratios (IRR)
    print("\nIncident Rate Ratios (IRR):")
    irr = np.exp(nb_results.params)
    irr_conf = np.exp(nb_results.conf_int())
    irr_df = pd.DataFrame({'IRR': irr, 'Lower CI': irr_conf[0], 'Upper CI': irr_conf[1], 
                          'P-value': nb_results.pvalues})
    print(irr_df)
    
    # Add IRR table to document
    doc.add_paragraph('\n')
    doc.add_heading('Incident Rate Ratios (IRR)', level=2)
    irr_table = doc.add_table(rows=len(irr_df)+1, cols=5)
    irr_table.style = 'Table Grid'
    irr_table.cell(0, 0).text = 'Variable'
    irr_table.cell(0, 1).text = 'IRR'
    irr_table.cell(0, 2).text = 'Lower CI'
    irr_table.cell(0, 3).text = 'Upper CI'
    irr_table.cell(0, 4).text = 'P-value'
    
    for i, (var, row) in enumerate(irr_df.iterrows(), 1):
        irr_table.cell(i, 0).text = str(var)
        irr_table.cell(i, 1).text = f"{row['IRR']:.4f}"
        irr_table.cell(i, 2).text = f"{row['Lower CI']:.4f}"
        irr_table.cell(i, 3).text = f"{row['Upper CI']:.4f}"
        irr_table.cell(i, 4).text = f"{row['P-value']:.4f}"
    
    
    # Predictions and diagnostics
    df_clean_nb['predicted'] = nb_results.predict()
    df_clean_nb['residuals'] = df_clean_nb['los_capped'] - df_clean_nb['predicted']
    
    # Plot residuals
    plt.figure(figsize=(10, 6))
    plt.scatter(df_clean_nb['predicted'], df_clean_nb['residuals'], alpha=0.5)
    plt.axhline(y=0, color='r', linestyle='-')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.tight_layout()
    residuals_img = BytesIO()
    plt.savefig(residuals_img, format='png')
    residuals_img.seek(0)
    plt.close()
    
    # Add residuals plot to document
    doc.add_paragraph('\n')
    doc.add_heading('Diagnostics', level=2)
    doc.add_picture(residuals_img, width=Inches(6))
    doc.add_paragraph('Figure 3: Residuals Plot')
    
    # Check for heterogeneity across states if us_state is in the data
    if 'us_state_enc' in df_clean_nb.columns or 'us_state' in df_clean_nb.columns:
        state_var = 'us_state_enc' if 'us_state_enc' in df_clean_nb.columns else 'us_state'
        state_means = df_clean_nb.groupby(state_var)['los_capped'].mean().sort_values()
        
        plt.figure(figsize=(12, 8))
        state_means.plot(kind='bar')
        plt.xlabel('State')
        plt.ylabel('Average Length of Stay')
        plt.title('Mean Length of Stay by State')
        plt.xticks(rotation=90)
        plt.tight_layout()
        los_by_state_img = BytesIO()
        plt.savefig(los_by_state_img, format='png')
        los_by_state_img.seek(0)
        plt.close()
        
        # Add state analysis to document
        doc.add_paragraph('\n')
        doc.add_heading('State Analysis', level=2)
        doc.add_picture(los_by_state_img, width=Inches(6))
        doc.add_paragraph('Figure 4: Mean Length of Stay by State')
    
    # Approximated Multilevel Model
    doc.add_paragraph('\n')
    doc.add_heading('Approximated Multilevel Model', level=1)
    doc.add_paragraph('Using MixedLM to approximate a multilevel model with random effects for states.')
    
    # Debugging: Check df_clean_nb and the condition
    print("Columns in df_clean_nb:", df_clean_nb.columns.tolist())
    print("Checking if 'us_state_enc' is in df_clean_nb.columns:", 'us_state_enc' in df_clean_nb.columns)

    # Update variable lists
    continuous_vars = ['immigrant_population', 'import_from_slu', 'age', 'distance_miles', 'state_unemployment']
    categorical_model_vars = ['sex_enc', 'marital_status_enc', 'employment_status_enc', 'purpose_simple', 'accomd_type_enc','us_state_enc', 'month_travel']

    

    # Check if us_state variable exists for multilevel modeling
    if 'us_state_enc' in df_clean_nb.columns:
        # For demonstration, we'll use a linear mixed model as an approximation
        # Prepare model variables
        
        # Ensure no missing values in variables used for mixed effects model
        model_vars = ['los_capped'] + continuous_vars + categorical_model_vars 
        df_clean_model = df_clean_nb[model_vars].dropna()

        df_clean_model = df_clean_model.reset_index(drop=True)

        df_clean_model_old= df_clean_model.copy()

        # Remove rows where sex_enc is -1
        df_clean_model = df_clean_model[df_clean_model['sex_enc'] != -1]
        # Remove rows where age > 100
        df_clean_model = df_clean_model[df_clean_model['age'] <= 100]
        # Remove rows where marital_status_enc is -1
        df_clean_model = df_clean_model[df_clean_model['marital_status_enc'] != -1]
        
        y = df_clean_model['los_capped']
        
        # Create X matrix for fixed effects
        X_vars = []
        for var in continuous_vars:
            if var in df_clean_model.columns:
                X_vars.append(var)
        
        X = df_clean_model[X_vars].copy()
        
        # Add categorical variables (one-hot encoded)
        for var in categorical_model_vars:
            if var in df_clean_model.columns and var != 'us_state_enc':  # Exclude the grouping variable
                dummies = pd.get_dummies(df_clean_model[var], prefix=var, drop_first=True)
                X = pd.concat([X, dummies], axis=1)
        
        # Add intercept
        X = sm.add_constant(X)
        
        X=X.astype(float)
        print("X dtypes after converting to float:")
        print(X.dtypes)

        #Create new X's which will be simplified i.e. no multicollinearity
        X_simplified = X.copy()
        # Drop columns 'distance_miles' and 'state_unemployment' from X
        X_simplified = X_simplified.drop(columns=['distance_miles', 'state_unemployment'])
        
        # Define groups for random effects
        groups = df_clean_model['us_state_enc']

        print(f"Length of df_clean_model: {len(df_clean_model)}")
        print(f"Length of y: {len(y)}")
        print(f"Number of rows in X: {X.shape[0]}")
        print(f"Length of groups: {len(groups)}")

        if len(y) != X.shape[0] or len(y) != len(groups):
            print('Length mismatch between y, X, and groups. Check data preparation.')
            print(f"y length:{len(y)}")
            print(f"X rows: {X.shape[0]}")
            print(f"groups length: {len(groups)}")
            # Check for NaN values in X
            print("NaN counts in X columns:")
            print(X.isnull().sum())
            raise ValueError("Lengths of y, X, and groups do not match!")


       
        
        # Fit mixed effects model
        mixed_model = MixedLM(y, X, groups)
        try:
            mixed_results = mixed_model.fit()
            mixed_summary = str(mixed_results.summary())
            print("\nApproximated Multilevel Model Results:")
            print(mixed_summary)
            
            # Add to document
            doc.add_paragraph('Model Summary:')
            summary_paragraph = doc.add_paragraph()
            summary_run = summary_paragraph.add_run(mixed_summary)
            summary_run.font.name = 'Courier New'  # Use monospace font
            #summary_run.font.size = Pt(10)  # Optional: Adjust font size
            #for line in mixed_summary.split('\n'):
                #doc.add_paragraph(line)
            
            # Add variance components
            doc.add_paragraph('\nVariance Components:')
            vc_table = doc.add_table(rows=3, cols=2)
            vc_table.style = 'Table Grid'
            vc_table.cell(0, 0).text = 'Component'
            vc_table.cell(0, 1).text = 'Estimate'
            vc_table.cell(1, 0).text = 'State Random Effect Variance'
            vc_table.cell(1, 1).text = f"{mixed_results.cov_re.iloc[0, 0]:.4f}"
            vc_table.cell(2, 0).text = 'Residual Variance'
            vc_table.cell(2, 1).text = f"{mixed_results.scale:.4f}"
            
            # Calculate intraclass correlation coefficient (ICC)
            state_var = mixed_results.cov_re.iloc[0, 0]
            residual_var = mixed_results.scale
            icc = state_var / (state_var + residual_var)

            # Add model summary to document
            #doc.add_paragraph('\nMixed Summary:')
            #for line in mixed_summary.split('\n'):
                #doc.add_paragraph(line)
            
            # Convert coefficients to incident rate ratios (IRR)
            print("\nIncident Rate Ratios (IRR) Mixed Model:")
            irr_mixed = np.exp(mixed_results.params)
            irr_conf_mixed = np.exp(mixed_results.conf_int())
            irr_df_mixed = pd.DataFrame({'IRR': irr_mixed, 'Lower CI': irr_conf_mixed[0], 'Upper CI': irr_conf_mixed[1], 
                                'P-value': mixed_results.pvalues})
            print(irr_df_mixed)
            
            # Add IRR table to document
            doc.add_paragraph('\n')
            doc.add_heading('Incident Rate Ratios (IRR) Mixed Model', level=2)
            irr_table_mixed = doc.add_table(rows=len(irr_df_mixed)+1, cols=5)
            irr_table_mixed.style = 'Table Grid'
            irr_table_mixed.cell(0, 0).text = 'Variable'
            irr_table_mixed.cell(0, 1).text = 'IRR'
            irr_table_mixed.cell(0, 2).text = 'Lower CI'
            irr_table_mixed.cell(0, 3).text = 'Upper CI'
            irr_table_mixed.cell(0, 4).text = 'P-value'
            
            for i, (var, row) in enumerate(irr_df_mixed.iterrows(), 1):
                irr_table_mixed.cell(i, 0).text = str(var)
                irr_table_mixed.cell(i, 1).text = f"{row['IRR']:.4f}"
                irr_table_mixed.cell(i, 2).text = f"{row['Lower CI']:.4f}"
                irr_table_mixed.cell(i, 3).text = f"{row['Upper CI']:.4f}"
                irr_table_mixed.cell(i, 4).text = f"{row['P-value']:.4f}"
            
            doc.add_paragraph(f'\nIntraclass Correlation Coefficient (ICC): {icc:.4f}')
            doc.add_paragraph('The ICC represents the proportion of the total variance in length of stay ' +
                             'that is attributable to differences between states.')
            # After fitting the mixed model
            # Compute residuals and fitted values
            df_clean_model['fitted'] = mixed_results.fittedvalues
            df_clean_model['residuals'] = mixed_results.resid

            # Plot residuals vs fitted values
            plt.figure(figsize=(10, 6))
            plt.scatter(df_clean_model['fitted'], df_clean_model['residuals'], alpha=0.5)
            plt.axhline(y=0, color='r', linestyle='-')
            plt.xlabel('Fitted Values')
            plt.ylabel('Residuals')
            plt.title('Residuals vs Fitted Values')
            plt.tight_layout()
            residuals_vs_fitted_img = BytesIO()
            plt.savefig(residuals_vs_fitted_img, format='png')
            residuals_vs_fitted_img.seek(0)
            plt.close()

            # Add to document
            doc.add_paragraph('\n')
            doc.add_heading('Post-Estimation Diagnostics', level=2)
            doc.add_picture(residuals_vs_fitted_img, width=Inches(6))
            doc.add_paragraph('Figure 4: Residuals vs Fitted Values')

            # Q-Q plot for normality
            plt.figure(figsize=(10, 6))
            stats.probplot(df_clean_model['residuals'], dist="norm", plot=plt)
            plt.title('Q-Q Plot of Residuals')
            plt.tight_layout()
            qq_plot_img = BytesIO()
            plt.savefig(qq_plot_img, format='png')
            qq_plot_img.seek(0)
            plt.close()

            # Add to document
            doc.add_picture(qq_plot_img, width=Inches(6))
            doc.add_paragraph('Figure 5: Q-Q Plot of Residuals')

            # Fit a simple linear model (no random effects)
            ols_model = smf.ols(formula, df_clean_nb)
            ols_results = ols_model.fit()

            # Compute the likelihood ratio test
            lr_stat = -2 * (ols_results.llf - mixed_results.llf)
            p_value = stats.chi2.sf(lr_stat, df=1)  # df=1 for one random effect
            doc.add_paragraph(f'\nLikelihood Ratio Test for Random Effects: Statistic = {lr_stat:.2f}, P-value = {p_value:.4f}')


            # Pseudo-R² (McFadden's R² approximation)
            null_model = smf.mixedlm("los_capped ~ 1", df_clean_model, groups=df_clean_model['us_state_enc'])
            null_results = null_model.fit()
            pseudo_r2 = 1 - (mixed_results.llf / null_results.llf)
            doc.add_paragraph(f'\nPseudo-R² (McFadden): {pseudo_r2:.4f}')

            

            # Check VIF for continuous variables
            from statsmodels.stats.outliers_influence import variance_inflation_factor
            X_continuous = X[[col for col in X.columns if col != 'const']]  # Exclude intercept
            vif_data = pd.DataFrame()
            vif_data["Variable"] = X_continuous.columns
            vif_data["VIF"] = [variance_inflation_factor(X_continuous.values, i) for i in range(X_continuous.shape[1])]
            print("VIF for continuous and dummy variables:")
            print(vif_data)
            doc.add_paragraph('\nVariance Inflation Factor (VIF) for Continuous and Categorical Variables:')
            vif_table = doc.add_table(rows=len(vif_data)+1, cols=2)
            vif_table.style = 'Table Grid'
            vif_table.cell(0, 0).text = 'Variable'
            vif_table.cell(0, 1).text = 'VIF'
            for i, (var, vif) in enumerate(zip(vif_data["Variable"], vif_data["VIF"]), 1):
                vif_table.cell(i, 0).text = str(var)
                vif_table.cell(i, 1).text = f"{vif:.4f}"
            doc.add_paragraph('VIF values above 10 indicate potential multicollinearity issues.')
        except ValueError as ve:
            error_msg = f"ValueError in mixed model fitting: {str(ve)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")
        except RuntimeError as re:
            error_msg = f"RuntimeError in mixed model fitting: {str(re)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")    


        except Exception as e:
            error_msg = f"Error fitting mixed model: {str(e)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")
        
        #New mixed effects model starts here this one is a simpler version
        

        # Fit mixed effects model with fewer variables
        mixed_model_simple = MixedLM(y, X_simplified, groups)
        try:
            mixed_results_simple = mixed_model_simple.fit()
            mixed_summary_simple = str(mixed_results_simple.summary())
            print("\nApproximated Multilevel Model Results:")
            print(mixed_summary_simple)
            
            # Add to document
            doc.add_paragraph('Model Summary with fewer variables:')
            summary_paragraph_simple = doc.add_paragraph()
            summary_run_simple = summary_paragraph_simple.add_run(mixed_summary_simple)
            summary_run_simple.font.name = 'Courier New'  # Use monospace font
            #summary_run.font.size = Pt(10)  # Optional: Adjust font size
            #for line in mixed_summary.split('\n'):
                #doc.add_paragraph(line)
            
            # Add variance components
            doc.add_paragraph('\nVariance Components with fewer variables:')
            vc_table = doc.add_table(rows=3, cols=2)
            vc_table.style = 'Table Grid'
            vc_table.cell(0, 0).text = 'Component'
            vc_table.cell(0, 1).text = 'Estimate'
            vc_table.cell(1, 0).text = 'State Random Effect Variance'
            vc_table.cell(1, 1).text = f"{mixed_results_simple.cov_re.iloc[0, 0]:.4f}"
            vc_table.cell(2, 0).text = 'Residual Variance'
            vc_table.cell(2, 1).text = f"{mixed_results_simple.scale:.4f}"
            
            # Calculate intraclass correlation coefficient (ICC)
            state_var_simple = mixed_results_simple.cov_re.iloc[0, 0]
            residual_var_simple = mixed_results_simple.scale
            icc_simple = state_var_simple / (state_var_simple + residual_var_simple)

            # Add model summary to document
            #doc.add_paragraph('\nMixed Summary:')
            #for line in mixed_summary.split('\n'):
                #doc.add_paragraph(line)
            
            # Convert coefficients to incident rate ratios (IRR)
            print("\nIncident Rate Ratios (IRR) Mixed Model:")
            irr_mixed_simple = np.exp(mixed_results_simple.params)
            irr_conf_mixed_simple = np.exp(mixed_results_simple.conf_int())
            irr_df_mixed_simple = pd.DataFrame({'IRR': irr_mixed_simple, 'Lower CI': irr_conf_mixed_simple[0], 'Upper CI': irr_conf_mixed_simple[1], 
                                'P-value': mixed_results_simple.pvalues})
            print(irr_df_mixed_simple)
            
            # Add IRR table to document
            doc.add_paragraph('\n')
            doc.add_heading('Incident Rate Ratios (IRR) Mixed Model', level=2)
            irr_table_mixed = doc.add_table(rows=len(irr_df_mixed_simple)+1, cols=5)
            irr_table_mixed.style = 'Table Grid'
            irr_table_mixed.cell(0, 0).text = 'Variable'
            irr_table_mixed.cell(0, 1).text = 'IRR'
            irr_table_mixed.cell(0, 2).text = 'Lower CI'
            irr_table_mixed.cell(0, 3).text = 'Upper CI'
            irr_table_mixed.cell(0, 4).text = 'P-value'
            
            for i, (var, row) in enumerate(irr_df_mixed_simple.iterrows(), 1):
                irr_table_mixed.cell(i, 0).text = str(var)
                irr_table_mixed.cell(i, 1).text = f"{row['IRR']:.4f}"
                irr_table_mixed.cell(i, 2).text = f"{row['Lower CI']:.4f}"
                irr_table_mixed.cell(i, 3).text = f"{row['Upper CI']:.4f}"
                irr_table_mixed.cell(i, 4).text = f"{row['P-value']:.4f}"
            
            doc.add_paragraph(f'\nIntraclass Correlation Coefficient (ICC): {icc:.4f}')
            doc.add_paragraph('The ICC represents the proportion of the total variance in length of stay ' +
                             'that is attributable to differences between states.')
            # After fitting the mixed model
            # Compute residuals and fitted values
            df_clean_model['fitted_simple'] = mixed_results_simple.fittedvalues
            df_clean_model['residuals_simple'] = mixed_results_simple.resid

            # Plot residuals vs fitted values
            plt.figure(figsize=(10, 6))
            plt.scatter(df_clean_model['fitted_simple'], df_clean_model['residuals_simple'], alpha=0.5)
            plt.axhline(y=0, color='r', linestyle='-')
            plt.xlabel('Fitted Values')
            plt.ylabel('Residuals')
            plt.title('Residuals vs Fitted Values')
            plt.tight_layout()
            residuals_vs_fitted_img_simple = BytesIO()
            plt.savefig(residuals_vs_fitted_img_simple, format='png')
            residuals_vs_fitted_img_simple.seek(0)
            plt.close()

            # Add to document
            doc.add_paragraph('\n')
            doc.add_heading('Post-Estimation Diagnostics', level=2)
            doc.add_picture(residuals_vs_fitted_img_simple, width=Inches(6))
            doc.add_paragraph('Figure 5: Residuals vs Fitted Values for Simpler Model')

            # Q-Q plot for normality
            plt.figure(figsize=(10, 6))
            stats.probplot(df_clean_model['residuals_simple'], dist="norm", plot=plt)
            plt.title('Q-Q Plot of Residuals Simpler Model')
            plt.tight_layout()
            qq_plot_img_simple = BytesIO()
            plt.savefig(qq_plot_img_simple, format='png')
            qq_plot_img_simple.seek(0)
            plt.close()

            # Add to document
            doc.add_picture(qq_plot_img_simple, width=Inches(6))
            doc.add_paragraph('Figure 6: Q-Q Plot of Residuals with fewer variables')

            # Fit a simple linear model (no random effects)
            ols_model = smf.ols(formula, df_clean_nb)
            ols_results = ols_model.fit()

            # Compute the likelihood ratio test
            lr_stat_simple = -2 * (ols_results.llf - mixed_results_simple.llf)
            p_value_simple = stats.chi2.sf(lr_stat_simple, df=1)  # df=1 for one random effect
            doc.add_paragraph(f'\nLikelihood Ratio Test for Random Effects with Simple Model: Statistic = {lr_stat_simple:.2f}, P-value = {p_value_simple:.4f}')


            # Pseudo-R² (McFadden's R² approximation)
            null_model_simple = smf.mixedlm("los_capped ~ 1", df_clean_model, groups=df_clean_model['us_state_enc'])
            null_results_simple = null_model_simple.fit()
            pseudo_r2_simple = 1 - (mixed_results_simple.llf / null_results_simple.llf)
            doc.add_paragraph(f'\nPseudo-R² (McFadden) fewer variables: {pseudo_r2_simple:.4f}')

            

            # Check VIF for continuous variables
            from statsmodels.stats.outliers_influence import variance_inflation_factor
            X_continuous_simple = X_simplified[[col for col in X_simplified.columns if col != 'const']]  # Exclude intercept
            vif_data_simple = pd.DataFrame()
            vif_data_simple["Variable"] = X_continuous_simple.columns
            vif_data_simple["VIF"] = [variance_inflation_factor(X_continuous_simple.values, i) for i in range(X_continuous_simple.shape[1])]
            print("VIF for continuous and dummy variables with fewer variables:")
            print(vif_data_simple)
            doc.add_paragraph('\nVariance Inflation Factor (VIF) for Continuous and Categorical Variables with fewer variables:')
            vif_table = doc.add_table(rows=len(vif_data_simple)+1, cols=2)
            vif_table.style = 'Table Grid'
            vif_table.cell(0, 0).text = 'Variable'
            vif_table.cell(0, 1).text = 'VIF'
            for i, (var, vif) in enumerate(zip(vif_data_simple["Variable"], vif_data_simple["VIF"]), 1):
                vif_table.cell(i, 0).text = str(var)
                vif_table.cell(i, 1).text = f"{vif:.4f}"
            doc.add_paragraph('VIF values above 10 indicate potential multicollinearity issues.')
        except ValueError as ve:
            error_msg = f"ValueError in mixed model fitting: {str(ve)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")
        except RuntimeError as re:
            error_msg = f"RuntimeError in mixed model fitting: {str(re)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")    


        except Exception as e:
            error_msg = f"Error fitting mixed model: {str(e)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")
        
        #second mixed effects model ends here

        #Third mixed effects model with log transformations
        # Log transformations will address issues with hetroscedasticity and also model fit

        #Step 1: Log transform length of stay but only if it was non zero
        df_clean_model['log_los_capped'] = np.where(df_clean_model['los_capped'] > 0, np.log(df_clean_model['los_capped']), 0)
                                   

        #Step 2: Log transform the continuous variables
            # Log-transform age (showed high VIF and potential non-linearity)
        df_clean_model['log_age'] = np.log1p(df_clean_model['age'])

             # Log-transform distance_miles (potential non-linear effect on length of stay)
        df_clean_model['log_distance'] = np.log1p(df_clean_model['distance_miles'])

            # log transform others such as immigrant_population, import_from_slu and state_percapita_income
        
        df_clean_model['log_immigrant_population'] = np.log1p(df_clean_model['immigrant_population'])
        df_clean_model['log_import_from_slu'] = np.log1p(df_clean_model['import_from_slu'])
        #df_clean_model['log_state_percapita_income'] = np.log1p(df_clean_model['state_percapita_income'])
        df_clean_model['log_state_unemployment'] = np.log1p(df_clean_model['state_unemployment'])

        #With these transformations, now we set new X's and y's for the mixed effects model
        y_log = df_clean_model['log_los_capped']
        # Create X matrix for fixed effects
        X_log= X.copy()

        #Drop the original continuous variables
        X_log = X_log.drop(columns=['age', 'distance_miles', 'immigrant_population', 'import_from_slu',  'state_unemployment'])
                                    
        # Add log-transformed continuous variables
        X_log['log_age'] = df_clean_model['log_age']**3
        X_log['log_distance'] = df_clean_model['log_distance']**3
        X_log['log_immigrant_population'] = df_clean_model['log_immigrant_population']**2
        X_log['log_import_from_slu'] = df_clean_model['log_import_from_slu']**2
        #X_log['log_state_percapita_income'] = df_clean_model['log_state_percapita_income']
        X_log['log_state_unemployment'] = df_clean_model['log_state_unemployment']**2 
        
    # Fit mixed effects model with log-transformed variables
        mixed_model_log = MixedLM(y_log, X_log, groups)
        try:
            mixed_results_log = mixed_model_log.fit()
            mixed_summary_log = str(mixed_results_log.summary())
            print("\nApproximated Multilevel Model Results With Log Transformations:")
            print(mixed_summary_log)
            
            # Add to document
            doc.add_paragraph('Model Summary With Log Transformations:')
            summary_paragraph_log = doc.add_paragraph()
            summary_run_log = summary_paragraph_log.add_run(mixed_summary_log)
            summary_run_log.font.name = 'Courier New'  # Use monospace font
            #summary_run.font.size = Pt(10)  # Optional: Adjust font size
            #for line in mixed_summary.split('\n'):
                #doc.add_paragraph(line)
            
            # Add variance components
            doc.add_paragraph('\nVariance Components With Log Transformations:')
            vc_table = doc.add_table(rows=3, cols=2)
            vc_table.style = 'Table Grid'
            vc_table.cell(0, 0).text = 'Component'
            vc_table.cell(0, 1).text = 'Estimate'
            vc_table.cell(1, 0).text = 'State Random Effect Variance'
            vc_table.cell(1, 1).text = f"{mixed_results_log.cov_re.iloc[0, 0]:.4f}"
            vc_table.cell(2, 0).text = 'Residual Variance'
            vc_table.cell(2, 1).text = f"{mixed_results_log.scale:.4f}"
            
            # Calculate intraclass correlation coefficient (ICC)
            state_var_log = mixed_results_log.cov_re.iloc[0, 0]
            residual_var_log = mixed_results_log.scale
            icc_log = state_var_log / (state_var_log + residual_var_log)

            # Add model summary to document
            #doc.add_paragraph('\nMixed Summary:')
            #for line in mixed_summary.split('\n'):
                #doc.add_paragraph(line)
            
            # Convert coefficients to incident rate ratios (IRR)
            print("\nIncident Rate Ratios (IRR) Mixed Model With Log Transformations:")
            irr_mixed_log = np.exp(mixed_results_log.params)
            irr_conf_mixed_log = np.exp(mixed_results_log.conf_int())
            irr_df_mixed_log = pd.DataFrame({'IRR': irr_mixed_log, 'Lower CI': irr_conf_mixed_log[0], 'Upper CI': irr_conf_mixed_log[1], 
                                'P-value': mixed_results_log.pvalues})
            print(irr_df_mixed_log)
            
            # Add IRR table to document
            doc.add_paragraph('\n')
            doc.add_heading('Incident Rate Ratios (IRR) Mixed Model With Log Transformations', level=2)
            irr_table_mixed = doc.add_table(rows=len(irr_df_mixed_log)+1, cols=5)
            irr_table_mixed.style = 'Table Grid'
            irr_table_mixed.cell(0, 0).text = 'Variable'
            irr_table_mixed.cell(0, 1).text = 'IRR'
            irr_table_mixed.cell(0, 2).text = 'Lower CI'
            irr_table_mixed.cell(0, 3).text = 'Upper CI'
            irr_table_mixed.cell(0, 4).text = 'P-value'
            
            for i, (var, row) in enumerate(irr_df_mixed_log.iterrows(), 1):
                irr_table_mixed.cell(i, 0).text = str(var)
                irr_table_mixed.cell(i, 1).text = f"{row['IRR']:.4f}"
                irr_table_mixed.cell(i, 2).text = f"{row['Lower CI']:.4f}"
                irr_table_mixed.cell(i, 3).text = f"{row['Upper CI']:.4f}"
                irr_table_mixed.cell(i, 4).text = f"{row['P-value']:.4f}"
            
            doc.add_paragraph(f'\nIntraclass Correlation Coefficient (ICC): {icc:.4f}')
            doc.add_paragraph('The ICC represents the proportion of the total variance in length of stay ' +
                             'that is attributable to differences between states.')
            # After fitting the mixed model
            # Compute residuals and fitted values
            df_clean_model['fitted_log'] = mixed_results_log.fittedvalues
            df_clean_model['residuals_log'] = mixed_results_log.resid

            # Plot residuals vs fitted values
            plt.figure(figsize=(10, 6))
            plt.scatter(df_clean_model['fitted_log'], df_clean_model['residuals_log'], alpha=0.5)
            plt.axhline(y=0, color='r', linestyle='-')
            plt.xlabel('Fitted Values')
            plt.ylabel('Residuals')
            plt.title('Residuals vs Fitted Values')
            plt.tight_layout()
            residuals_vs_fitted_img_log = BytesIO()
            plt.savefig(residuals_vs_fitted_img_log, format='png')
            residuals_vs_fitted_img_log.seek(0)
            plt.close()

            # Add to document
            doc.add_paragraph('\n')
            doc.add_heading('Post-Estimation Diagnostics', level=2)
            doc.add_picture(residuals_vs_fitted_img_log, width=Inches(6))
            doc.add_paragraph('Figure 8: Residuals vs Fitted Values With Log Transformations')

            # Q-Q plot for normality
            plt.figure(figsize=(10, 6))
            stats.probplot(df_clean_model['residuals_log'], dist="norm", plot=plt)
            plt.title('Q-Q Plot of Residuals With Log Transformations')
            plt.tight_layout()
            qq_plot_img_log = BytesIO()
            plt.savefig(qq_plot_img_log, format='png')
            qq_plot_img_log.seek(0)
            plt.close()

            # Add to document
            doc.add_picture(qq_plot_img_log, width=Inches(6))
            doc.add_paragraph('Figure 9: Q-Q Plot of Residuals With Log Transformations')

            # Fit a log linear model (no random effects)
            ols_model = smf.ols(formula, df_clean_nb)
            ols_results = ols_model.fit()

            # Compute the likelihood ratio test
            lr_stat_log = -2 * (ols_results.llf - mixed_results_log.llf)
            p_value_log = stats.chi2.sf(lr_stat_log, df=1)  # df=1 for one random effect
            doc.add_paragraph(f'\nLikelihood Ratio Test for Random Effects with log Model: Statistic = {lr_stat_log:.2f}, P-value = {p_value_log:.4f}')


            # Pseudo-R² (McFadden's R² approximation)
            null_model_log = smf.mixedlm("log_los_capped ~ 1", df_clean_model, groups=df_clean_model['us_state_enc'])
            null_results_log = null_model_log.fit()
            pseudo_r2_log = 1 - (mixed_results_log.llf / null_results_log.llf)
            doc.add_paragraph(f'\nPseudo-R² (McFadden) With Log Transformations: {pseudo_r2_log:.4f}')

            

            # Check VIF for continuous variables
            from statsmodels.stats.outliers_influence import variance_inflation_factor
            X_continuous_log = X_log[[col for col in X_log.columns if col != 'const']]  # Exclude intercept
            vif_data_log = pd.DataFrame()
            vif_data_log["Variable"] = X_continuous_log.columns
            vif_data_log["VIF"] = [variance_inflation_factor(X_continuous_log.values, i) for i in range(X_continuous_log.shape[1])]
            print("VIF for continuous and dummy variables With Log Transformations:")
            print(vif_data_log)
            doc.add_paragraph('\nVariance Inflation Factor (VIF) for Continuous and Categorical Variables With Log Transformations:')
            vif_table = doc.add_table(rows=len(vif_data_log)+1, cols=2)
            vif_table.style = 'Table Grid'
            vif_table.cell(0, 0).text = 'Variable'
            vif_table.cell(0, 1).text = 'VIF'
            for i, (var, vif) in enumerate(zip(vif_data_log["Variable"], vif_data_log["VIF"]), 1):
                vif_table.cell(i, 0).text = str(var)
                vif_table.cell(i, 1).text = f"{vif:.4f}"
            doc.add_paragraph('VIF values above 10 indicate potential multicollinearity issues.')
        except ValueError as ve:
            error_msg = f"ValueError in mixed model fitting: {str(ve)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")
        except RuntimeError as re:
            error_msg = f"RuntimeError in mixed model fitting: {str(re)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")    


        except Exception as e:
            error_msg = f"Error fitting mixed model: {str(e)}"
            print(error_msg)
            doc.add_paragraph(error_msg)
            doc.add_paragraph("The mixed effects model failed to converge. This can happen due to " +
                             "insufficient variation in the grouping variable or other model specification issues.")


        #Log transform model ends here
    else:
        no_state_msg = "State variable not found for multilevel modeling."
        print(no_state_msg)
        doc.add_paragraph(no_state_msg)
    
except Exception as e:
    error_msg = f"\nError in model fitting: {str(e)}"
    print(error_msg)
    doc.add_paragraph(error_msg)
    doc.add_paragraph("You may need to check your data or consider using a different modeling approach.")

# Save the Word document
print("\nPlease select where to save the Word document...")
doc_path = select_file(
    "Save Analysis Report As", 
    [("Word Document", "*.docx"), ("All files", "*.*")],
    save=True
)

if doc_path:
    if not doc_path.endswith('.docx'):
        doc_path += '.docx'
    doc.save(doc_path)
    print(f"Analysis report saved to: {doc_path}")
else:
    print("Document not saved as no location was selected.")

print("\nAnalysis complete.")




Please select the input Excel file...
Loading data from: /Users/janai/Library/CloudStorage/OneDrive-SharedLibraries-jlconsulting.llc/Projects - Documents/Research/Saint Lucia Tourism Piece/1.0 Data cleaning/Final data for model/stata raw data.xlsx

Missing data summary:
los                         75
age                          0
sex                         26
marital_status              12
employment_status          425
distance_miles               0
purpose                    403
accomd_type                  0
state_percapita_income       0
state_unemployment           0
travel_date                  0
month_travel                 0
import_from_slu              0
immigrant_population         0
us_state                     0
sex_enc                      0
marital_status_enc           0
employment_status_enc        0
purpose_enc                  0
accomd_type_enc              0
us_state_enc                 0
los_trunc                 2178
dtype: int64

Missing data patterns:
0    14222




Negative Binomial Regression Results:
                 Generalized Linear Model Regression Results                  
Dep. Variable:             los_capped   No. Observations:               132166
Model:                            GLM   Df Residuals:                   132090
Model Family:        NegativeBinomial   Df Model:                           75
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.7528e+05
Date:                Mon, 19 May 2025   Deviance:                       8275.2
Time:                        13:37:01   Pearson chi2:                 7.89e+03
No. Iterations:                    29   Pseudo R-squ. (CS):           0.007033
Covariance Type:            nonrobust                                         
                                    coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------




Approximated Multilevel Model Results With Log Transformations:
               Mixed Linear Model Regression Results
Model:                MixedLM   Dependent Variable:   log_los_capped
No. Observations:     132122    Method:               REML          
No. Groups:           50        Scale:                0.0785        
Min. group size:      18        Log-Likelihood:       -19570.8670   
Max. group size:      15803     Converged:            Yes           
Mean group size:      2642.4                                        
--------------------------------------------------------------------
                         Coef.  Std.Err.    z    P>|z| [0.025 0.975]
--------------------------------------------------------------------
const                     1.257    0.084  14.948 0.000  1.092  1.421
sex_enc_1                 0.011    0.002   7.157 0.000  0.008  0.014
marital_status_enc_1     -0.026    0.005  -4.781 0.000 -0.037 -0.016
marital_status_enc_2     -0.049    0.002 -25.480 0.000



VIF for continuous and dummy variables With Log Transformations:
                    Variable         VIF
0                  sex_enc_1    1.846238
1       marital_status_enc_1    1.038726
2       marital_status_enc_2    1.700338
3    employment_status_enc_0  189.431568
4    employment_status_enc_1    2.367711
5    employment_status_enc_2    1.768085
6         purpose_simple_2.0   21.042367
7         purpose_simple_3.0    1.005825
8         purpose_simple_4.0    1.889196
9         purpose_simple_5.0   97.723518
10         accomd_type_enc_1    1.044606
11         accomd_type_enc_2    1.196936
12            month_travel_2    2.228739
13            month_travel_3    2.343711
14            month_travel_4    2.117767
15            month_travel_5    2.357249
16            month_travel_6    2.450757
17            month_travel_7    2.434544
18            month_travel_8    2.092371
19            month_travel_9    1.741468
20           month_travel_10    2.010486
21           month_travel_11    2