In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.stats import ttest_ind, chi2_contingency
from statsmodels.stats.power import TTestIndPower

In [None]:
# Column -> dtype mapping passed to pd.read_csv when the dataset is loaded.
dtype_dict = dict(
    route_id='int16',
    bus_id='int32',
    stop_sequence='int16',
    arrival_delay='int16',
    dwell_time='uint16',
    travel_time_for_previous_section='uint16',
    scheduled_travel_time='uint16',
    upstream_stop_delay='int16',
    origin_delay='int16',
    previous_bus_delay='int16',
    previous_trip_travel_time='uint16',
    traffic_condition='float32',
    recurrent_delay='float32',
)

# Indicator columns of the CSV; they are validated to hold only 0/1 after loading.
dummy_vars = [
    'factor(weather)Light_Rain',
    'factor(weather)Light_Snow',
    'factor(weather)Normal',
    'factor(weather)Rain',
    'factor(weather)Snow',
    'factor(temperature)Cold',
    'factor(temperature)Extra_cold',
    'factor(temperature)Normal',
    'factor(day_of_week)weekday',
    'factor(day_of_week)weekend',
    'factor(time_of_day)Afternoon_peak',
    'factor(time_of_day)Morning_peak',
    'factor(time_of_day)Off-peak',
]

# Every indicator column is read as an unsigned 8-bit integer.
dtype_dict.update(dict.fromkeys(dummy_vars, 'uint8'))

In [None]:
data_path = 'data/Dataset-PT.csv'

# Load the CSV with the explicit dtypes; Calendar_date is parsed as YYYYMMDD.
df = pd.read_csv(data_path, dtype=dtype_dict, parse_dates=['Calendar_date'], date_format='%Y%m%d')

# Continuous columns that the statistics and plots below operate on.
numeric_cols = [
    'arrival_delay',
    'dwell_time',
    'travel_time_for_previous_section',
    'scheduled_travel_time',
    'upstream_stop_delay',
    'origin_delay',
    'previous_bus_delay',
    'previous_trip_travel_time',
    'traffic_condition',
    'recurrent_delay',
]

# Label columns are stored with the pandas 'category' dtype.
categorical_columns = ['weather', 'temperature', 'day_of_week', 'time_of_day']
for col in categorical_columns:
    as_category = df[col].astype('category')
    df[col] = as_category

# Validation of dummy variables: stop at the first column holding anything but 0/1.
for var in dummy_vars:
    only_binary = df[var].isin([0, 1]).all()
    if only_binary:
        continue
    raise ValueError(f"The variable {var} contains values other than 0 and 1")

In [None]:
# Add the numeric weekday (0=Monday ... 6=Sunday) as the second column.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run.
if 'day_of_week_num' not in df.columns:
    df.insert(1, 'day_of_week_num', df['Calendar_date'].dt.dayofweek)

# Calculate Statistics
## Continuous Variables

In [None]:
def calculate_continuous_statistics(df, cols):
    """Summarise each column in ``cols`` and print the summary as markdown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to summarise.
    cols : iterable of str
        Column names; one output row is produced per name, in this order.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Mean', 'Median', 'Standard Deviation',
        '95th Percentile' and 'Maximum'.
    """
    # Each statistic is collected as one list, aligned with `cols`.
    summary = {
        'Variable': [name for name in cols],
        'Mean': [df[name].mean() for name in cols],
        'Median': [df[name].median() for name in cols],
        'Standard Deviation': [df[name].std() for name in cols],
        '95th Percentile': [df[name].quantile(0.95) for name in cols],
        'Maximum': [df[name].max() for name in cols],
    }
    stats_df = pd.DataFrame(summary)

    # Echo the table so it is visible in the cell output.
    print("\n### Descriptive Statistics for Continuous Variables ###\n")
    print(stats_df.to_markdown(index=False))

    return stats_df


# Summarise every column listed in numeric_cols; the call prints the table
# and the resulting DataFrame is kept in continuous_stats.
continuous_stats = calculate_continuous_statistics(df, numeric_cols)

## Categorical Variables

In [None]:
def calculate_categorical_frequencies(df, cols):
    """Print a markdown frequency table for each column in ``cols``.

    Each table lists the category, its count ('Absolute Frequency') and its
    share of the counted rows in percent ('Relative Frequency (%)').
    Nothing is returned.
    """
    print("\n### Absolute and Relative Frequencies for Categorical Variables ###\n")

    for col in cols:
        column = df[col]
        counts = column.value_counts()
        percentages = column.value_counts(normalize=True) * 100

        # Both series share the category index, so they line up row by row.
        table = pd.DataFrame({'Absolute Frequency': counts, 'Relative Frequency (%)': percentages})
        # Move the categories into a regular column named after the variable.
        table = table.reset_index()
        table = table.rename(columns={'index': col})

        print(f"\nFrequencies for {col}:\n")
        print(table.to_markdown(index=False))


# Print the frequency tables for the four label columns
# (the same names as categorical_columns defined when the data was loaded).
categorical_vars = ['weather', 'temperature', 'day_of_week', 'time_of_day']
calculate_categorical_frequencies(df, categorical_vars)

## Continuous Variables by Day of the Week

In [None]:
def calculate_statistics_by_day_of_week(df, cols):
    """Return the mean and standard deviation of ``cols`` per weekday.

    Rows are grouped on 'day_of_week_num' (0=Monday, ..., 6=Sunday). The result
    has that key as a regular column followed by one (column, 'mean') and one
    (column, 'std') entry per requested column.
    """
    per_day = df.groupby('day_of_week_num')[cols]
    return per_day.agg(['mean', 'std']).reset_index()


# Calculate aggregated statistics by day of the week excluding certain variables.
# The filtered column list is built here because this cell runs before the
# plotting section that also defines it; without these two lines a fresh
# kernel fails with NameError on numeric_cols_filtered.
exclude_vars = ['travel_time_for_previous_section', 'scheduled_travel_time', 'origin_delay']
numeric_cols_filtered = [col for col in numeric_cols if col not in exclude_vars]
continuous_stats_by_day = calculate_statistics_by_day_of_week(df, numeric_cols_filtered)

In [None]:
# Display the per-weekday mean/std table computed in the previous cell.
continuous_stats_by_day

In [None]:
## Continuous Variables by Stop Sequence
def calculate_grouped_statistics(df, group_by_col, continuous_cols):
    """Aggregate ``continuous_cols`` per value of ``group_by_col``.

    Returns a DataFrame indexed by the group key whose columns are
    (column, 'mean'), (column, 'median') and (column, 'std') for each requested
    column. A heading naming the group column is printed before returning.
    """
    aggregations = ['mean', 'median', 'std']
    grouped = df.groupby(group_by_col)
    summary = grouped[continuous_cols].agg(aggregations)
    print(f"\n### Aggregated Statistics by {group_by_col} ###\n")
    return summary


# Mean, median and standard deviation of every numeric column per stop_sequence value.
stats_by_stop = calculate_grouped_statistics(df, 'stop_sequence', numeric_cols)

In [None]:
# Display the per-stop statistics table computed in the previous cell.
stats_by_stop

In [None]:
## Categorical Variables by Day of the Week
def calculate_categorical_frequencies_by_group(df, group_by_col, categorical_cols):
    """Compute, per group, the percentage share of each category.

    For every column in ``categorical_cols`` the rows are counted per
    (``group_by_col`` value, category) pair and each group's counts are turned
    into percentages of that group's total. Only the percentages are kept; the
    raw counts are an intermediate step.

    Returns
    -------
    dict
        Maps each categorical column name to a DataFrame indexed by the group
        key, with one column per category named '(%)_<category>'.
    """
    combined_freq_dict = {}

    for col in categorical_cols:
        # Counts per group (rows) and category (columns); missing pairs become 0.
        counts = df.groupby(group_by_col)[col].value_counts().unstack().fillna(0)

        # Scale each row by its own total to get percentages.
        row_totals = counts.sum(axis=1)
        shares = counts.div(row_totals, axis=0) * 100

        # Tag the columns with '(%)' and flatten the two-level header to '(%)_<category>'.
        tagged = pd.concat([shares], axis=1, keys=['(%)'])
        tagged.columns = [f'{lvl1}_{lvl2}' for lvl1, lvl2 in tagged.columns]

        combined_freq_dict[col] = tagged

    return combined_freq_dict


# Label columns whose category shares are computed per weekday.
categorical_vars = ['weather', 'temperature', 'day_of_week', 'time_of_day']

# One table of percentages per label column, grouped by day_of_week_num.
frequencies_by_day = calculate_categorical_frequencies_by_group(df, 'day_of_week_num', categorical_vars)

# Print every table (not just one example); each holds relative frequencies only.
for col, freqs in frequencies_by_day.items():
    print(f"\n### Combined Frequencies for {col} ###\n")
    print(freqs)

In [None]:
## Categorical Variables by Stop Sequence
def calculate_categorical_frequencies_by_stop_sequence(df, categorical_cols):
    """Compute the percentage share of each category per ``stop_sequence`` value.

    Returns a dict mapping every name in ``categorical_cols`` to a DataFrame
    indexed by stop_sequence, with one '(%)_<category>' column per category.
    """
    def _shares_for(col):
        # Count rows per (stop_sequence, category); absent pairs count as 0.
        counts = df.groupby('stop_sequence')[col].value_counts().unstack().fillna(0)
        # Percentages within each stop_sequence row.
        percentages = counts.div(counts.sum(axis=1), axis=0) * 100
        # Prefix the category names with '(%)_' via a temporary two-level header.
        labelled = pd.concat([percentages], axis=1, keys=['(%)'])
        labelled.columns = [f'{lvl1}_{lvl2}' for lvl1, lvl2 in labelled.columns]
        return labelled

    combined_freq_dict = {}
    for col in categorical_cols:
        combined_freq_dict[col] = _shares_for(col)
    return combined_freq_dict

In [None]:
# Calculate relative frequencies for categorical variables by stop_sequence
frequencies_by_stop_sequence = calculate_categorical_frequencies_by_stop_sequence(df, categorical_vars)

# Print the percentage table of every categorical variable in turn.
for col, freqs in frequencies_by_stop_sequence.items():
    print(f"\n### Relative Frequencies for {col} by stop_sequence ###\n")
    print(freqs)

# Plots Mean and Standard Deviation
## Continuous Variables vs. Day of the Week
# Numeric columns left out of the per-weekday statistics and plots.
exclude_vars = ['travel_time_for_previous_section', 'scheduled_travel_time', 'origin_delay']

# numeric_cols without the excluded names, original order preserved.
numeric_cols_filtered = [col for col in numeric_cols if col not in exclude_vars]


def calculate_and_plot_by_day(df, cols, group_by_col='day_of_week_num', exclude=None):
    """Plot the per-group mean of each column with a +/- one std band.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    cols : list of str
        Columns to aggregate and plot, one subplot each (two per row).
    group_by_col : str
        Grouping column; the x axis is always labelled with the seven
        weekday names, whatever column is passed.
    exclude : list of str, optional
        Names removed from ``cols`` before aggregating.

    Returns
    -------
    pandas.DataFrame
        The grouped mean/std table that was plotted, with the group key as a column.
    """
    if exclude is not None:
        cols = [name for name in cols if name not in exclude]

    grouped_stats = df.groupby(group_by_col)[cols].agg(['mean', 'std']).reset_index()

    # Two plots per row; the +1 rounds an odd count up to a full row.
    num_cols = 2
    num_rows = (len(cols) + 1) // num_cols
    fig = plt.figure(figsize=(15, num_rows * 5))

    weekday_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    for position, col in enumerate(cols, start=1):
        ax = fig.add_subplot(num_rows, num_cols, position)

        x = grouped_stats[group_by_col]
        center = grouped_stats[(col, 'mean')]
        spread = grouped_stats[(col, 'std')]

        # Line for the mean, shaded band for mean +/- one standard deviation.
        ax.plot(x, center, label='Mean', color='blue', marker='o')
        ax.fill_between(x, center - spread, center + spread, color='blue', alpha=0.2, label='Standard Deviation')

        ax.set_title(f'Mean and Standard Deviation of {col} by {group_by_col}')
        ax.set_xlabel('Day of the Week (0=Monday, 6=Sunday)')
        ax.set_ylabel(col.capitalize())
        ax.set_xticks(range(7))
        ax.set_xticklabels(weekday_labels)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

    return grouped_stats


In [None]:
# Plot mean +/- std per weekday for every numeric column except those in
# exclude_vars; the aggregated table is kept in stats_by_day.
stats_by_day = calculate_and_plot_by_day(df, numeric_cols, exclude=exclude_vars)
## Categorical Variables vs. Day of the Week
# day_of_week is left out of the categorical plots.
exclude_categorical_vars = ['day_of_week']


def plot_categorical_frequencies_by_day(df, categorical_cols, group_by_col='day_of_week_num'):
    """Draw one line chart per categorical column showing category shares per group.

    For each column in ``categorical_cols`` the share (in percent) of every
    category within each ``group_by_col`` value is plotted as one line. The x
    axis is labelled with the seven weekday names. One figure is shown per column.
    """
    weekday_labels = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

    for col in categorical_cols:
        # Counts per (group, category), then percentages within each group.
        counts = df.groupby(group_by_col)[col].value_counts().unstack().fillna(0)
        shares = counts.div(counts.sum(axis=1), axis=0) * 100

        fig, ax = plt.subplots(figsize=(15, 7))

        # One line per category.
        for category in shares.columns:
            ax.plot(shares.index, shares[category], marker='o', label=f'{col}: {category}')

        ax.set_title(f'Relative Frequencies of {col} by {group_by_col}')
        ax.set_xlabel('Day of the Week (0=Monday, 6=Sunday)')
        ax.set_ylabel('Relative Frequency (%)')
        ax.set_xticks(range(7))
        ax.set_xticklabels(weekday_labels)
        ax.legend()
        ax.grid(True)

        plt.show()


# Plot relative frequencies of categorical variables by day of the week.
# The filtered list is built here because this call runs before the later
# statement that defines categorical_vars_filtered; without it a fresh kernel
# fails with NameError.
categorical_vars_filtered = [col for col in categorical_vars if col not in exclude_categorical_vars]
plot_categorical_frequencies_by_day(df, categorical_vars_filtered)
## Categorical Variables vs. Stop Sequence
# day_of_week is left out of the per-stop categorical plots.
exclude_categorical_vars = ['day_of_week']

# categorical_vars without the excluded names, original order preserved.
categorical_vars_filtered = [col for col in categorical_vars if col not in exclude_categorical_vars]


In [None]:
def plot_categorical_frequencies_by_stop_sequence(df, categorical_cols, group_by_col='stop_sequence'):
    """Draw one line chart per categorical column showing category shares per group.

    For each column in ``categorical_cols`` the share (in percent) of every
    category within each ``group_by_col`` value is plotted as one line, with
    the group key on the x axis. One figure is shown per column.
    """
    for col in categorical_cols:
        # Counts per (group, category), then percentages within each group.
        counts = df.groupby(group_by_col)[col].value_counts().unstack().fillna(0)
        group_totals = counts.sum(axis=1)
        shares = counts.div(group_totals, axis=0) * 100

        fig, ax = plt.subplots(figsize=(15, 7))

        # One line per category.
        for category in shares.columns:
            ax.plot(shares.index, shares[category], marker='o', label=f'{col}: {category}')

        ax.set_title(f'Relative Frequencies of {col} by {group_by_col}')
        ax.set_xlabel(group_by_col.capitalize())
        ax.set_ylabel('Relative Frequency (%)')
        ax.legend()
        ax.grid(True)

        plt.show()


# One chart per column in categorical_vars_filtered, grouped by stop_sequence.
plot_categorical_frequencies_by_stop_sequence(df, categorical_vars_filtered)

In [None]:
# Histogram Plots
## Histograms of Continuous Variables
def plot_histogram(df, cols, bins=30):
    """Show a histogram with a KDE curve for each column, two plots per row.

    ``bins`` is forwarded to seaborn's histplot for every column.
    """
    # The +1 rounds an odd number of columns up to a full row.
    rows = (len(cols) + 1) // 2
    fig = plt.figure(figsize=(15, rows * 4))

    for position, col in enumerate(cols, start=1):
        ax = fig.add_subplot(rows, 2, position)
        sns.histplot(df[col], kde=True, bins=bins, ax=ax)
        ax.set_title(f'Histogram of {col}')

    plt.tight_layout()
    plt.show()


# Re-states the same ten numeric columns defined when the data was loaded,
# then draws one histogram per column.
numeric_cols = ['arrival_delay', 'dwell_time', 'travel_time_for_previous_section',
                'scheduled_travel_time', 'upstream_stop_delay', 'origin_delay',
                'previous_bus_delay', 'previous_trip_travel_time', 'traffic_condition',
                'recurrent_delay']
plot_histogram(df, numeric_cols)

In [None]:
## Crossed Histograms between Continuous Variables
def plot_crossed_histograms_grid(df, pairs, bins=30):
    """Show a 2D histogram for each (x column, y column) pair, two plots per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    pairs : sequence of (str, str)
        Column names used for the x and y axis of each plot.
    bins : int
        Forwarded to seaborn's histplot.
    """
    num_cols = 2
    # The +1 rounds an odd number of pairs up to a full row.
    num_rows = (len(pairs) + 1) // num_cols

    fig = plt.figure(figsize=(15, num_rows * 5))

    for position, (col_x, col_y) in enumerate(pairs, start=1):
        ax = fig.add_subplot(num_rows, num_cols, position)

        # Bivariate histogram of the pair.
        sns.histplot(data=df, x=col_x, y=col_y, bins=bins, pthresh=.1, cmap="viridis", ax=ax)

        ax.set_title(f'{col_y} vs {col_x}')
        ax.set_xlabel(col_x.capitalize())
        ax.set_ylabel(col_y.capitalize())

    plt.tight_layout()
    plt.show()


# (x column, y column) pairs to cross-plot as 2D histograms.
continuous_pairs = [
    ('arrival_delay', 'previous_bus_delay'),
    ('arrival_delay', 'upstream_stop_delay'),
    ('travel_time_for_previous_section', 'scheduled_travel_time'),
    ('previous_trip_travel_time', 'arrival_delay'),
    ('traffic_condition', 'recurrent_delay')
]

# Draw all pairs in a two-column grid.
plot_crossed_histograms_grid(df, continuous_pairs)


In [None]:
## Crossed Histograms between Continuous and Categorical Variables
def plot_continuous_vs_categorical_grid(df, continuous_vars, categorical_vars):
    """Show a box plot for every (continuous, categorical) column combination.

    Plots are laid out two per row, iterating over the categorical columns
    for each continuous column in turn.
    """
    # Same ordering as nested loops: continuous outer, categorical inner.
    combinations = [(cont, cat) for cont in continuous_vars for cat in categorical_vars]

    num_cols = 2
    # The +1 rounds an odd number of plots up to a full row.
    num_rows = (len(continuous_vars) * len(categorical_vars) + 1) // num_cols

    fig = plt.figure(figsize=(15, num_rows * 5))

    for plot_idx, (col_cont, col_cat) in enumerate(combinations, start=1):
        ax = fig.add_subplot(num_rows, num_cols, plot_idx)

        # Distribution of the continuous column within each category.
        sns.boxplot(x=col_cat, y=col_cont, data=df, ax=ax)

        ax.set_title(f'{col_cont.capitalize()} by {col_cat.capitalize()}')
        ax.set_xlabel(col_cat.capitalize())
        ax.set_ylabel(col_cont.capitalize())
        # Tilt the category labels.
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


# Continuous columns (y axis) and grouping columns (x axis) to cross in box plots.
# day_of_week_num is the numeric weekday column, used here as a grouping key.
interesting_continuous_vars = ['arrival_delay', 'travel_time_for_previous_section', 'scheduled_travel_time',
                               'traffic_condition']
interesting_categorical_vars = ['weather', 'temperature', 'time_of_day', 'day_of_week_num']

# One box plot per combination, in a two-column grid.
plot_continuous_vs_categorical_grid(df, interesting_continuous_vars, interesting_categorical_vars)

In [None]:
# Bar Charts
## Bar Charts of Categorical Variables
def plot_categorical_bars(df, categorical_cols):
    """Show a bar chart of category counts for each column, two plots per row."""
    # The +1 rounds an odd number of columns up to a full row.
    rows = (len(categorical_cols) + 1) // 2
    fig = plt.figure(figsize=(15, rows * 5))

    for position, col in enumerate(categorical_cols, start=1):
        ax = fig.add_subplot(rows, 2, position)

        # Number of rows per category.
        value_counts = df[col].value_counts()
        sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)

        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col.capitalize())
        ax.set_ylabel('Frequency')
        # Tilt the category labels.
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


# Rebinds categorical_vars to three columns (day_of_week is no longer included).
categorical_vars = ['weather', 'temperature', 'time_of_day']

# One bar chart of category counts per column.
plot_categorical_bars(df, categorical_vars)


In [None]:
# Box Plots
## Box Plots of Continuous Variables
def plot_boxplots_continuous(df, continuous_vars):
    """Show a box plot for each column in ``continuous_vars``, two plots per row."""
    num_cols = 2
    # The +1 rounds an odd number of columns up to a full row.
    num_rows = (len(continuous_vars) + 1) // num_cols

    fig = plt.figure(figsize=(15, num_rows * 5))

    for position, col in enumerate(continuous_vars, start=1):
        ax = fig.add_subplot(num_rows, num_cols, position)
        sns.boxplot(data=df, y=col, ax=ax)
        ax.set_title(f'Boxplot of {col}')
        ax.set_ylabel(col.capitalize())

    plt.tight_layout()
    plt.show()


# Columns to show as individual box plots.
continuous_vars = ['arrival_delay', 'travel_time_for_previous_section', 'scheduled_travel_time',
                   'traffic_condition', 'recurrent_delay', 'upstream_stop_delay', 'dwell_time']

# One box plot per column, in a two-column grid.
plot_boxplots_continuous(df, continuous_vars)

In [None]:
# Outliers
from scipy import stats


def remove_multiple_outliers(df, cols, threshold=3):
    """Drop rows whose absolute z-score exceeds ``threshold`` in any of ``cols``.

    Columns are processed in order, and each column's z-scores are computed on
    the rows that survived the previous columns, so the result can depend on
    the order of ``cols``. A removal summary is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    cols : list of str
        Columns to filter on.
    threshold : float
        Largest absolute z-score that is still kept.

    Returns
    -------
    pandas.DataFrame
        The rows that passed the filter for every column.
    """
    initial_rows = df.shape[0]
    total_removed = pd.Series(0, index=cols)  # rows dropped by each column's own filter

    for col in cols:
        rows_before = df.shape[0]
        is_not_outlier = np.abs(stats.zscore(df[col])) <= threshold

        # Count against the rows still present, not the initial total; otherwise
        # rows already dropped by earlier columns are attributed to this one too.
        total_removed[col] = rows_before - is_not_outlier.sum()

        df = df[is_not_outlier]

    final_rows = df.shape[0]
    total_removed_absolute = initial_rows - final_rows

    # Percentages are relative to the initial row count; an empty input reports 0%.
    denominator = initial_rows if initial_rows else 1
    removal_percentage = (total_removed / denominator) * 100
    total_removal_percentage = (total_removed_absolute / denominator) * 100

    print("\n### Outlier Removal Statistics ###\n")
    print(f"Initial number of rows: {initial_rows}")
    print(f"Final number of rows: {final_rows}")
    print(f"Total rows removed: {total_removed_absolute} ({total_removal_percentage:.2f}%)")
    print("\nRows removed per column:")
    for col in cols:
        print(f" - {col}: {total_removed[col]} removed ({removal_percentage[col]:.2f}%)")

    return df


# Define continuous variables for which to remove outliers.
# The triple-quoted block below is an unused string literal holding a wider
# column list; only the two columns assigned after it are actually filtered.
'''
continuous_vars = ['arrival_delay', 'travel_time_for_previous_section', 'scheduled_travel_time',
                   'traffic_condition', 'recurrent_delay', 'upstream_stop_delay', 'dwell_time']
'''
continuous_vars = ['arrival_delay', 'dwell_time']

# Remove outliers from multiple columns (default threshold of 3 is used)
df_cleaned_multiple = remove_multiple_outliers(df, continuous_vars)
# Export the cleaned DataFrame as a CSV file
output_path = "data/Dataset-PT_no_sample.csv"
df_cleaned_multiple.to_csv(output_path, index=False)

print(f"The DataFrame has been successfully exported to {output_path}")

In [None]:
# Sub Sampling
## Stratified Sampling
# Order the cleaned rows by date and keep every 10th row.
df_sorted = df_cleaned_multiple.sort_values('Calendar_date')
df_time_sampled = df_sorted.iloc[::10, :].reset_index(drop=True)

# Split off 10% of the thinned rows, stratified on day_of_week; that 10% part
# (df_stratified) is the sample exported below, df_train is the remainder.
df_train, df_stratified = train_test_split(
    df_time_sampled,
    test_size=0.1,
    stratify=df_time_sampled['day_of_week'],
    random_state=42
)
# Export the stratified sample as a CSV file
output_path = "data/Dataset-PT_stratified.csv"
df_stratified.to_csv(output_path, index=False)

print(f"The DataFrame has been successfully exported to {output_path}")

In [None]:
## KMeans Sampling
from sklearn.cluster import KMeans


def kmeans_subsampling(df, columns_to_exclude, n_clusters=10, sample_percentage=0.01):
    """Cluster the rows with KMeans and draw the same number of rows from each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    columns_to_exclude : list of str
        Columns left out of the clustering features. 'arrival_delay' is always
        left out as well.
    n_clusters : int
        Number of KMeans clusters.
    sample_percentage : float
        Fraction of the total row count to draw overall; it is split evenly
        across clusters.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with all original columns plus a 'cluster' column.
        Rows are drawn with replacement, so duplicates can occur.
    """
    # Clustering features: everything except the excluded columns and the target.
    X = df.drop(columns=columns_to_exclude).drop(columns=['arrival_delay'])

    # Even share of the requested sample size for every cluster.
    samples_per_cluster = int(sample_percentage * X.shape[0] / n_clusters)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)

    # Attach the labels to a copy: assigning into `df` would add a 'cluster'
    # column to the caller's frame and leak it into anything derived from it later.
    df_clustered = df.assign(cluster=clusters)

    df_kmeans_sampled = df_clustered.groupby('cluster', group_keys=False).apply(
        lambda x: x.sample(
            n=samples_per_cluster,
            replace=True,  # lets clusters smaller than samples_per_cluster still yield n rows
            random_state=42
        )
    ).reset_index(drop=True)

    return df_kmeans_sampled


# Columns kept out of the KMeans feature matrix: the date, the two id columns
# and the four label columns.
columns_to_drop = ['Calendar_date', 'route_id', 'bus_id', 'weather', 'temperature', 'day_of_week', 'time_of_day']

# Sample with the function defaults (10 clusters, 1% of the rows in total),
# then discard the helper 'cluster' column before exporting.
df_kmeans_sampled = kmeans_subsampling(df_cleaned_multiple, columns_to_drop)
df_kmeans_sampled = df_kmeans_sampled.drop(columns=['cluster'])
# Export the KMeans sample as a CSV file
output_path = "data/Dataset-PT_KMeans.csv"
df_kmeans_sampled.to_csv(output_path, index=False)

print(f"The DataFrame has been successfully exported to {output_path}")

In [None]:
# Plots Original vs. KMeans Distribution
def plot_distributions(df_original, df_sampled, cols):
    """Overlay the density histograms of two frames, one figure per column.

    The original frame is drawn in blue and the sampled frame in orange, each
    with a KDE curve.
    """
    series_specs = (
        (df_original, 'blue', 'Original'),
        (df_sampled, 'orange', 'Sampled'),
    )
    for col in cols:
        fig, ax = plt.subplots(figsize=(10, 6))
        for frame, colour, tag in series_specs:
            sns.histplot(frame[col], color=colour, label=tag, kde=True, stat="density", ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.legend()
        plt.show()


# Overlay the density histograms of the full frame `df` and the KMeans sample.
# NOTE(review): `df` still contains the rows removed as outliers, while the
# sample was drawn from df_cleaned_multiple — confirm this is the intended baseline.
plot_distributions(df, df_kmeans_sampled, numeric_cols)


In [None]:
# Statistical Comparisons
## Statistical Comparison
def compare_statistics(df_original, df_sampled, numeric_cols, threshold=0.05):
    """Print the relative difference between the column means of two frames.

    For each column the absolute difference of the means is divided by the
    magnitude of the original mean, so the result is never negative. When the
    original mean is zero the ratio is undefined and NaN is reported.

    ``threshold`` is only echoed in the output for reference; no comparison
    against it is made.
    """
    print("\n### Statistics Comparison ###")
    for col in numeric_cols:
        original_mean = df_original[col].mean()
        sampled_mean = df_sampled[col].mean()

        # Divide by the magnitude so a negative original mean does not flip the sign.
        scale = abs(original_mean)
        if scale:
            mean_diff = abs(original_mean - sampled_mean) / scale
        else:
            mean_diff = float('nan')
        print(f"{col} - Mean Difference: {mean_diff:.2%} (Threshold: {threshold * 100}%)")

        # Other comparisons of medians and standard deviations can be added here


# Compare the column means of the full frame against each of the two samples.
compare_statistics(df, df_stratified, numeric_cols)
compare_statistics(df, df_kmeans_sampled, numeric_cols)

In [None]:
## T-test
def perform_t_test(df_original, df_sampled, cols, alpha=0.05):
    """Print a two-sample t-test p-value per column, comparing the two frames.

    A column is labelled "Significant" when its p-value is below ``alpha``
    and "Not Significant" otherwise. Nothing is returned.
    """
    print("\n### T-tests ###")
    for col in cols:
        original_values = df_original[col]
        sampled_values = df_sampled[col]
        _, p_value = ttest_ind(original_values, sampled_values)
        if p_value < alpha:
            result = "Significant"
        else:
            result = "Not Significant"
        print(f"{col} - p-value: {p_value:.3f} ({result})")


# Run the per-column t-tests for the full frame against each of the two samples.
perform_t_test(df, df_stratified, numeric_cols)
perform_t_test(df, df_kmeans_sampled, numeric_cols)


In [None]:
## Chi-square
def perform_chi_square_test(df_original, df_sampled, dummy_vars, alpha=0.05):
    """Print a chi-square p-value per variable, comparing its value counts in two frames.

    For each variable a contingency table is built with one row per observed
    value and one column per dataset ('original', 'sampled'), holding how often
    the value occurs in that dataset. A variable is labelled "Significant"
    when the p-value is below ``alpha``. Nothing is returned.
    """
    print("\n### Chi-Square Tests ###")
    for var in dummy_vars:
        # Count values separately in each frame. Cross-tabulating the two
        # columns directly would pair rows by index label, which says nothing
        # about whether the two distributions agree.
        contingency_table = pd.concat(
            [df_original[var].value_counts(), df_sampled[var].value_counts()],
            axis=1,
            keys=['original', 'sampled'],
        ).fillna(0)  # a value missing from one frame counts as 0 there
        _, p_value, _, _ = chi2_contingency(contingency_table)
        result = "Significant" if p_value < alpha else "Not Significant"
        print(f"{var} - p-value: {p_value:.3f} ({result})")


# Run the per-indicator chi-square tests for the full frame against each of the two samples.
perform_chi_square_test(df, df_stratified, dummy_vars)
perform_chi_square_test(df, df_kmeans_sampled, dummy_vars)

In [None]:
# Creating Scenario 2 and Scenario 3
## Scenario 2
# Scenario 2 drops three columns from each of the three dataset variants.
columns_to_drop = ['travel_time_for_previous_section', 'recurrent_delay', 'previous_trip_travel_time']
s2_nosample = df_cleaned_multiple.drop(columns=columns_to_drop)
s2_stratified = df_stratified.drop(columns=columns_to_drop)
s2_kMeans = df_kmeans_sampled.drop(columns=columns_to_drop)

# Write each variant to its own CSV and report the path once it is saved.
for scenario_frame, output_path in (
    (s2_nosample, "data/s2_no_sample.csv"),
    (s2_stratified, "data/s2_stratified.csv"),
    (s2_kMeans, "data/s2_KMeans.csv"),
):
    scenario_frame.to_csv(output_path, index=False)
    print(f"The DataFrame has been successfully exported to {output_path}")

In [None]:
## Scenario 3
# Scenario 3 drops five columns from each of the three dataset variants.
columns_to_drop = [
    'travel_time_for_previous_section',
    'recurrent_delay',
    'previous_trip_travel_time',
    'dwell_time',
    'traffic_condition',
]
s3_nosample = df_cleaned_multiple.drop(columns=columns_to_drop)
s3_stratified = df_stratified.drop(columns=columns_to_drop)
s3_kMeans = df_kmeans_sampled.drop(columns=columns_to_drop)

# Write each variant to its own CSV and report the path once it is saved.
for scenario_frame, output_path in (
    (s3_nosample, "data/s3_no_sample.csv"),
    (s3_stratified, "data/s3_stratified.csv"),
    (s3_kMeans, "data/s3_KMeans.csv"),
):
    scenario_frame.to_csv(output_path, index=False)
    print(f"The DataFrame has been successfully exported to {output_path}")