In [32]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [33]:
# Load the Jira export from CSV.
# The location is derived from the current user's home directory instead of a
# hard-coded absolute path, so the notebook is not tied to a single account;
# point DATA_PATH somewhere else if the export lives in a different place.
DATA_PATH = Path.home() / 'Downloads' / 'Jira_exported_data.csv'
df = pd.read_csv(DATA_PATH)

# Number of missing values per column
print(df.isnull().sum())

Issue Type      0
Summary         0
Status          0
Updated         0
Issue key       0
Issue id        0
Parent id     235
Created         0
Priority        0
Reporter        0
Assignee       21
Resolved        0
dtype: int64


In [ ]:
def convert_to_datetime(df, columns=('Created', 'Resolved')):
    """
    Converts the given columns (by default 'Created' and 'Resolved') to datetime format.

    The conversion is all-or-nothing: every column is parsed first and `df`
    is only modified once all of them parsed successfully, so a failure never
    leaves the frame half-converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to convert. It is modified in place.
    columns : iterable of str, optional
        Names of the columns to convert.

    Raises
    ------
    Exception
        Whatever the column lookup or `pd.to_datetime` raises (for example a
        missing column or an unparseable value). A short message is printed
        first and the original exception is then re-raised unchanged.
    """
    try:
        # Parse into a scratch dict first; df stays untouched if anything fails.
        converted = {name: pd.to_datetime(df[name]) for name in columns}
    except Exception as e:
        print(f"Error in converting columns to datetime: {str(e)}")
        raise  # bare raise re-raises the active exception with its full traceback

    for name, values in converted.items():
        df[name] = values
        
# Convert the date columns of the loaded frame (df is modified in place)
convert_to_datetime(df)

print(df.dtypes) #Check the conversion was applied for Created and Resolved: should be datetime64[ns]

In [ ]:
def calculate_and_plot_delivery(df):
    """
    Adds a 'Delivery_Time' column to df and draws its boxplot.

    'Delivery_Time' is the time between 'Created' and 'Resolved' expressed in
    days, rounded up to the next whole day and stored as an integer.
    The frame is modified in place; nothing is returned.
    """
    elapsed = df['Resolved'] - df['Created']
    elapsed_days = elapsed.dt.total_seconds() / 86400  # 86400 seconds in a day
    whole_days = np.ceil(elapsed_days).astype(int)
    df.loc[:, 'Delivery_Time'] = whole_days

    plt.figure(figsize=(10, 5))
    df.boxplot(column=['Delivery_Time'])
    plt.show()
    
# Add the 'Delivery_Time' column to df and show its boxplot
calculate_and_plot_delivery(df)

In [ ]:
# Distribution of delivery times before the outliers are removed
def plot_distribution_and_stats(df):
    """
    Plots the distribution of 'Delivery_Time' with its mean and median marked.

    Draws a 30-bin histogram with a KDE curve, a dashed red vertical line at
    the mean and a solid green vertical line at the median, a text annotation
    with the numeric value next to each line, and a legend identifying the
    two lines. The x and y ticks are set to fixed example values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Delivery_Time' column.
    """
    mean_delivery_time = df['Delivery_Time'].mean()
    median_delivery_time = df['Delivery_Time'].median()

    plt.figure(figsize=(10, 5))
    sns.histplot(df['Delivery_Time'], kde=True, bins=30)
    # Label the reference lines themselves so the legend can pick them up.
    plt.axvline(mean_delivery_time, color='r', linestyle='--', label='Mean')
    plt.axvline(median_delivery_time, color='g', linestyle='-', label='Median')

    # Add text annotations for mean and median
    plt.text(mean_delivery_time, plt.ylim()[1]*0.95, f'Mean: {mean_delivery_time:.2f}',
             color='r', ha='right')
    plt.text(median_delivery_time, plt.ylim()[1]*0.90, f'Median: {median_delivery_time:.2f}',
             color='g', ha='right')

    # Set custom ticks for x and y axes
    plt.xticks([0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90])  # Example x-ticks
    plt.yticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110])      # Example y-ticks

    # Build the legend from the labelled artists. Passing a dict to legend()
    # only hands its keys over as positional labels, which matplotlib attaches
    # to the first artists on the axes rather than to the mean/median lines.
    plt.legend()
    plt.xlabel('Delivery_Time')
    plt.ylabel('Frequency')
    plt.title('Distribution of Projected Delivery Times')
    plt.show()

# Usage: plot the 'Delivery_Time' distribution of the current df
plot_distribution_and_stats(df)

In [ ]:
# Remove outliers: keep only the rows whose 'Delivery_Time' is not below
# Q1 - 1.5*IQR and not above Q3 + 1.5*IQR.
# NOTE: df is rebound to the filtered frame, so re-running this cell
# recomputes the quartiles from already-trimmed data.
delivery_time = df['Delivery_Time']
Q1, Q3 = delivery_time.quantile(0.25), delivery_time.quantile(0.75)
IQR = Q3 - Q1
one_and_half_IQR = 1.5 * IQR
lower_fence = Q1 - one_and_half_IQR
upper_fence = Q3 + one_and_half_IQR
is_outlier = (delivery_time < lower_fence) | (delivery_time > upper_fence)
df = df[~is_outlier]

In [ ]:
# Distribution of delivery times after the outliers have been removed
# NOTE: this redefines the function of the same name declared in an earlier
# cell; the later definition is the one in effect from here on.
def plot_distribution_and_stats(df):
    """
    Plots the distribution of 'Delivery_Time' with its mean and median marked.

    Draws a 30-bin histogram with a KDE curve, a dashed red vertical line at
    the mean and a solid green vertical line at the median, a text annotation
    with the numeric value next to each line, and a legend identifying the
    two lines. The x and y ticks are set to fixed example values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Delivery_Time' column.
    """
    mean_delivery_time = df['Delivery_Time'].mean()
    median_delivery_time = df['Delivery_Time'].median()

    plt.figure(figsize=(10, 5))
    sns.histplot(df['Delivery_Time'], kde=True, bins=30)
    # Label the reference lines themselves so the legend can pick them up.
    plt.axvline(mean_delivery_time, color='r', linestyle='--', label='Mean')
    plt.axvline(median_delivery_time, color='g', linestyle='-', label='Median')

    # Add text annotations for mean and median
    plt.text(mean_delivery_time, plt.ylim()[1]*0.95, f'Mean: {mean_delivery_time:.2f}',
             color='r', ha='right')
    plt.text(median_delivery_time, plt.ylim()[1]*0.90, f'Median: {median_delivery_time:.2f}',
             color='g', ha='right')

    # Set custom ticks for x and y axes
    plt.xticks([0, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 70, 80, 90])  # Example x-ticks
    plt.yticks([0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110])      # Example y-ticks

    # Build the legend from the labelled artists. Passing a dict to legend()
    # only hands its keys over as positional labels, which matplotlib attaches
    # to the first artists on the axes rather than to the mean/median lines.
    plt.legend()
    plt.xlabel('Delivery_Time')
    plt.ylabel('Frequency')
    plt.title('Distribution of Projected Delivery Times')
    plt.show()

# Usage: plot the 'Delivery_Time' distribution of the current df
plot_distribution_and_stats(df)

In [ ]:
# Preview the first rows of the date columns next to the derived delivery time.
# Any failure (e.g. a missing column) is reported as a message instead of raising.
preview_columns = ['Created', 'Resolved', 'Delivery_Time']
try:
    preview = df[preview_columns].head()
    print(preview)
except Exception as err:
    print(f"Error in getting data: {str(err)}")

In [ ]:
def monte_carlo_simulation(df, n_runs, n_tickets, seed=None):
    """
    Simulates the average 'Delivery_Time' of a batch of tickets.

    Each run draws `n_tickets` values from df['Delivery_Time'] with
    replacement and records their mean. The draws are vectorised with numpy
    and processed in batches so that memory use stays bounded even for very
    large `n_runs`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'Delivery_Time' column. Missing values are
        dropped before sampling.
    n_runs : int
        Number of simulation runs (length of the returned list).
    n_tickets : int
        Number of values drawn per run; must be at least 1.
    seed : int or None, optional
        Seed for the random generator. Pass a fixed value to make the
        simulation reproducible; None draws fresh entropy on every call.

    Returns
    -------
    list of float
        One sample mean per run.

    Raises
    ------
    ValueError
        If `n_runs` is negative, `n_tickets` is smaller than 1, or there are
        no 'Delivery_Time' values to sample from.
    """
    if n_runs < 0:
        raise ValueError("n_runs must be non-negative")
    if n_tickets < 1:
        raise ValueError("n_tickets must be at least 1")

    delivery_times = df['Delivery_Time'].dropna().to_numpy(dtype=float)
    if delivery_times.size == 0:
        raise ValueError("df contains no 'Delivery_Time' values to sample from")

    rng = np.random.default_rng(seed)
    results = np.empty(n_runs, dtype=float)

    # Cap the number of random indices held in memory at once; a single
    # (n_runs, n_tickets) array would be far too large for big simulations.
    max_draws_per_batch = 5_000_000
    runs_per_batch = max(1, max_draws_per_batch // n_tickets)

    for start in range(0, n_runs, runs_per_batch):
        stop = min(start + runs_per_batch, n_runs)
        # Sampling with replacement == picking random positions independently.
        picks = rng.integers(0, delivery_times.size, size=(stop - start, n_tickets))
        results[start:stop] = delivery_times[picks].mean(axis=1)

    return results.tolist()

# Run the Monte Carlo simulation.
# Adjust the number of runs and the number of tickets per run as needed.
n_simulation_runs = 1000000
tickets_per_run = 374
simulation_results = monte_carlo_simulation(df, n_simulation_runs, tickets_per_run)

In [ ]:
# Put the simulated means into a single-column DataFrame
projected_column = 'Projected Delivery Time'
results_df = pd.DataFrame(simulation_results, columns=[projected_column])
projected_times = results_df[projected_column]

# Mean, median and the 2.5th / 97.5th percentiles of the simulated means
average_time = projected_times.mean()
median_time = projected_times.median()
confidence_interval = np.percentile(projected_times, [2.5, 97.5])

# Report the figures, one per line, then show the describe() table
print(
    f"Average Projected Delivery Time: {average_time} days",
    f"Median Projected Delivery Time: {median_time} days",
    f"95% Confidence Interval for Projected Delivery Time: {confidence_interval} days",
    sep="\n",
)
results_df.describe()

In [ ]:
# Requires 'results_df' (the simulation results) from the earlier cell.
# Mean, median and standard deviation of the simulated means
projected_times = results_df['Projected Delivery Time']
average_time = projected_times.mean()
median_time = projected_times.median()
std_dev = projected_times.std()
eight_sigmas = std_dev * 8
half_span = eight_sigmas / 2  # 4 standard deviations on each side of the mean

# Histogram with KDE; histplot returns the Axes it drew on
ax = sns.histplot(projected_times, kde=True)

# Vertical reference lines for mean and median
ax.axvline(average_time, color='r', linestyle='--', label=f'Mean: {average_time:.2f} days')
ax.axvline(median_time, color='g', linestyle='-', label=f'Median: {median_time:.2f} days')

# Shade mean +/- 4 sigmas, i.e. a band 8 sigmas wide
ax.axvspan(average_time - half_span, average_time + half_span,
           alpha=0.2, color='yellow', label='8 Sigmas Range')

# Legend, axis labels and title
ax.legend()
ax.set_xlabel('Projected Delivery Time (days)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Projected Delivery Times with Statistical Indicators')
plt.show()

In [ ]:
# Requires 'results_df' (the simulation results) from the earlier cell.
projected_times = results_df['Projected Delivery Time']
average_time = projected_times.mean()
median_time = projected_times.median()
std_dev = projected_times.std()

# Boolean masks: simulated means lying within mean +/- k standard deviations
# (between() is inclusive on both ends).
within_1_sigma = projected_times.between(average_time - std_dev, average_time + std_dev)
within_2_sigma = projected_times.between(average_time - 2*std_dev, average_time + 2*std_dev)
within_3_sigma = projected_times.between(average_time - 3*std_dev, average_time + 3*std_dev)

# Each count covers the whole +/- k sigma range, matching both the shaded
# span and the "k Sigma Range" label it is shown with. Subtracting the inner
# band would instead report only the ring between two ranges under a label
# that describes the full range.
count_within_1_sigma = within_1_sigma.sum()
count_within_2_sigma = within_2_sigma.sum()
count_within_3_sigma = within_3_sigma.sum()

sns.histplot(projected_times, kde=True)

plt.axvline(average_time, color='r', linestyle='--', label=f'Mean: {average_time:.2f} days')
plt.axvline(median_time, color='g', linestyle='-', label=f'Median: {median_time:.2f} days')

# Nested shaded spans; the wider ranges are drawn more transparent
plt.axvspan(average_time - std_dev, average_time + std_dev, alpha=0.3, color='blue', label=f'1 Sigma Range (n={count_within_1_sigma})')
plt.axvspan(average_time - 2*std_dev, average_time + 2*std_dev, alpha=0.2, color='orange', label=f'2 Sigma Range (n={count_within_2_sigma})')
plt.axvspan(average_time - 3*std_dev, average_time + 3*std_dev, alpha=0.1, color='green', label=f'3 Sigma Range (n={count_within_3_sigma})')

plt.legend(loc='upper right')
plt.xlabel('Projected Delivery Time (days)')
plt.ylabel('Frequency')
plt.title('Distribution of Projected Delivery Times with Sigma Ranges')
plt.show()

