# In [None]:
# 06_correlation_analysis.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.revenue_data_preprocessing import preprocess_tickets_data, preprocess_rpe_data
from utils.journal_data_preprocessing import preprocess_journal_data

# Load and preprocess the data
def _monthly_totals(frame, geounit_col, date_col, value_col, output_name):
    """Sum ``value_col`` per geounit and calendar month.

    Args:
        frame: Source rows. ``date_col`` must be datetime-like so that
            ``pd.Grouper`` can bucket it.
        geounit_col: Name of the geounit column in ``frame``.
        date_col: Name of the date column, bucketed at month start ('MS').
        value_col: Name of the column to sum.
        output_name: Name given to the summed column in the result.

    Returns:
        DataFrame with the columns ``Geounit``, ``Date`` and ``output_name``.
    """
    monthly = (
        frame.groupby([geounit_col, pd.Grouper(key=date_col, freq='MS')])[value_col]
        .sum()
        .reset_index()
    )
    monthly.columns = ['Geounit', 'Date', output_name]
    return monthly


def _revenue_correlations(group):
    """Return the Pearson correlation of each revenue column with operating days."""
    operating_days = group['Operating_Days']
    return pd.Series({
        'Tickets_Ops_Revenue_vs_Operating_Days':
            group['Tickets_Ops_Revenue'].corr(operating_days),
        'CIM_RPE_Ops_Service_Revenue_vs_Operating_Days':
            group['CIM_RPE_Ops_Service_Revenue'].corr(operating_days),
    })


print("Loading and preprocessing data...")
tickets_df = pd.read_csv('../raw_data/global_tickets_wles_ops_data.csv')
journal_df = pd.read_csv('../raw_data/global_journal_operatingtime.csv')
rpe_df = pd.read_csv('../raw_data/global_rpe_revenue.csv')

# NOTE(review): the preprocess_* helpers are project-local; they are assumed
# to parse the date columns used below into datetimes - confirm.
tickets_df = preprocess_tickets_data(tickets_df)
journal_df = preprocess_journal_data(journal_df)
rpe_df = preprocess_rpe_data(rpe_df)

# Filter for specific geounits (each source spells the column differently)
geounits_to_include = ['QTG', 'ECP', 'APG', 'NAO']

tickets_df = tickets_df[tickets_df['Sl Geounit (Code)'].isin(geounits_to_include)]
journal_df = journal_df[journal_df['Geounit'].isin(geounits_to_include)]
rpe_df = rpe_df[rpe_df['SL Geounit (Code)'].isin(geounits_to_include)]

# Aggregate each source by month and geounit
tickets_monthly = _monthly_totals(
    tickets_df, 'Sl Geounit (Code)', 'Adjusted Date',
    'Field Ticket USD net value', 'Tickets_Ops_Revenue',
)
journal_monthly = _monthly_totals(
    journal_df, 'Geounit', 'OA Start', 'Value', 'Operating_Days',
)
rpe_monthly = _monthly_totals(
    rpe_df, 'SL Geounit (Code)', 'Month Date',
    'RPE Revenue', 'CIM_RPE_Ops_Service_Revenue',
)

# Merge the datasets. Outer joins keep every (Geounit, Date) pair seen in any
# source; measures missing for a pair are then filled with 0.
merged_data = pd.merge(tickets_monthly, journal_monthly, on=['Geounit', 'Date'], how='outer')
merged_data = pd.merge(merged_data, rpe_monthly, on=['Geounit', 'Date'], how='outer').fillna(0)

# Add quarter column
merged_data['Quarter'] = merged_data['Date'].dt.to_period('Q')

# Calculate correlations for each geounit. The measure columns are selected
# explicitly so apply() never operates on the grouping column, which newer
# pandas versions deprecate.
correlations = (
    merged_data.groupby('Geounit')[
        ['Tickets_Ops_Revenue', 'Operating_Days', 'CIM_RPE_Ops_Service_Revenue']
    ]
    .apply(_revenue_correlations)
    .sort_values('Tickets_Ops_Revenue_vs_Operating_Days', ascending=False)
)

print("Correlations by Geounit:")
print(correlations)

# Plot correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(correlations, annot=True, cmap='coolwarm', center=0)
plt.title('Correlations with Operating Days by Geounit')
plt.tight_layout()
plt.show()

# Helpers and entry point for plotting the analysis of a single geounit
def _correlation_text(label, x, y):
    """Return the annotation text with Pearson r and p-value for ``x`` vs ``y``.

    ``stats.pearsonr`` raises ``ValueError`` for fewer than two observations,
    so in that case the text reports "n/a" instead of failing the whole plot.
    """
    if len(x) < 2:
        return f'{label} Correlation: n/a\np-value: n/a'
    corr, p_value = stats.pearsonr(x, y)
    return f'{label} Correlation: {corr:.2f}\np-value: {p_value:.4f}'


def _plot_dual_axis_series(ax, x_values, frame, revenue_series, title, xlabel):
    """Plot operating days on ``ax`` and the revenue series on a twin y-axis.

    Args:
        ax: Axes receiving the operating-days line, title, x label and legend.
        x_values: Values for the shared x-axis.
        frame: DataFrame holding ``Operating_Days`` and the revenue columns.
        revenue_series: Iterable of ``(column, label, color)`` triples.
        title: Title of the panel.
        xlabel: Label of the x-axis.
    """
    ax.plot(x_values, frame['Operating_Days'], label='Operating Days')
    ax.set_ylabel('Operating Days', color='tab:blue')
    ax.tick_params(axis='y', labelcolor='tab:blue')

    twin = ax.twinx()
    for column, label, color in revenue_series:
        twin.plot(x_values, frame[column], color=color, label=label)
    twin.set_ylabel('Revenue', color='tab:orange')
    twin.tick_params(axis='y', labelcolor='tab:orange')

    ax.set_title(title)
    ax.set_xlabel(xlabel)

    # Combine the handles of both y-axes into one legend on the primary axes.
    handles, labels = ax.get_legend_handles_labels()
    twin_handles, twin_labels = twin.get_legend_handles_labels()
    ax.legend(handles + twin_handles, labels + twin_labels, loc='upper left')


def plot_geounit_analysis(geounit):
    """Show a three-panel figure relating operating days to revenue for one geounit.

    The panels are: a scatter plot of both revenue measures against operating
    days (annotated with Pearson correlation and p-value), a monthly time
    series, and a quarterly time series. Rows are read from the module-level
    ``merged_data`` frame.

    Args:
        geounit: Value compared against the ``Geounit`` column of ``merged_data``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Sort chronologically so the line plots do not depend on merge row order.
    data = merged_data[merged_data['Geounit'] == geounit].sort_values('Date')

    revenue_series = (
        ('Tickets_Ops_Revenue', 'Tickets Ops Revenue', 'tab:orange'),
        ('CIM_RPE_Ops_Service_Revenue', 'CIM RPE Ops Service Revenue', 'tab:green'),
    )

    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 18))

    # Scatter plots
    for column, label, _color in revenue_series:
        sns.scatterplot(data=data, x='Operating_Days', y=column, ax=ax1, label=label)
    ax1.set_title(f'Scatter Plot: Operating Days vs Revenues - {geounit}')
    ax1.set_xlabel('Operating Days')
    ax1.set_ylabel('Revenue')
    ax1.legend()

    # Add correlation coefficients and p-values, stacked in the top-left corner
    for (column, label, _color), y_pos in zip(revenue_series, (0.95, 0.85)):
        ax1.annotate(
            _correlation_text(label, data['Operating_Days'], data[column]),
            xy=(0.05, y_pos), xycoords='axes fraction', ha='left', va='top',
        )

    # Monthly time series plot
    _plot_dual_axis_series(
        ax2, data['Date'], data, revenue_series,
        title=f'Monthly Time Series: Operating Days and Revenues - {geounit}',
        xlabel='Date',
    )

    # Quarterly time series plot
    quarterly_data = data.groupby('Quarter').agg({
        'Operating_Days': 'sum',
        'Tickets_Ops_Revenue': 'sum',
        'CIM_RPE_Ops_Service_Revenue': 'sum'
    }).reset_index()

    _plot_dual_axis_series(
        ax3, quarterly_data['Quarter'].astype(str), quarterly_data, revenue_series,
        title=f'Quarterly Time Series: Operating Days and Revenues - {geounit}',
        xlabel='Quarter',
    )
    # Rotate the existing tick labels rather than overwriting them with
    # set_xticklabels, which requires the tick positions to be fixed first.
    ax3.tick_params(axis='x', labelrotation=45)

    plt.tight_layout()
    plt.show()

# Plot analysis for each geounit
for geounit in geounits_to_include:
    plot_geounit_analysis(geounit)

# Overall correlation analysis (monthly): every (Geounit, month) row of
# merged_data is one point, pooled across geounits.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=merged_data, x='Operating_Days', y='Tickets_Ops_Revenue', alpha=0.5, label='Tickets Ops Revenue')
sns.scatterplot(data=merged_data, x='Operating_Days', y='CIM_RPE_Ops_Service_Revenue', alpha=0.5, label='CIM RPE Ops Service Revenue')
plt.title('Overall Monthly Scatter Plot: Operating Days vs Revenues')
plt.xlabel('Operating Days')
plt.ylabel('Revenue')
plt.legend()

# Add overall correlation coefficients and p-values
corr1, p_value1 = stats.pearsonr(merged_data['Operating_Days'], merged_data['Tickets_Ops_Revenue'])
corr2, p_value2 = stats.pearsonr(merged_data['Operating_Days'], merged_data['CIM_RPE_Ops_Service_Revenue'])
plt.annotate(f'Tickets Ops Revenue Correlation: {corr1:.2f}\np-value: {p_value1:.4f}',
             xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
plt.annotate(f'CIM RPE Ops Service Revenue Correlation: {corr2:.2f}\np-value: {p_value2:.4f}',
             xy=(0.05, 0.85), xycoords='axes fraction', ha='left', va='top')

plt.tight_layout()
plt.show()

# Quarterly overall analysis: sum the monthly rows per quarter and geounit
quarterly_data = merged_data.groupby(['Quarter', 'Geounit']).agg({
    'Operating_Days': 'sum',
    'Tickets_Ops_Revenue': 'sum',
    'CIM_RPE_Ops_Service_Revenue': 'sum'
}).reset_index()

# Reshape to long form so a single scatterplot call can encode the geounit as
# colour (hue) and the revenue type as marker (style). Two separate calls that
# each pass hue= and label= add every geounit to the legend twice.
revenue_labels = {
    'Tickets_Ops_Revenue': 'Tickets Ops Revenue',
    'CIM_RPE_Ops_Service_Revenue': 'CIM RPE Ops Service Revenue',
}
quarterly_long = quarterly_data.melt(
    id_vars=['Quarter', 'Geounit', 'Operating_Days'],
    value_vars=list(revenue_labels),
    var_name='Revenue Type',
    value_name='Revenue',
)
quarterly_long['Revenue Type'] = quarterly_long['Revenue Type'].map(revenue_labels)

plt.figure(figsize=(12, 6))
sns.scatterplot(
    data=quarterly_long,
    x='Operating_Days',
    y='Revenue',
    hue='Geounit',
    style='Revenue Type',
    # Same markers as before: circles for tickets, squares for CIM RPE.
    markers={'Tickets Ops Revenue': 'o', 'CIM RPE Ops Service Revenue': 's'},
    alpha=0.7,
)
plt.title('Quarterly Overall Scatter Plot: Operating Days vs Revenues')
plt.xlabel('Operating Days')
plt.ylabel('Revenue')

# Add overall quarterly correlation coefficients and p-values (computed on the
# wide frame, one row per quarter and geounit)
corr1, p_value1 = stats.pearsonr(quarterly_data['Operating_Days'], quarterly_data['Tickets_Ops_Revenue'])
corr2, p_value2 = stats.pearsonr(quarterly_data['Operating_Days'], quarterly_data['CIM_RPE_Ops_Service_Revenue'])
plt.annotate(f'Tickets Ops Revenue Correlation: {corr1:.2f}\np-value: {p_value1:.4f}',
             xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
plt.annotate(f'CIM RPE Ops Service Revenue Correlation: {corr2:.2f}\np-value: {p_value2:.4f}',
             xy=(0.05, 0.85), xycoords='axes fraction', ha='left', va='top')

# Place the legend outside the axes. No overall title is set because the
# legend already carries the "Geounit" and "Revenue Type" group headings.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

print("Analysis complete.")