In [None]:
# 05_explore_journal_operatingtime.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.journal_data_preprocessing import preprocess_journal_data

# Load the raw journal operating-time export. The path is relative to the
# notebook's working directory.
print("Loading data...")
df = pd.read_csv('../raw_data/global_journal_operatingtime.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Original Dataset Info:")
df.info()

# Run the project's preprocessing helper (utils.journal_data_preprocessing).
# The cells below read the 'Value', 'OA Start', 'OA End' and 'Geounit'
# columns from its result.
print("Applying preprocessing function...")
processed_df = preprocess_journal_data(df)

print("\nProcessed Dataset Info:")
processed_df.info()

# Display the first few rows of the processed dataset
print("\nFirst few rows of the Processed Data:")
print(processed_df.head())

# Basic statistics of the processed data (numeric columns by default)
print("\nSummary Statistics:")
print(processed_df.describe())

# Per-column count of missing values
print("\nMissing Values:")
print(processed_df.isnull().sum())

# Histogram with a KDE overlay of the 'Value' column (labelled as total days).
# sns.histplot draws on the current axes and returns them, so the labels are
# set on that Axes object rather than through pyplot's implicit state.
plt.figure(figsize=(10, 6))
value_hist_ax = sns.histplot(processed_df['Value'], kde=True)
value_hist_ax.set_title('Distribution of Total Days per Activity')
value_hist_ax.set_xlabel('Total Days')
value_hist_ax.set_ylabel('Frequency')
plt.show()

# Compare the length of each row's OA window with its 'Value'.
# The window is 'OA End' minus 'OA Start'; the .dt accessor below requires
# that difference to be a timedelta series. It is stored on processed_df as
# 'OA Duration', expressed in fractional days.
SECONDS_PER_DAY = 24 * 3600
oa_window = processed_df['OA End'] - processed_df['OA Start']
processed_df['OA Duration'] = oa_window.dt.total_seconds() / SECONDS_PER_DAY

# Scatter plot: OA duration on the x-axis, 'Value' on the y-axis
plt.figure(figsize=(10, 6))
duration_days = processed_df['OA Duration']
total_days = processed_df['Value']
plt.scatter(duration_days, total_days)
plt.title('Relationship between OA Duration and Total Days')
plt.xlabel('OA Duration (days)')
plt.ylabel('Total Days')
plt.show()

# Monthly time series: bucket every row by the calendar month of its
# 'OA Start' and sum 'Value' within each bucket.
start_month = processed_df['OA Start'].dt.to_period('M')
monthly_operating_days = processed_df.groupby(start_month)['Value'].sum()

# Line plot of the monthly totals on a fresh, wider figure
plt.figure(figsize=(12, 6))
monthly_operating_days.plot()
plt.title('Monthly Total Operating Days')
plt.xlabel('Month')
plt.ylabel('Total Operating Days')
plt.tight_layout()
plt.show()

# Per-Geounit summary of 'Value': mean, median, min and max, ordered so the
# Geounits with the highest mean come first.
value_by_geounit = processed_df.groupby('Geounit')['Value']
geounit_stats = value_by_geounit.agg(['mean', 'median', 'min', 'max'])
geounit_stats = geounit_stats.sort_values('mean', ascending=False)
print("\nValue Statistics by Geounit:")
print(geounit_stats)

# Bar chart of the mean 'Value' for the ten highest-ranked Geounits
top_geounits = geounit_stats.head(10)
plt.figure(figsize=(12, 6))
top_geounit_means = top_geounits['mean']
top_geounit_means.plot(kind='bar')
plt.title('Top 10 Geounits by Average Total Days')
plt.xlabel('Geounit')
plt.ylabel('Average Total Days')
# Rotate and right-align the tick labels so long Geounit names stay legible
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Print the ten rows with the largest and the ten with the smallest 'Value'.
# Each heading is paired with the DataFrame selector that produces its table.
extreme_reports = (
    ("\nTop 10 Activities by Total Days:", processed_df.nlargest),
    ("\nBottom 10 Activities by Total Days:", processed_df.nsmallest),
)
for heading, select_rows in extreme_reports:
    print(heading)
    print(select_rows(10, 'Value'))

# Histogram with a KDE overlay of the 'OA Duration' column (in days). That
# column must already have been added to processed_df before this runs.
plt.figure(figsize=(10, 6))
duration_hist_ax = sns.histplot(processed_df['OA Duration'], kde=True)
duration_hist_ax.set_title('Distribution of OA Durations')
duration_hist_ax.set_xlabel('OA Duration (days)')
duration_hist_ax.set_ylabel('Frequency')
plt.show()

# Summary figures relating 'Value' to 'OA Duration'
value_col = processed_df['Value']
duration_col = processed_df['OA Duration']

# Correlation between the two columns (Series.corr default method)
print("\nCorrelation between Value and OA Duration:")
value_duration_corr = value_col.corr(duration_col)
print(value_duration_corr)

# Share of rows, in percent, where 'Value' is strictly greater than
# 'OA Duration'. The mean of a boolean mask is the fraction of True entries.
print("\nPercentage of activities where Value exceeds OA Duration:")
exceeds_mask = value_col > duration_col
print(exceeds_mask.mean() * 100)

# NOTE(review): this reports the row count of processed_df. It equals the
# number of unique Activity IDs only if preprocessing leaves exactly one row
# per activity - confirm against preprocess_journal_data.
print(f"\nTotal number of unique Activity IDs: {len(processed_df)}")

# Additional analyses can be added here based on specific requirements and insights gained from the exploration