In [3]:
# 05_explore_journal_operatingtime.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.journal_data_preprocessing import preprocess_journal_data

# Load the raw journal operating-time export from the sibling raw_data directory.
print("Loading data...")
df = pd.read_csv('../raw_data/global_journal_operatingtime.csv')

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
print("Original Dataset Info:")
df.info()

# Run the shared preprocessing helper on the raw frame and unpack its result.
# The helper is expected to hand back a 2-tuple; anything else is reported and
# turned into a ValueError. Every failure is logged once and then re-raised so
# the cell stops instead of continuing with undefined names.
print("Applying preprocessing function...")
try:
    result = preprocess_journal_data(df)
    returned_tuple = isinstance(result, tuple)
    print("Type of result:", type(result))
    print("Length of result:", len(result) if returned_tuple else "Not a tuple")

    # Guard clause: bail out early when the shape of the result is not (a, b).
    if not (returned_tuple and len(result) == 2):
        print("\nUnexpected return value from preprocess_journal_data.")
        print("Result:", result)
        raise ValueError("Preprocessing function did not return expected values.")

    df_processed, simplified_result = result
    print("\nPreprocessing successful.")
except Exception as e:
    print(f"An error occurred during preprocessing: {e!s}")
    raise

# Structural summary (columns, dtypes, non-null counts) of both preprocessing outputs.
# .info() prints its own report and returns None, so it is invoked directly;
# print(x.info()) would emit an extra "None" line after each report.
# NOTE(review): assumes both objects are pandas DataFrames - confirm against
# preprocess_journal_data.
print("\nPreprocessed Dataset Info:")
df_processed.info()

print("\nSimplified Result Dataset Info:")
simplified_result.info()

# Verify that the data is sorted correctly: within every 'Activity ID' group the
# 'Journal Activity start time' values must be non-decreasing.
print("\nVerifying sort order:")
start_times_by_activity = df_processed.groupby('Activity ID')['Journal Activity start time']
is_sorted = start_times_by_activity.is_monotonic_increasing.all()
print(f"Data is properly sorted: {is_sorted}")

# Each report below is emitted as "<heading>\n<table>" through a single print
# call; sep="\n" puts the table on the line after its heading.

# Leading rows of both frames
print("\nFirst few rows of the Processed Data:", df_processed.head(), sep="\n")

print("\nFirst few rows of the Simplified Result:", simplified_result.head(), sep="\n")

# Descriptive statistics of both frames
print("\nSummary Statistics of Processed Data:", df_processed.describe(), sep="\n")

print("\nSummary Statistics of Simplified Result:", simplified_result.describe(), sep="\n")

# Per-column count of missing values in both frames
print("\nMissing Values in Processed Data:", df_processed.isnull().sum(), sep="\n")

print("\nMissing Values in Simplified Result:", simplified_result.isnull().sum(), sep="\n")

# Histogram with a KDE overlay of the 'Value' column (labelled as total days)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(simplified_result['Value'], kde=True, ax=ax)
ax.set_title('Distribution of Total Days per Activity')
ax.set_xlabel('Total Days')
ax.set_ylabel('Frequency')
plt.show()

# Analyze the relationship between 'OA End' - 'OA Start' and 'Value'.
# Both endpoints are coerced to datetime BEFORE the subtraction: the arithmetic
# and the .dt accessor below require datetime-like data, and pd.to_datetime is a
# no-op for columns that already are. The source columns are left untouched here;
# only the derived 'OA Duration' column is added to simplified_result.
oa_start = pd.to_datetime(simplified_result['OA Start'])
oa_end = pd.to_datetime(simplified_result['OA End'])
seconds_per_day = 24 * 3600
simplified_result['OA Duration'] = (oa_end - oa_start).dt.total_seconds() / seconds_per_day  # Convert to days

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(simplified_result['OA Duration'], simplified_result['Value'])
ax.set_title('Relationship between OA Duration and Total Days')
ax.set_xlabel('OA Duration (days)')
ax.set_ylabel('Total Days')
plt.show()

# Mean 'Value' per 'Sl Geounit (Code)', largest first, drawn as a bar chart
geounit_avg_days = (
    df_processed
    .groupby('Sl Geounit (Code)')['Value']
    .mean()
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(12, 6))
geounit_avg_days.plot(kind='bar', ax=ax)
ax.set_title('Average Total Days by Geounit')
ax.set_xlabel('Geounit')
ax.set_ylabel('Average Total Days')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Number of rows per distinct 'Journal Activity', drawn as a bar chart
activity_counts = df_processed['Journal Activity'].value_counts()
fig, ax = plt.subplots(figsize=(12, 6))
activity_counts.plot(kind='bar', ax=ax)
ax.set_title('Distribution of Journal Activities')
ax.set_xlabel('Journal Activity')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

# Time series of operating time: sum of 'Value' per calendar month of 'OA Start'
simplified_result['OA Start'] = pd.to_datetime(simplified_result['OA Start'])
start_month = simplified_result['OA Start'].dt.to_period('M')
monthly_operating_days = simplified_result.groupby(start_month)['Value'].sum()

fig, ax = plt.subplots(figsize=(12, 6))
monthly_operating_days.plot(ax=ax)
ax.set_title('Monthly Total Operating Days')
ax.set_xlabel('Month')
ax.set_ylabel('Total Operating Days')
fig.tight_layout()
plt.show()

# Rows of the simplified result with the 10 largest and 10 smallest 'Value'
top_by_value = simplified_result.nlargest(10, 'Value')
print("\nTop 10 Activities by Total Days:")
print(top_by_value)

bottom_by_value = simplified_result.nsmallest(10, 'Value')
print("\nBottom 10 Activities by Total Days:")
print(bottom_by_value)

# Histogram with a KDE overlay of the 'OA Duration' column (in days)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(simplified_result['OA Duration'], kde=True, ax=ax)
ax.set_title('Distribution of OA Durations')
ax.set_xlabel('OA Duration (days)')
ax.set_ylabel('Frequency')
plt.show()

# Compare 'Value' with 'OA Duration': their correlation, and the share of rows
# (as a percentage) in which 'Value' is strictly greater than 'OA Duration'.
total_days = simplified_result['Value']
oa_duration = simplified_result['OA Duration']

print("\nCorrelation between Value and OA Duration:")
print(total_days.corr(oa_duration))

print("\nPercentage of activities where Value exceeds OA Duration:")
exceeds_duration = total_days > oa_duration
print(exceeds_duration.mean() * 100)

# Additional analyses can be added here based on specific requirements and insights gained from the exploration

Original Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10708 entries, 0 to 10707
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Sl Geounit (Code)               10708 non-null  object
 1   Job Group code                  10708 non-null  object
 2   Job Type code                   10708 non-null  object
 3   Activity ID                     10708 non-null  object
 4   Journal Ops Event ID            10708 non-null  object
 5   Journal Activity                10708 non-null  object
 6   Journal Activity start time     10708 non-null  object
 7   Journal Activity end time       10708 non-null  object
 8   Journal Activity duration, hrs  10708 non-null  int64 
dtypes: int64(1), object(8)
memory usage: 753.0+ KB
None


ValueError: too many values to unpack (expected 2)