In [None]:
import pandas as pd
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.revenue_data_preprocessing import preprocess_tickets_data

# Read the raw tickets export and run it through the project's preprocessing
# helper in one step; `df` only ever refers to the preprocessed frame.
df = preprocess_tickets_data(
    pd.read_csv('../raw_data/global_tickets_wles_ops_data.csv')
)

# Columns that together identify one group in the aggregated view.
group_keys = [
    'Adjusted Date', 'Sl Geounit (Code)', 'Country Name', 'Activity ID', 'Rig Name',
    'Rig type', 'Well type', 'Well Operating Environment', 'Billing Account', 'Rig environment',
]

# groupby() omits every row whose grouping key contains a missing value, so
# such rows would disappear from the totals without any notice. Count them up
# front and report them; the aggregation itself is left unchanged.
rows_missing_key = int(df[group_keys].isna().any(axis=1).sum())
if rows_missing_key:
    print(
        f"Warning: {rows_missing_key} of {len(df)} rows have a missing grouping key "
        "and are excluded from the grouped data."
    )

# Per group: sum of 'Field Ticket USD net value' and number of distinct
# 'Well Name' values.
grouped_data = df.groupby(group_keys).agg({
    'Field Ticket USD net value': 'sum',
    'Well Name': 'nunique'
}).reset_index()

# After aggregation 'Well Name' holds a distinct count, so rename it accordingly.
grouped_data = grouped_data.rename(columns={'Well Name': 'Well_Count'})

# Preview the first ten rows of the grouped dataset.
print("First few rows of the Grouped Data (with Well Count):")
print(grouped_data.head(10))

# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset Info:")
grouped_data.info()

# Collect every grouped row whose Activity ID appears on more than one row
# (keep=False marks all members of a repeated ID, not only the later ones).
repeated_id_mask = grouped_data.duplicated(subset=['Activity ID'], keep=False)
duplicate_activities = grouped_data.loc[repeated_id_mask]

print("\nNumber of rows with duplicate Activity IDs:", duplicate_activities.shape[0])

if duplicate_activities.empty:
    print("\nNo duplicate Activity IDs found in the grouped data.")
else:
    print("\nExample of duplicate Activity IDs:")
    print(duplicate_activities.head(10))

    # Occurrences per Activity ID; value_counts() orders them most frequent first.
    activity_counts = grouped_data['Activity ID'].value_counts()
    print("\nTop 10 Activity IDs by occurrence count:")
    print(activity_counts.head(10))

    # The first index label is therefore the most frequent Activity ID;
    # list every grouped row that carries it.
    most_common_activity = activity_counts.index[0]
    rows_for_most_common = grouped_data.loc[
        grouped_data['Activity ID'] == most_common_activity
    ]
    print(f"\nDetails for the most common Activity ID ({most_common_activity}):")
    print(rows_for_most_common)

# Descriptive statistics as produced by DataFrame.describe().
summary_stats = grouped_data.describe()
print("\nSummary Statistics:")
print(summary_stats)

# Number of missing entries in each column of the grouped data.
missing_per_column = grouped_data.isna().sum()
print("\nMissing Values:")
print(missing_per_column)