In [6]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Open the reference Excel workbook; a later cell reads its sheet names to
# derive the list of platforms used when synthesizing the dataset.
# NOTE(review): absolute, environment-specific path — adjust when running elsewhere.
excel_file_path = '/workspaces/UnderWater-Decision/data/mf-data.xlsx'
# The ExcelFile handle is kept in `xls` for later cells; it is not closed in this notebook.
xls = pd.ExcelFile(excel_file_path)


In [7]:

# Every sheet in the workbook is treated as a platform name...
platform_names = xls.sheet_names
# ...except these sheets, which are filtered out by exact name match.
non_platform_sheets = ['Explanation', 'Location', 'All', 'Sheet9']
platforms = [name for name in platform_names if name not in non_platform_sheets]

# Value pools sampled by generate_synthetic_data (module-level globals).
# `structures` is an alias of `platforms` (same list object, not a copy).
structures = platforms
locations = ['North Sea']  # Assuming all structures are in the North Sea
# Placeholder method labels; assigned at random, not derived from fouling values.
cleaning_methods = ['Method A', 'Method B', 'Method C']


In [8]:

# Define a function to generate a random date for last cleaning
def generate_random_date(start_year=2015, end_year=2021):
    """Return a random ``datetime.date`` for the last-cleaning field.

    The date is drawn uniformly (by whole days) from January 1 of
    ``start_year`` up to and including January 1 of ``end_year``; later
    days of ``end_year`` are never produced.

    Parameters:
    - start_year (int): Year whose January 1 is the earliest possible date.
    - end_year (int): Year whose January 1 is the latest possible date.

    Returns:
    - datetime.date: The randomly selected date.
    """
    first_day = datetime(start_year, 1, 1)
    # Number of whole days between the two January-1 anchors
    span_days = (datetime(end_year, 1, 1) - first_day).days
    offset = timedelta(days=random.randint(0, span_days))
    return (first_day + offset).date()

# Function to generate synthetic dataset
def generate_synthetic_data(num_entries):
    """Build a DataFrame of randomly generated structure-inspection rows.

    Each row samples from the module-level ``structures``, ``locations`` and
    ``cleaning_methods`` lists and calls ``generate_random_date`` for the
    last-cleaning date. Algae, barnacle and mussel coverage are drawn
    independently (0-60, 0-40, 0-30); the reported total is their sum capped
    at 100, so the three components can add up to more than the total.

    Parameters:
    - num_entries (int): Number of rows to generate.

    Returns:
    - pandas.DataFrame: One row per entry; empty when ``num_entries`` is 0.
    """
    def _random_row():
        # The ID is a zero-padded number in 001-999; it is not guaranteed unique.
        sid = f"{random.randint(1, 999):03}"
        row = {
            "Structure_ID": sid,
            "Structure_Type": random.choice(structures),
            "Location": random.choice(locations),
            "Age (years)": random.randint(1, 20),
            "Last_Cleaning": generate_random_date(),
            "Image_Path": f"/path/img{sid}",
        }

        # Per-organism coverage percentages
        algae = random.randint(0, 60)
        barnacles = random.randint(0, 40)
        mussels = random.randint(0, 30)
        row["Detected_Algae (%)"] = algae
        row["Detected_Barnacles (%)"] = barnacles
        row["Detected_Mussels (%)"] = mussels

        # Cap the summed coverage at 100%
        row["Total_Coverage (%)"] = min(algae + barnacles + mussels, 100)
        row["Recommended_Cleaning_Method"] = random.choice(cleaning_methods)
        return row

    return pd.DataFrame([_random_row() for _ in range(num_entries)])

In [10]:

# Generate 1000 synthetic rows and write them to CSV (without the index column).
# NOTE(review): no random seed is set, so each run produces a different file.
synthetic_dataset = generate_synthetic_data(1000)
synthetic_dataset_path = '/workspaces/UnderWater-Decision/data/synthetic_dataset1.csv'
synthetic_dataset.to_csv(synthetic_dataset_path, index=False)

# Last expression of the cell: displays the path of the generated CSV file
synthetic_dataset_path


'/workspaces/UnderWater-Decision/data/synthetic_dataset1.csv'

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Value pools for the synthetic rows, modelled on the column layout of the
# provided data. These rebind the `structures`, `locations` and
# `cleaning_methods` globals set by earlier cells.
structures = ['Oil Rig', 'Wind Turbine', 'Underwater Pipeline', 'Ship Hull']
locations = ['North Sea', 'Gulf of Mexico', 'Baltic Sea', 'Pacific Ocean']
# Placeholder method labels; assigned at random by the generator.
cleaning_methods = ['Method A', 'Method B', 'Method C']
# Candidate values for the "item" column.
items = ['Level 1', 'Risers', 'Caissons', 'Conductors']
# Candidate values for the "year" column.
years = [2010, 2012, 2013, 2015]

# Define a function to generate a random date for last cleaning
def generate_random_date(start_year=2015, end_year=2021):
    """Draw a random date between two January-1 anchors.

    The result lies between January 1 of ``start_year`` and January 1 of
    ``end_year``, both inclusive, chosen uniformly by day offset.

    Parameters:
    - start_year (int): Year of the earliest possible date (its January 1).
    - end_year (int): Year of the latest possible date (its January 1).

    Returns:
    - datetime.date: The sampled date.
    """
    window_start = datetime(start_year, 1, 1)
    window_end = datetime(end_year, 1, 1)
    # Pick how many whole days past the window start the result falls
    days_in = random.randint(0, (window_end - window_start).days)
    sampled = window_start + timedelta(days=days_in)
    return sampled.date()

# Function to generate synthetic dataset
def generate_synthetic_data(num_entries):
    """Generate random marine-growth survey rows in the real data's column layout.

    Values are sampled from the module-level ``structures``, ``years``,
    ``items`` and ``cleaning_methods`` lists. Growth percentages and
    thicknesses are independent uniform floats; hard and soft percentages are
    not constrained to sum to at most 100.

    Parameters:
    - num_entries (int): Number of rows to generate.

    Returns:
    - pandas.DataFrame: Columns platform, year, depthmin, depthmax, item,
      hardPerc, hardmm, softPerc, softmm, Recommended_Cleaning_Method.
      Empty when ``num_entries`` is 0.
    """
    synthetic_data = []

    for _ in range(num_entries):
        structure_type = random.choice(structures)
        year = random.choice(years)
        # Depths are negative (below the water line), drawn from [-50, 0]
        depthmin = random.uniform(-50, 0)
        # Drawn from [depthmin, 0], so depthmax >= depthmin numerically
        depthmax = random.uniform(depthmin, 0)
        item = random.choice(items)
        hard_perc = random.uniform(0, 100)  # Percentage of hard growth
        hard_mm = random.uniform(0, 60)  # Thickness of hard growth in mm
        soft_perc = random.uniform(0, 100)  # Percentage of soft growth
        soft_mm = random.uniform(0, 40)  # Thickness of soft growth in mm
        # The method label is random; it is not derived from the growth values
        recommended_cleaning_method = random.choice(cleaning_methods)

        synthetic_data.append({
            "platform": structure_type,
            "year": year,
            "depthmin": depthmin,
            "depthmax": depthmax,
            "item": item,
            "hardPerc": hard_perc,
            "hardmm": hard_mm,
            "softPerc": soft_perc,
            "softmm": soft_mm,
            "Recommended_Cleaning_Method": recommended_cleaning_method
        })

    return pd.DataFrame(synthetic_data)

# Generate a small 10-row sample with the generator defined above
synthetic_dataset = generate_synthetic_data(10)

# Write the sample to CSV without the index column.
# NOTE(review): this path is under /mnt/data, unlike the /workspaces/... paths
# used by the other cells — confirm the directory exists in your environment.
synthetic_dataset_path = '/mnt/data/synthetic_dataset.csv'
synthetic_dataset.to_csv(synthetic_dataset_path, index=False)


In [12]:
import pandas as pd
import os

def export_excel_sheets_to_csv(excel_path, output_dir):
    """Export every sheet of an Excel workbook to its own CSV file.

    Each sheet is written to ``<output_dir>/<sheet_name>.csv`` without the
    index column. An existing CSV with the same name is overwritten.

    Parameters:
    - excel_path (str): Path to the Excel workbook to read.
    - output_dir (str): Directory for the CSV files; created if missing.

    Returns:
    - list[str]: Paths of the CSV files written by this call, in workbook
      sheet order. Other files already present in ``output_dir`` are not
      included.
    """
    exported_paths = []

    # The context manager closes the workbook handle even if an export fails
    with pd.ExcelFile(excel_path) as xls:
        # Create directory for CSV files if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        for sheet_name in xls.sheet_names:
            # Read the sheet into a pandas dataframe
            df = pd.read_excel(xls, sheet_name=sheet_name)

            # Save the dataframe to a CSV named after the sheet
            csv_file_path = os.path.join(output_dir, f'{sheet_name}.csv')
            df.to_csv(csv_file_path, index=False)
            exported_paths.append(csv_file_path)

    return exported_paths

# Usage: export every sheet of the reference workbook to its own CSV file.
# NOTE(review): both paths are absolute and environment-specific.
excel_file_path = '/workspaces/UnderWater-Decision/data/mf-data.xlsx'  # Replace with your Excel file path
csv_directory = '/workspaces/UnderWater-Decision/data/excel'   # Replace with your desired output directory path
exported_files_paths = export_excel_sheets_to_csv(excel_file_path, csv_directory)

# Print the list of paths returned by the export
print(exported_files_paths)


['/workspaces/UnderWater-Decision/data/excel/L10-BB.csv', '/workspaces/UnderWater-Decision/data/excel/D15-A.csv', '/workspaces/UnderWater-Decision/data/excel/L10-L.csv', '/workspaces/UnderWater-Decision/data/excel/L10-AC.csv', '/workspaces/UnderWater-Decision/data/excel/L10-AR.csv', '/workspaces/UnderWater-Decision/data/excel/L10-D.csv', '/workspaces/UnderWater-Decision/data/excel/K12-C.csv', '/workspaces/UnderWater-Decision/data/excel/Sheet9.csv', '/workspaces/UnderWater-Decision/data/excel/L10-E.csv', '/workspaces/UnderWater-Decision/data/excel/L5-A.csv', '/workspaces/UnderWater-Decision/data/excel/K12-G.csv', '/workspaces/UnderWater-Decision/data/excel/F3-1A.csv', '/workspaces/UnderWater-Decision/data/excel/G16-B.csv', '/workspaces/UnderWater-Decision/data/excel/L10-M.csv', '/workspaces/UnderWater-Decision/data/excel/K12-BP.csv', '/workspaces/UnderWater-Decision/data/excel/L10-AD.csv', '/workspaces/UnderWater-Decision/data/excel/L10-EE.csv', '/workspaces/UnderWater-Decision/data/exc

In [5]:
import random
from datetime import datetime, timedelta
from typing import Union

import pandas as pd
import numpy as np

# Load the CSV exported from the workbook's "All" sheet; it supplies the value
# pools (platforms, years, depths, items) and thickness maxima for the generator.
# NOTE(review): absolute, environment-specific path.
csv_file_path = '/workspaces/UnderWater-Decision/data/excel/All.csv'
real_data_df = pd.read_csv(csv_file_path)


# Thresholds used by determine_cleaning_method.
# Percentages are 0-100; thicknesses are in millimeters.
HARD_PERCENTAGE_THRESHOLD = 50
HARD_THICKNESS_THRESHOLD = 30
SOFT_PERCENTAGE_THRESHOLD = 50
SOFT_THICKNESS_THRESHOLD = 50

# Depth bands (meters, negative below sea level) and coverage settings used by
# area_coverage_by_fouling_and_depth.
SHALLOW_DEPTH_UPPER_BOUND = 0
SHALLOW_DEPTH_LOWER_BOUND = -10
MID_DEPTH_UPPER_BOUND = -25
MID_DEPTH_LOWER_BOUND = -35
DEEP_DEPTH_LOWER_BOUND = -40
# Cap applied to the coverage returned for the mid-depth band.
MAX_COVERAGE_PERCENTAGE = 90
# (low, high) pairs unpacked into the random integer draw for each case.
SHALLOW_DEPTH_COVERAGE_RANGE = (5, 11)
DEEP_DEPTH_COVERAGE_RANGE = (70, 91)
GENERIC_COVERAGE_RANGE = (5, 91)


def determine_cleaning_method(hard_perc, hard_mm, soft_perc, soft_mm):
    """
    Pick a cleaning method from hard/soft fouling coverage and thickness.

    The rules are evaluated in order and the first match wins:
    1. hard coverage >= 75 or hard thickness >= 50 -> mechanical cleaning
    2. soft coverage >= 75 or soft thickness >= SOFT_THICKNESS_THRESHOLD
       -> high-pressure water jetting
    3. hard and soft coverage both at or above their percentage thresholds
       -> cavitation water jetting
    4. hard coverage and hard thickness both below their thresholds
       -> ultrasonic cleaning
    5. anything else -> laser cleaning

    Parameters:
    - hard_perc (int): Percentage of hard fouling.
    - hard_mm (int): Thickness of hard fouling in millimeters.
    - soft_perc (int): Percentage of soft fouling.
    - soft_mm (int): Thickness of soft fouling in millimeters.

    Returns:
    - str: The recommended cleaning method.
    """
    if hard_perc >= 75 or hard_mm >= 50:
        # Severe hard fouling
        method = 'Mechanical cleaning methods'
    elif soft_perc >= 75 or soft_mm >= SOFT_THICKNESS_THRESHOLD:
        # Severe soft fouling
        method = 'High-pressure water jetting'
    elif hard_perc >= HARD_PERCENTAGE_THRESHOLD and soft_perc >= SOFT_PERCENTAGE_THRESHOLD:
        # Significant hard and soft fouling together
        method = 'Cavitation water jetting'
    elif hard_perc < HARD_PERCENTAGE_THRESHOLD and hard_mm < HARD_THICKNESS_THRESHOLD:
        # Minor hard fouling without significant thickness
        method = 'Ultrasonic cleaning'
    else:
        # Fallback for every remaining combination
        method = 'Laser cleaning'

    return method


def area_coverage_by_fouling_and_depth(hard_perc: int, soft_perc: int, depth: int) -> int:
    """
    Calculate the area coverage by fouling at a given depth.

    The depth bands are checked in order (shallow, mid, deep, default) and the
    first match determines the result. Random draws treat the configured
    ``*_COVERAGE_RANGE`` pairs as half-open ``[low, high)`` intervals, so the
    result never exceeds ``high - 1`` (90 for the deep and generic ranges).

    Args:
    hard_perc (int): The percentage of hard fouling.
    soft_perc (int): The percentage of soft fouling.
    depth (int): The depth in meters (negative for below sea level).

    Returns:
    int: The fouling coverage percentage.
    """
    # Handle shallow depths with less coverage due to wave action and cleaning
    if SHALLOW_DEPTH_LOWER_BOUND <= depth <= SHALLOW_DEPTH_UPPER_BOUND:
        # randrange excludes the upper bound (randint would include it)
        return random.randrange(*SHALLOW_DEPTH_COVERAGE_RANGE)

    # Handle mid-range depths: the larger fouling percentage, capped at the maximum
    if MID_DEPTH_LOWER_BOUND <= depth <= MID_DEPTH_UPPER_BOUND:
        return min(max(hard_perc, soft_perc), MAX_COVERAGE_PERCENTAGE)

    # Handle deeper layers with higher coverage, but less than the peak range.
    # NOTE(review): this test also matches depths between the shallow and mid
    # bands (below SHALLOW_DEPTH_LOWER_BOUND, above MID_DEPTH_UPPER_BOUND),
    # because they are >= DEEP_DEPTH_LOWER_BOUND too — confirm this is intended.
    if depth >= DEEP_DEPTH_LOWER_BOUND:
        return random.randrange(*DEEP_DEPTH_COVERAGE_RANGE)

    # Default case (depths below DEEP_DEPTH_LOWER_BOUND): the larger fouling
    # percentage, capped by a random draw from the generic range
    return min(max(hard_perc, soft_perc), random.randrange(*GENERIC_COVERAGE_RANGE))

    
# Modify the synthetic data generation function to ensure all growth values are integers
def generate_synthetic_data(real_df, num_entries):
    """Generate synthetic marine-growth rows from the value pools of real data.

    Platforms, years, depths and items are sampled from the unique values in
    ``real_df``. Growth values are integers: ``hardPerc`` is drawn from 0-100,
    ``softPerc`` from 0 to ``100 - hardPerc``, and thicknesses from 0 up to
    the maximum observed in ``real_df``. Total coverage and the recommended
    cleaning method are derived via ``area_coverage_by_fouling_and_depth``
    and ``determine_cleaning_method``.

    Parameters:
    - real_df (pandas.DataFrame): Real data with columns ``platform``,
      ``year``, ``depthmin``, ``depthmax``, ``Item``, ``hardmm``, ``softmm``.
    - num_entries (int): Number of rows to generate.

    Returns:
    - pandas.DataFrame: One synthetic row per entry; empty when
      ``num_entries`` is 0.

    Raises:
    - ValueError: If ``hardmm`` or ``softmm`` has no non-null values, since
      the maximum cannot then be converted to an integer.
    """
    platforms = real_df['platform'].unique()
    years = real_df['year'].unique()
    depthmins = real_df['depthmin'].dropna().astype(int).unique()
    depthmaxs = real_df['depthmax'].dropna().astype(int).unique()
    items = real_df['Item'].unique()

    # Loop-invariant exclusive upper bounds for the thickness draws; computed
    # once instead of rescanning the columns on every iteration
    hard_mm_upper = int(real_df['hardmm'].dropna().max()) + 1
    soft_mm_upper = int(real_df['softmm'].dropna().max()) + 1

    synthetic_data = []
    for _ in range(num_entries):
        platform = np.random.choice(platforms)
        year = int(np.random.choice(years))
        # NOTE(review): depthmin and depthmax are sampled independently, so a
        # row's depthmin is not guaranteed to be <= its depthmax.
        depthmin = np.random.choice(depthmins)
        depthmax = np.random.choice(depthmaxs)
        item = np.random.choice(items)
        hard_perc = np.random.randint(0, 101)
        hard_mm = np.random.randint(0, hard_mm_upper)
        # Keeps hard + soft coverage at or below 100
        soft_perc = np.random.randint(0, 101 - hard_perc)
        soft_mm = np.random.randint(0, soft_mm_upper)
        # Use the average depth (floor division) for coverage calculation
        avg_depth = (depthmin + depthmax) // 2
        total_area_coverage = area_coverage_by_fouling_and_depth(hard_perc, soft_perc, avg_depth)

        cleaning_method = determine_cleaning_method(hard_perc, hard_mm, soft_perc, soft_mm)

        synthetic_data.append({
            "platform": platform,
            "year": year,
            "depthmin": depthmin,
            "depthmax": depthmax,
            "item": item,
            "hardPerc": hard_perc,
            "hardmm": hard_mm,
            "softPerc": soft_perc,
            "softmm": soft_mm,
            "Total_Area_Coverage": total_area_coverage,
            "Recommended_Cleaning_Method": cleaning_method
        })

    return pd.DataFrame(synthetic_data)

# Generate a synthetic dataset with 1000 entries based on the real CSV data.
# NOTE(review): no random seed is set, so each run produces different rows.
synthetic_dataset = generate_synthetic_data(real_data_df, 1000)

# Save the DataFrame to a CSV file (without the index column)
synthetic_dataset_path = '/workspaces/UnderWater-Decision/data/synthetic_dataset2.csv'
synthetic_dataset.to_csv(synthetic_dataset_path, index=False)
