# **DATA ACQUISITION AND PREPROCESSING** 

In [1]:
import math
import random
from pathlib import Path

import numpy as np
import pandas as pd

# One seed shared by both random number generators so every run of the
# notebook reproduces the same simulated faults. Change it here only.
SEED = 60
random.seed(SEED)     # Python's `random` module (used by random.choices below)
np.random.seed(SEED)  # NumPy's global random state

#### **SERVICE TOWN, DISTANCE AND TRAVEL TIME DATA**

In [2]:
# Real data, kept as one (town, distance in km, travel time in hours) record
# per service town so the three values of a town stay side by side.
# NOTE(review): 'Fodome' lists 0.65 km with a 0.65 h travel time, which looks
# like a data-entry slip — confirm against the original survey figures.
town_records = [
    ('Gbi - Kledzo', 4.9, 0.267),
    ('Gbi - Wegbe', 6.2, 0.333),
    ('Godenu', 19.2, 0.5),
    ('Gbi - Avege', 10.2, 0.467),
    ('Ve-Dator', 14.3, 0.6),
    ('Ve- Kobenu', 13.7, 0.583),
    ('Golokwati', 21.3, 0.5),
    ('Afadzo South', 38.4, 1.017),
    ('Ve -Gbodome', 25.4, 0.667),
    ('Wuinta', 30.7, 0.783),
    ('Logba', 38.3, 1),
    ('Fume', 38.3, 0.917),
    ('Fodome', 0.65, 0.65),
    ('Liati', 20, 0.55),
    ('Zimugaziwo snake Village', 16.5, 0.467),
    ('Wli', 22.8, 0.483),
    ('Leklebi', 23.5, 0.633),
    ('Gbledi', 1.3, 0.05),
    ('Agome yo', 15.9, 0.517),
    ('Santrokofi', 7.5, 0.333),
    ('Akpafu', 11.9, 0.417),
    ('Lolobi', 10.1, 0.35),
    ('Likpe', 17.2, 0.467),
]

# Column-oriented view of the records, in the layout pd.DataFrame consumes.
towns_data = {
    'service_town': [name for name, _, _ in town_records],
    'distance_in_km': [km for _, km, _ in town_records],
    'travel_time_in_hours': [hours for _, _, hours in town_records],
}

In [3]:
# Tabulate the town data (one row per service town) and preview the first rows.
df_towns = pd.DataFrame(data=towns_data, columns=list(towns_data))
df_towns.head(n=5)

Unnamed: 0,service_town,distance_in_km,travel_time_in_hours
0,Gbi - Kledzo,4.9,0.267
1,Gbi - Wegbe,6.2,0.333
2,Godenu,19.2,0.5
3,Gbi - Avege,10.2,0.467
4,Ve-Dator,14.3,0.6


In [4]:
# Farthest town distance rounded up to a whole kilometre; half of this value
# is the Near/Far cut-off applied in the zone classification.
farthest_km = df_towns["distance_in_km"].max()
max_distance = math.ceil(farthest_km)
max_distance

39

In [5]:
# Zone classification: a town is 'Near' when it lies within half of the
# (rounded-up) maximum distance, otherwise it is 'Far'.
zone_cutoff_km = max_distance / 2
within_cutoff = df_towns['distance_in_km'] <= zone_cutoff_km
df_towns['zone'] = ['Near' if is_near else 'Far' for is_near in within_cutoff]

# Check: full table followed by the number of towns in each zone.
near_town_count = int((df_towns['zone'] == 'Near').sum())
far_town_count = int((df_towns['zone'] == 'Far').sum())
print(df_towns)
print(f"\nNear zone towns: {near_town_count}")
print(f"Far zone towns: {far_town_count}")

                service_town  distance_in_km  travel_time_in_hours  zone
0               Gbi - Kledzo            4.90                 0.267  Near
1                Gbi - Wegbe            6.20                 0.333  Near
2                     Godenu           19.20                 0.500  Near
3                Gbi - Avege           10.20                 0.467  Near
4                   Ve-Dator           14.30                 0.600  Near
5                 Ve- Kobenu           13.70                 0.583  Near
6                  Golokwati           21.30                 0.500   Far
7               Afadzo South           38.40                 1.017   Far
8                Ve -Gbodome           25.40                 0.667   Far
9                     Wuinta           30.70                 0.783   Far
10                     Logba           38.30                 1.000   Far
11                      Fume           38.30                 0.917   Far
12                    Fodome            0.65       

#### **SIMULATING FAULT REPORTS**

In [6]:
# Repair duration, in hours, for each fault type.
fault_types = {
    'Transformer installation': 4.0,
    'Transformer maintenance': 0.75,
    'Cable joining and termination': 0.33,
    'Network line extension': 2.0,
    'Pole replacement': 1.0,
    'Vegetation control': 4.0
}

# Share of simulated faults drawn for each type. Keyed by fault name so a
# probability can never be attached to the wrong type when `fault_types`
# is reordered or extended.
fault_type_shares = {
    'Transformer installation': 0.15,
    'Transformer maintenance': 0.25,
    'Cable joining and termination': 0.20,
    'Network line extension': 0.15,
    'Pole replacement': 0.10,
    'Vegetation control': 0.15
}
assert set(fault_type_shares) == set(fault_types), \
    "every fault type needs exactly one share"

# Distribution percentages, listed in the iteration order of `fault_types`
# (the positional form that random.choices expects for `weights`).
fault_type_weights = [fault_type_shares[name] for name in fault_types]
assert math.isclose(sum(fault_type_weights), 1.0), \
    "fault type shares must sum to 1"

# Priority levels
priority_levels = ['High', 'Normal']
priority_weights = [0.30, 0.70]  # 30% high priority, 70% normal
assert len(priority_weights) == len(priority_levels)
assert math.isclose(sum(priority_weights), 1.0), \
    "priority weights must sum to 1"

# Number of fault reports to simulate
num_faults = 15

# The three draws below share the global `random` state, so their order
# determines the generated dataset.

# One town per fault, drawn uniformly with replacement: a town can therefore
# appear in several fault reports.
town_pool = df_towns['service_town'].tolist()
selected_towns = random.choices(town_pool, k=num_faults)

# One fault type per fault, drawn according to `fault_type_weights`.
fault_type_names = list(fault_types)
selected_fault_types = random.choices(fault_type_names, weights=fault_type_weights, k=num_faults)

# One priority level per fault, drawn according to `priority_weights`.
selected_priorities = random.choices(priority_levels, weights=priority_weights, k=num_faults)

In [9]:
# Create fault dataset: one record per simulated fault, combining the random
# draws with the zone, distance and travel time of the drawn town.

# Index the towns by name once. Keeping only the first row per name gives
# the same result as taking the first matching row for each fault.
towns_by_name = df_towns.drop_duplicates(subset='service_town').set_index('service_town')

faults_data = []
fault_draws = zip(selected_towns, selected_fault_types, selected_priorities)
for position, (town, fault_type, priority) in enumerate(fault_draws):
    town_row = towns_by_name.loc[town]
    faults_data.append({
        'Fault_ID': f'F{position + 1}',  # IDs are 1-based: F1, F2, ...
        'Town': town,
        'Zone': town_row['zone'],
        'Distance_km': town_row['distance_in_km'],
        'Travel_time_hours': round(town_row['travel_time_in_hours'], 3),
        'Fault_type': fault_type,
        'Repair_time_hours': fault_types[fault_type],
        'Priority': priority
    })

# Build the table in one go from the collected records.
df_faults = pd.DataFrame(faults_data)
df_faults

Unnamed: 0,Fault_ID,Town,Zone,Distance_km,Travel_time_hours,Fault_type,Repair_time_hours,Priority
0,F1,Afadzo South,Far,38.4,1.017,Pole replacement,1.0,High
1,F2,Liati,Far,20.0,0.55,Transformer maintenance,0.75,Normal
2,F3,Golokwati,Far,21.3,0.5,Transformer maintenance,0.75,High
3,F4,Agome yo,Near,15.9,0.517,Network line extension,2.0,Normal
4,F5,Logba,Far,38.3,1.0,Network line extension,2.0,Normal
5,F6,Afadzo South,Far,38.4,1.017,Cable joining and termination,0.33,High
6,F7,Akpafu,Near,11.9,0.417,Cable joining and termination,0.33,Normal
7,F8,Agome yo,Near,15.9,0.517,Cable joining and termination,0.33,High
8,F9,Santrokofi,Near,7.5,0.333,Network line extension,2.0,Normal
9,F10,Wli,Far,22.8,0.483,Transformer installation,4.0,High


In [15]:
# Headline counts of the simulated faults, broken down by zone, type and priority.
print("SUMMARY STATISTICS")
print(f"Total faults: {len(df_faults)}")
for heading, column in (("Zone", "Zone"), ("Type", "Fault_type"), ("Priority", "Priority")):
    print(f"\nFaults by {heading}:")
    print(df_faults[column].value_counts())

# Mean travel time per zone (NaN when a zone has no faults).
travel_hours = df_faults['Travel_time_hours']
near_travel_mean = travel_hours[df_faults['Zone'] == 'Near'].mean()
far_travel_mean = travel_hours[df_faults['Zone'] == 'Far'].mean()
print("\nTravel Time Statistics:")
print(f" Average travel time (Near zone): {near_travel_mean:.3f} hours")
print(f" Average travel time (Far zone): {far_travel_mean:.3f} hours")

# Repair time extremes, each labelled with the fault type of the first fault
# that reaches the extreme value.
repair_hours = df_faults['Repair_time_hours']
shortest_repair = repair_hours.min()
longest_repair = repair_hours.max()
shortest_type = df_faults.loc[repair_hours == shortest_repair, 'Fault_type'].iloc[0]
longest_type = df_faults.loc[repair_hours == longest_repair, 'Fault_type'].iloc[0]
print("\nRepair Time Statistics:")
print(f" Average repair time: {repair_hours.mean():.3f} hours")
print(f" Shortest repair: {shortest_repair:.3f} hours ({shortest_type})")
print(f" Longest repair: {longest_repair:.3f} hours ({longest_type})")

SUMMARY STATISTICS
Total faults: 15

Faults by Zone:
Zone
Near    8
Far     7
Name: count, dtype: int64

Faults by Type:
Fault_type
Transformer maintenance          5
Cable joining and termination    4
Network line extension           3
Transformer installation         2
Pole replacement                 1
Name: count, dtype: int64

Faults by Priority:
Priority
Normal    10
High       5
Name: count, dtype: int64

Travel Time Statistics:
 Average travel time (Near zone): 0.469 hours
 Average travel time (Far zone): 0.748 hours

Repair Time Statistics:
 Average repair time: 1.338 hours
 Shortest repair: 0.330 hours (Cable joining and termination)
 Longest repair: 4.000 hours (Transformer installation)


In [16]:
# SAVE FILES
# Create the output folder first: DataFrame.to_csv raises OSError when the
# target directory does not exist (e.g. on a fresh checkout).
output_dir = Path('../dataset')
output_dir.mkdir(parents=True, exist_ok=True)

df_faults.to_csv(output_dir / 'ecg_faults_dataset.csv', index=False)
df_towns.to_csv(output_dir / 'ecg_towns_dataset.csv', index=False)

print("\n" + "=" * 80)
print("FILES SAVED")
print("=" * 80)
# Report the actual row counts so the message stays true when the number of
# simulated faults or towns changes.
print(f"✅ ecg_faults_dataset.csv ({len(df_faults)} faults with real ECG service types)")
print(f"✅ ecg_towns_dataset.csv ({len(df_towns)} towns with zones)")
print("=" * 80)

# ============================================================================
# ADDITIONAL ANALYSIS FOR YOUR PAPER
# ============================================================================
# Crew parameters for the workload estimate. They were not assigned anywhere
# earlier in the notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): 5 groups and an 8-hour shift are taken from the recorded
# feasibility-check output of this notebook — confirm against the model setup.
num_groups = 5
shift_hours = 8

# Largest accepted ratio of Far-zone to Near-zone average travel time.
equity_ratio_limit = 1.5

print("\n" + "=" * 80)
print("ADDITIONAL INSIGHTS FOR YOUR RESEARCH PAPER")
print("=" * 80)

print("\n1️⃣  SERVICE TYPE COMPLEXITY:")
repair_hours = df_faults['Repair_time_hours']
total_faults = len(df_faults)
# Boundaries are inclusive on the upper side: exactly 1.0 h counts as quick,
# exactly 2.5 h counts as medium.
quick_services = df_faults[repair_hours <= 1.0]
medium_services = df_faults[(repair_hours > 1.0) & (repair_hours <= 2.5)]
complex_services = df_faults[repair_hours > 2.5]

print(f"   Quick (≤1 hr): {len(quick_services)} faults ({len(quick_services)/total_faults*100:.1f}%)")
print(f"   Medium (1-2.5 hrs): {len(medium_services)} faults ({len(medium_services)/total_faults*100:.1f}%)")
print(f"   Complex (>2.5 hrs): {len(complex_services)} faults ({len(complex_services)/total_faults*100:.1f}%)")

print("\n2️⃣  WORKLOAD ESTIMATION:")
total_travel_time = df_faults['Travel_time_hours'].sum()
total_repair_time = repair_hours.sum()
total_work_time = total_travel_time + total_repair_time
# Average load if the work were split evenly across all groups.
avg_work_per_group = total_work_time / num_groups

print(f"   Total travel time: {total_travel_time:.2f} hours")
print(f"   Total repair time: {total_repair_time:.2f} hours")
print(f"   Total work time: {total_work_time:.2f} hours")
print(f"   Average per group (if distributed equally): {avg_work_per_group:.2f} hours")

if avg_work_per_group <= shift_hours:
    print(f"   ✅ Workload is reasonable (avg {avg_work_per_group:.2f}h ≤ {shift_hours}h shift)")
else:
    print(f"   ⚠️  Workload may be tight (avg {avg_work_per_group:.2f}h approaching {shift_hours}h shift)")

print("\n3️⃣  ZONE EQUITY BASELINE:")
near_faults = df_faults[df_faults['Zone'] == 'Near']
far_faults = df_faults[df_faults['Zone'] == 'Far']
# The ratio is only meaningful when both zones have at least one fault.
if len(near_faults) > 0 and len(far_faults) > 0:
    near_avg = near_faults['Travel_time_hours'].mean()
    far_avg = far_faults['Travel_time_hours'].mean()
    ratio = far_avg / near_avg
    print(f"   Near zone avg travel: {near_avg:.3f} hours ({near_avg*60:.0f} min)")
    print(f"   Far zone avg travel: {far_avg:.3f} hours ({far_avg*60:.0f} min)")
    print(f"   Current ratio: {ratio:.2f}x")
    print(f"   Equity constraint: ≤ {equity_ratio_limit}x")
    if ratio > equity_ratio_limit:
        print(f"   ⚠️  Natural geographic inequity detected - optimization will help!")
    else:
        print(f"   ✅ Current distribution is within equity bounds")
else:
    print("   Skipped: at least one fault is needed in each zone to compare travel times")

print("\n" + "=" * 80)


FEASIBILITY CHECK
Available capacity: 5 groups × 3 faults = 15 faults
Required capacity: 15 faults
✅ FEASIBLE: 15 ≥ 15

Longest single fault:
  F12: Ve -Gbodome - Transformer installation
  Total time: 4.667 hours (Travel: 0.667h + Repair: 4.000h)
✅ All faults fit within 8-hour shift
