<a href="https://colab.research.google.com/github/intanelaqsha/Grievances-Event/blob/main/Grievances_Event_4step.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

What's new?

- "Limited Access to Services" is now classified under the Social issue group
- Fixed overlap handling for grievances that share a source but concern unrelated issues [internal monitoring]

STEP 1 - Expand multiple sources (one row per source)

In [None]:
import pandas as pd
import ast
import re

# Load the raw grievance export; dtype=str keeps every column as text.
df = pd.read_csv("Grievances-Grid view 3.csv", dtype=str)
# Trim stray whitespace around the header names.
df.columns = df.columns.map(str.strip)

# Raw_ID holds each original row's index label. Expanded rows keep it, so every
# expanded row can be traced back to the grievance it came from.
# (The original note says this ID is meant to be used through Step 3.)
df = df.assign(Raw_ID=df.index.astype(int))

# -----------------------------------------
# CLEAN + SPLIT function
# -----------------------------------------
def split_list(cell):
    """Parse a multi-value cell into a list of distinct, trimmed items.

    Square brackets are removed, the text is split on "," or ";", each piece is
    stripped, and empty pieces are discarded.

    Args:
        cell: Raw cell value (normally a string; may be NaN/None).

    Returns:
        list[str]: Distinct items in order of first appearance. Missing or
        blank cells give an empty list.
    """
    if pd.isna(cell) or str(cell).strip() == "":
        return []
    s = str(cell).replace("[", "").replace("]", "")
    parts = (p.strip() for p in re.split("[,;]", s))
    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # would return the items in an arbitrary order that can differ between runs.
    return list(dict.fromkeys(p for p in parts if p))

# Columns whose cells pack several values into one string
multi_cols = ["Suppliers", "Mills", "PIOConcessions", "Issues"]

# Replace each of those columns with its parsed list form.
for col in multi_cols:
    df[col] = [split_list(value) for value in df[col]]

# Source split (SPECIAL rule)
def split_source(val):
    """Split a Source cell into its individual sources.

    Only a comma that is NOT immediately followed by whitespace separates two
    sources, so a ", " inside a source stays part of that source. Unlike
    split_list, square brackets are not removed and ";" is not a separator.

    Args:
        val: Raw Source cell (normally a string; may be NaN/None).

    Returns:
        list[str]: Trimmed, non-empty sources in their original order. Missing
        or blank cells give an empty list.
    """
    if pd.isna(val):
        return []
    text = str(val).strip()
    if text == "":
        return []
    pieces = (chunk.strip() for chunk in re.split(r",(?!\s)", text))
    return [chunk for chunk in pieces if chunk]

# Parse the Source column into a list of individual sources.
df["Source"] = df["Source"].apply(split_source)

# -----------------------------------------
# Step 1: Expand Source → 1 row per source
# -----------------------------------------
expanded_rows = []

for idx, row in df.iterrows():
    # A grievance without any source still yields exactly one row (Source = None).
    for src in (row["Source"] or [None]):
        new_row = row.copy()     # every other column, Raw_ID included, is carried over
        new_row["Source"] = src  # exactly one source per expanded row
        expanded_rows.append(new_row)

# Each copied row keeps the index label of the grievance it came from, so all
# rows expanded from one grievance share a label. Renumber 0..n-1 before
# deriving Row_ID; otherwise Row_ID repeats and is not unique.
df_expanded = pd.DataFrame(expanded_rows).reset_index(drop=True)

# Unique Row_ID per expanded row (the original note says it is for Steps 2–3)
df_expanded["Row_ID"] = df_expanded.index.astype(int)

print("Original grievances:", df.shape[0])
print("Expanded rows:", df_expanded.shape[0])
df_expanded.head()



Original grievances: 2497
Expanded rows: 3487


Unnamed: 0,ID,Company Tracker,Tracker ID,Company Tracker ID,Suppliers,Tier 1 Suppliers,Indirect Suppliers,Responsible Company,Linked Grievances,Concessions (OLD),...,Responsible Company Count,New Supplier Count,New vs Old Supplier Check,Old Concession Lat,Old Concession Lon,Update Dates,Last Modified Year,New Issue Tags,Raw_ID,Row_ID
0,AAK 1,AAK,1,,"[AAA, ASG, Wilmar]",,,,,,...,0,0,-3,,,6/1/2018,2025,,0,0
1,AAK 2,AAK,2,,[Anglo Eastern Plantations (AEP)],,,Anglo Eastern Plantations (AEP),,Kahayan Agro Plantation,...,1,1,0,-0.961,113.5,"12/1/2017, 3/1/2019, 12/1/2019, 1/1/2020, 1/3/...",2025,,1,1
1,AAK 2,AAK,2,,[Anglo Eastern Plantations (AEP)],,,Anglo Eastern Plantations (AEP),,Kahayan Agro Plantation,...,1,1,0,-0.961,113.5,"12/1/2017, 3/1/2019, 12/1/2019, 1/1/2020, 1/3/...",2025,,1,1
1,AAK 2,AAK,2,,[Anglo Eastern Plantations (AEP)],,,Anglo Eastern Plantations (AEP),,Kahayan Agro Plantation,...,1,1,0,-0.961,113.5,"12/1/2017, 3/1/2019, 12/1/2019, 1/1/2020, 1/3/...",2025,,1,1
1,AAK 2,AAK,2,,[Anglo Eastern Plantations (AEP)],,,Anglo Eastern Plantations (AEP),,Kahayan Agro Plantation,...,1,1,0,-0.961,113.5,"12/1/2017, 3/1/2019, 12/1/2019, 1/1/2020, 1/3/...",2025,,1,1


STEP 2 - Merge grievances that share the same source and overlapping entities

In [None]:
# Work on a copy of the Step 1 result so df_expanded itself is not modified here.
df2 = df_expanded.copy()

# Helper: de-duplicate, then sort
def uniq_list(x):
    """Return the distinct items of ``x`` as a new list in ascending order."""
    distinct = list(set(x))
    distinct.sort()
    return distinct

def _earliest_date_filed(date_strings):
    """Return the entry of ``date_strings`` that is chronologically earliest.

    The dates are text (the CSV is read with dtype=str), e.g. "7/1/2020" and
    "12/30/2020". Comparing them as text would rank "12/30/2020" before
    "7/1/2020", so each entry is parsed with pd.to_datetime before comparing.

    Entries that cannot be parsed are ignored. If none can be parsed, the first
    entry is returned unchanged (None for an empty list).
    """
    best_raw, best_ts = None, None
    for raw in date_strings:
        ts = pd.to_datetime(raw, errors="coerce")
        if pd.isna(ts):
            continue
        if best_ts is None or ts < best_ts:
            best_raw, best_ts = raw, ts
    if best_ts is None:
        return date_strings[0] if date_strings else None
    return best_raw


events = []
event_id = 1

# ---- CLUSTER PER SOURCE ----
# NOTE(review): groupby drops rows whose Source is missing (pandas default
# dropna=True), so the source-less rows that Step 1 keeps never become events
# here. Confirm that this is intended.
for source, group in df2.groupby("Source"):
    group = group.reset_index(drop=True)

    # Events built so far for this one source
    source_events = []

    for idx, row in group.iterrows():

        # Entities of this row
        sup = set(row["Suppliers"])
        mil = set(row["Mills"])
        pio = set(row["PIOConcessions"])
        iss = set(row["Issues"])

        # IMPORTANT → the original grievance ID
        gid = row["ID"]

        # Original Date Filed (text)
        date_filed = row["Date Filed"]

        merged = False

        # Look for an existing event of this source that shares an entity.
        # Only the first overlapping event is used (greedy, single pass).
        for evt in source_events:

            # Issues are deliberately not part of the overlap test.
            overlap = (
                len(sup & set(evt["Suppliers"])) > 0 or
                len(mil & set(evt["Mills"])) > 0 or
                len(pio & set(evt["PIOConcessions"])) > 0
            )

            if overlap:
                # Merge entities
                evt["Suppliers"] = uniq_list(list(set(evt["Suppliers"]) | sup))
                evt["Mills"] = uniq_list(list(set(evt["Mills"]) | mil))
                evt["PIOConcessions"] = uniq_list(list(set(evt["PIOConcessions"]) | pio))
                evt["Issues"] = uniq_list(list(set(evt["Issues"]) | iss))

                # Merge grievance list
                evt["Grievance_List"].append(gid)
                evt["Grievance_List"] = uniq_list(evt["Grievance_List"])
                evt["Grievance_Count"] = len(evt["Grievance_List"])

                # Merge Date Filed → keep the chronologically earliest one.
                # The list itself stays text (it is joined into a string later).
                evt["Date Filed_List"].append(date_filed)
                evt["Date Filed_List"] = uniq_list(evt["Date Filed_List"])
                evt["Date Filed"] = _earliest_date_filed(evt["Date Filed_List"])

                merged = True
                break

        # No overlap → start a new event
        if not merged:
            source_events.append({
                "Event_ID": f"EVT_{event_id}",
                "Source": source,

                "Suppliers": uniq_list(list(sup)),
                "Mills": uniq_list(list(mil)),
                "PIOConcessions": uniq_list(list(pio)),
                "Issues": uniq_list(list(iss)),

                "Grievance_List": [gid],
                "Grievance_Count": 1,

                # Keep both the list of Date Filed values and the final date
                "Date Filed_List": [date_filed],
                "Date Filed": date_filed
            })
            event_id += 1

    events.extend(source_events)

# Turn the collected events into a DataFrame
df_step2 = pd.DataFrame(events)

# Flatten every list column into a ", "-separated string of sorted unique items
for col in ["Suppliers", "Mills", "PIOConcessions", "Issues", "Grievance_List", "Date Filed_List"]:
    df_step2[col] = [", ".join(uniq_list(items)) for items in df_step2[col]]

# Persist for Step 3, then report the event count and preview
df_step2.to_csv("Step2.csv", index=False)
print('total', len(df_step2))
df_step2.head(30)



total 1376


Unnamed: 0,Event_ID,Source,Suppliers,Mills,PIOConcessions,Issues,Grievance_List,Grievance_Count,Date Filed_List,Date Filed
0,EVT_1,"""","Aspirasi Kristal (M) Sdn Bhd, Instantstar",,11503,Deforestation,"Cargill 78, IOI 26",2,"7/1/2020, 8/1/2020",7/1/2020
1,EVT_2,"""",IJM,,11512,Deforestation,Mewah 26,1,7/10/2020,7/10/2020
2,EVT_3,"""""""Cash Investigation"""" révèle qu’Unicef Franc...","Cargill, Erasakti Wira Forestama, Kurnia Tungg...","PO1000004103, PO1000008383, PO1000010014","4335, 4336, 4337, 4338","Forced Labor and/or Child Labor, Labor Rights ...","Bunge 113, Cargill 98, GAR 146, KLK 16, LIPSA ...",6,"10/22/2022, 11/25/2022, 11/7/2022, 12/23/2022,...",10/22/2022
3,EVT_4,"""A palm oil company, a group of US financiers,...",,,,Deforestation,Ocho Sur 11,1,12/31/2024,12/31/2024
4,EVT_5,"""APT to Dismantle Power of industrial Plantati...",,,,"Deforestation, Labor Rights Violations, Land D...",Wilmar 150,1,11/23/2020,11/23/2020
5,EVT_6,"""ASTRA AGRO LESTARI EXPANDS\nINDONESIAN FOREST...",Astra Agro Lestari (AAL),,"5603, 6658, 7295",Land Dispute,LDC 59,1,4/1/2024,4/1/2024
6,EVT_7,"""Allegedly Violating Freedom of Association, P...",DAP,,,Labor Rights Violations,GAR 89,1,8/14/2017,8/14/2017
7,EVT_8,"""Amazon palm - Major international brands sour...","AGROPALMA, Brasil BioFuels (BBF)",,,Land Grabbing,Colgate 23,1,9/28/2023,9/28/2023
8,EVT_9,"""Anti-trafficking group urges US ban on Sime D...",Sime Darby Guthrie (SDGI),,,"Forced Labor and/or Child Labor, Gender and Et...","GAR 60, IOI 13, Olam 1",3,"12/30/2020, 7/1/2020, 7/8/2020",12/30/2020
9,EVT_10,"""Aspirasi Kristal’s Forest Clearance for Oil P...","Aspirasi Kristal (M) Sdn Bhd, Instantstar",,11503,Deforestation,"Cargill 78, IOI 26",2,"7/1/2020, 8/1/2020",7/1/2020


STEP 3 - Expand Each Issue Group

In [None]:
# =========================================
# Load the Step 2 output (Step2.csv), all columns as text
# =========================================
df2 = pd.read_csv("Step2.csv", dtype=str)
# Trim stray whitespace around the header names.
df2.columns = df2.columns.map(str.strip)

# Convert a comma-separated cell into a Python list
def to_list(cell):
    """Split a comma-separated cell into a list of trimmed, non-empty items.

    Args:
        cell: Cell value; NaN/None means "no items". Any other value is
            converted with str() first, so a non-string scalar no longer
            raises AttributeError.

    Returns:
        list[str]: Items in their original order (duplicates kept). Missing or
        blank cells give an empty list.
    """
    if pd.isna(cell):
        return []
    # A blank string splits into empty pieces, which the filter removes,
    # so no separate blank check is needed.
    return [x.strip() for x in str(cell).split(",") if x.strip()]

list_cols = ["Suppliers", "Mills", "PIOConcessions", "Issues", "Grievance_List"]
for col in list_cols:
    df2[col] = [to_list(value) for value in df2[col]]


# =========================================
# Load the original grievance export again ("Grievances-Grid view 3.csv");
# it provides the "Issues Combined" column
# =========================================
df_original_grievances = pd.read_csv("Grievances-Grid view 3.csv", dtype=str)
df_original_grievances.columns = df_original_grievances.columns.map(str.strip)

# Parse "Issues Combined" into a list per grievance
df_original_grievances["Issues Combined"] = [
    to_list(value) for value in df_original_grievances["Issues Combined"]
]

# Lookup table: original grievance ID -> its "Issues Combined" list
issues_by_id = df_original_grievances.set_index('ID')['Issues Combined']
id_to_issues_combined_map = issues_by_id.to_dict()

# Collect the 'Issues Combined' of every grievance that makes up one event
def get_event_issues_combined(grievance_list_ids):
    """Return the sorted, de-duplicated union of 'Issues Combined' for the given IDs.

    IDs missing from id_to_issues_combined_map contribute nothing.

    Args:
        grievance_list_ids: Iterable of grievance IDs belonging to one event.

    Returns:
        list: Distinct issues in ascending order (empty if nothing was found).
    """
    pooled = set()
    for gid in grievance_list_ids:
        pooled.update(id_to_issues_combined_map.get(gid, ()))
    return sorted(pooled)

# Attach, per event, the pooled 'Issues Combined' of its grievances.
df2['Issues Combined'] = [
    get_event_issues_combined(ids) for ids in df2['Grievance_List']
]

# The code below refers to the event table as `df`.
# Note: this rebinds `df`, which Step 1 used for the raw grievance table.
df = df2


# =========================================
# STEP 3 – ISSUE CATEGORY DICTIONARY
# =========================================
# Category -> the issue names that belong to it
ISSUE_MAP = {
    "Environmental": [
        "Deforestation",
        "Peatland Loss",
        "Fires",
        "Riparian Issues",
        "Biodiversity loss",
        "Environmental Pollution",
    ],
    "Social": [
        "Labor Rights Violations",
        "Violence and/or Coercion",
        "Gender and Ethnic Disparities",
        "Human Rights Violation",
        "Labor Disputes",
        "Wage Dispute",
        "Forced Labor and/or Child Labor",
        "Limited Access to Services",
    ],
    "Land Conflict": [
        "Land Dispute",
        "Land Grabbing",
        "Indigenous Peoples Conflict",
    ],
    "Governance": [
        "Corruption",
        "Illegal Infrastructure",
        "Infrastructure Damage",
    ],
}

# Reverse lookup (lower-cased issue name -> category) for case-insensitive matching
ISSUE_TO_GROUP = {
    issue_name.lower(): category
    for category, issue_names in ISSUE_MAP.items()
    for issue_name in issue_names
}


# =========================================
# STEP 3 – BUILD ONE EVENT PER (STEP 2 EVENT, ISSUE CATEGORY)
# =========================================
final_rows = []
new_eid = 1

for _, event in df.iterrows():
    combined = event["Issues Combined"]
    # Anything that is not a list counts as "no issues"; blank entries are dropped.
    cleaned = [name.strip() for name in combined if name.strip()] if isinstance(combined, list) else []

    # Bucket the issues by category; names missing from ISSUE_TO_GROUP go to "Other".
    by_category = {}
    for name in cleaned:
        category = ISSUE_TO_GROUP.get(name.lower(), "Other")
        by_category.setdefault(category, []).append(name)

    # One output row per category. An event with no issues yields no rows at all.
    for category, members in by_category.items():
        final_rows.append({
            "Event_ID_S3": f"EVT3_{new_eid}",
            "Original_Event_ID": event["Event_ID"],
            "Issue_Category": category,
            "Issues": ", ".join(sorted(set(members))),
            "Suppliers": ", ".join(event["Suppliers"]),
            "Mills": ", ".join(event["Mills"]),
            "PIOConcessions": ", ".join(event["PIOConcessions"]),
            "Grievance_List": ", ".join(event["Grievance_List"]),
            "Grievance_Count": event["Grievance_Count"],
            "Source": event["Source"],
            "Date_Filed": event["Date Filed"],
        })
        new_eid += 1

# Final output of Step 3
df_step3 = pd.DataFrame(final_rows)
df_step3.to_csv("Step3.csv", index=False)

print("Step 3 selesai. Total events:", len(df_step3))
df_step3.head(40)

Step 3 selesai. Total events: 1697


Unnamed: 0,Event_ID_S3,Original_Event_ID,Issue_Category,Issues,Suppliers,Mills,PIOConcessions,Grievance_List,Grievance_Count,Source,Date_Filed
0,EVT3_1,EVT_1,Environmental,Deforestation,"Aspirasi Kristal (M) Sdn Bhd, Instantstar",,11503,"Cargill 78, IOI 26",2,"""",7/1/2020
1,EVT3_2,EVT_2,Environmental,Deforestation,IJM,,11512,Mewah 26,1,"""",7/10/2020
2,EVT3_3,EVT_3,Social,"Forced Labor and/or Child Labor, Labor Rights ...","Cargill, Erasakti Wira Forestama, Kurnia Tungg...","PO1000004103, PO1000008383, PO1000010014","4335, 4336, 4337, 4338","Bunge 113, Cargill 98, GAR 146, KLK 16, LIPSA ...",6,"""""""Cash Investigation"""" révèle qu’Unicef Franc...",10/22/2022
3,EVT3_4,EVT_3,Land Conflict,Land Dispute,"Cargill, Erasakti Wira Forestama, Kurnia Tungg...","PO1000004103, PO1000008383, PO1000010014","4335, 4336, 4337, 4338","Bunge 113, Cargill 98, GAR 146, KLK 16, LIPSA ...",6,"""""""Cash Investigation"""" révèle qu’Unicef Franc...",10/22/2022
4,EVT3_5,EVT_4,Environmental,Deforestation,,,,Ocho Sur 11,1,"""A palm oil company, a group of US financiers,...",12/31/2024
5,EVT3_6,EVT_5,Environmental,Deforestation,,,,Wilmar 150,1,"""APT to Dismantle Power of industrial Plantati...",11/23/2020
6,EVT3_7,EVT_5,Social,Labor Rights Violations,,,,Wilmar 150,1,"""APT to Dismantle Power of industrial Plantati...",11/23/2020
7,EVT3_8,EVT_5,Land Conflict,"Land Dispute, Land Grabbing",,,,Wilmar 150,1,"""APT to Dismantle Power of industrial Plantati...",11/23/2020
8,EVT3_9,EVT_6,Land Conflict,Land Dispute,Astra Agro Lestari (AAL),,"5603, 6658, 7295",LDC 59,1,"""ASTRA AGRO LESTARI EXPANDS\nINDONESIAN FOREST...",4/1/2024
9,EVT3_10,EVT_7,Social,Labor Rights Violations,DAP,,,GAR 89,1,"""Allegedly Violating Freedom of Association, P...",8/14/2017


STEP 4 - Clustering: merge events that are in the same issue group, share entities, and have filing dates within the time window

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# =========================================
# Load the Step 3 output (Step3.csv), all columns as text
# =========================================
df3 = pd.read_csv("Step3.csv", dtype=str)
# Trim stray whitespace around the header names.
df3.columns = df3.columns.map(str.strip)

# Convert a comma-separated cell into a Python list
def to_list(cell):
    """Split a comma-separated cell into a list of trimmed, non-empty items.

    Args:
        cell: Cell value; NaN/None means "no items". Any other value is
            converted with str() first, so a non-string scalar no longer
            raises AttributeError.

    Returns:
        list[str]: Items in their original order (duplicates kept). Missing or
        blank cells give an empty list.
    """
    if pd.isna(cell):
        return []
    # A blank string splits into empty pieces, which the filter removes,
    # so no separate blank check is needed.
    return [x.strip() for x in str(cell).split(",") if x.strip()]

def _single_source(cell):
    """Wrap a Step 3 Source cell in a one-item list ([] when missing or blank).

    Each Step 3 row carries exactly one source (Step 1 expands to one source per
    row and Step 2 groups by it), and its title may contain ", ". Splitting it
    on commas would break the title into fragments that the sorted() union
    during merging then reorders, so the title is kept whole.
    """
    if pd.isna(cell) or str(cell).strip() == "":
        return []
    return [str(cell).strip()]


list_cols = ["Suppliers", "Mills", "PIOConcessions", "Grievance_List", "Source"]
for col in list_cols:
    if col == "Source":
        df3[col] = df3[col].apply(_single_source)
    else:
        df3[col] = df3[col].apply(to_list)

# Parse Date_Filed into datetimes; unparseable values become NaT
df3["Date_Filed"] = pd.to_datetime(df3["Date_Filed"], errors='coerce')

# =========================================
# Step 4 – Merge events by shared entities + issue category + time window
# =========================================
merged_events = []   # final result, filled one category at a time
mhid_id = 1          # running counter for the MHID_<n> identifiers

# Time window per Issue Category
def in_time_window(issue_cat, date1, date2):
    """Tell whether two filing dates are close enough to belong to one event.

    "Environmental" events must fall in the same calendar year; every other
    category allows at most 60 days between the two dates.
    A missing date (NaT/NaN) never matches.
    """
    if pd.isna(date1) or pd.isna(date2):
        return False
    if issue_cat != "Environmental":
        # 2 months range
        return abs((date1 - date2).days) <= 60
    return date1.year == date2.year

# Process each Issue Category on its own
for cat, group in df3.groupby("Issue_Category"):
    # Oldest filing date first
    chronological = group.sort_values("Date_Filed").reset_index(drop=True)

    # Merged events built so far for this category
    active_events = []

    for _, row in chronological.iterrows():
        filed = row["Date_Filed"]

        # Find the first existing event that this row may join (greedy, first match wins).
        host = None
        for candidate in active_events:
            # Number of shared entities, summed over suppliers, mills and concessions
            shared = sum(
                len(set(row[field]) & set(candidate[field]))
                for field in ("Suppliers", "Mills", "PIOConcessions")
            )
            # Needs at least 2 shared entities and a date within the window of the
            # candidate's Earliest_Date.
            if shared >= 2 and in_time_window(cat, filed, candidate["Earliest_Date"]):
                host = candidate
                break

        # Nothing to merge into → this row starts a new event
        if host is None:
            active_events.append({
                "MHID": f"MHID_{mhid_id}",
                "Issue_Category": cat,
                "Suppliers": row["Suppliers"],
                "Mills": row["Mills"],
                "PIOConcessions": row["PIOConcessions"],
                "Source": row["Source"],
                "Grievance_List": row["Grievance_List"],
                "Grievance_Count": row["Grievance_Count"],
                "Earliest_Date": filed,
                "Latest_Date": filed,
            })
            mhid_id += 1
            continue

        # Merge: every list field becomes the sorted union of both sides
        for field in ("Suppliers", "Mills", "PIOConcessions", "Source", "Grievance_List"):
            host[field] = sorted(set(host[field]) | set(row[field]))
        host["Grievance_Count"] = len(host["Grievance_List"])
        # Widen the event's date range
        host["Earliest_Date"] = min(host["Earliest_Date"], filed)
        host["Latest_Date"] = max(host["Latest_Date"], filed)

    # Category finished → move its events into the overall result
    merged_events.extend(active_events)

# Turn the merged events into a DataFrame
df_step4 = pd.DataFrame(merged_events)

# Flatten list columns into ", "-separated strings for the CSV
for col in ["Suppliers", "Mills", "PIOConcessions", "Source", "Grievance_List"]:
    df_step4[col] = [", ".join(items) for items in df_step4[col]]

# Format both date columns as YYYY-MM-DD text
for date_col in ("Earliest_Date", "Latest_Date"):
    df_step4[date_col] = df_step4[date_col].dt.strftime("%Y-%m-%d")

# Save output
df_step4.to_csv("Step4_MergedEvents.csv", index=False)

print("Step 4 selesai. Total merged events:", len(df_step4))
df_step4.head(20)

Step 4 selesai. Total merged events: 1107


Unnamed: 0,MHID,Issue_Category,Suppliers,Mills,PIOConcessions,Source,Grievance_List,Grievance_Count,Earliest_Date,Latest_Date
0,MHID_1,Environmental,IOI,,,Burning Down the House,Bunge 54,1,2009-12-20,2009-12-20
1,MHID_2,Environmental,IOI,PO1000000109,"3943, 4521, 7004","Aidenvironment Complaint - RSPO, RSPO Complain...",Sime Darby 22,1,2010-03-01,2010-03-01
2,MHID_3,Environmental,"Bunge, IOI",PO1000003833,13498,Dirty Bankers - How HSBC is Financing Forest D...,"Cargill 5, Unilever 12",2,2010-03-01,2010-03-01
3,MHID_4,Environmental,NOBLE,,,Dirty Bankers - How HSBC is Financing Forest D...,Bunge 4,1,2010-12-01,2010-12-01
4,MHID_5,Environmental,"BLD, Palmco Oil Mill",,13061,"Rapid Response 11, Rapid Response 4, Rapid Res...","AAK 3, ADM 10, BLD Plantation 1, Bunge 10, IOI...",8,2010-12-01,2010-12-01
5,MHID_6,Environmental,"IJM, IOI",,,IOI Statement Link,Bunge 3,1,2010-12-01,2010-12-01
6,MHID_7,Environmental,HSA,,,Rapid Response 9,Bunge 2,1,2010-12-01,2010-12-01
7,MHID_8,Environmental,HSA,,,Rapid Response 15,Bunge 2,1,2010-12-01,2010-12-01
8,MHID_9,Environmental,Gagah Putera Satria (GPS),,6536,Burning Down the House,"Bunge 17, GAR 7",2,2010-12-01,2010-12-01
9,MHID_10,Environmental,HSA,,,Rapid Response 12,Bunge 2,1,2010-12-01,2010-12-01


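USE THIS ONE (PAKE YANG INI) - Step 4 variant: at least 1 shared entity, time window measured against the event's latest date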

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# =========================================
# Load the Step 3 output (Step3.csv), all columns as text
# =========================================
df3 = pd.read_csv("Step3.csv", dtype=str)
# Trim stray whitespace around the header names.
df3.columns = df3.columns.map(str.strip)

# Convert a comma-separated cell into a Python list
def to_list(cell):
    """Split a comma-separated cell into a list of trimmed, non-empty items.

    Args:
        cell: Cell value; NaN/None means "no items". Any other value is
            converted with str() first, so a non-string scalar no longer
            raises AttributeError.

    Returns:
        list[str]: Items in their original order (duplicates kept). Missing or
        blank cells give an empty list.
    """
    if pd.isna(cell):
        return []
    # A blank string splits into empty pieces, which the filter removes,
    # so no separate blank check is needed.
    return [x.strip() for x in str(cell).split(",") if x.strip()]

def _single_source(cell):
    """Wrap a Step 3 Source cell in a one-item list ([] when missing or blank).

    Each Step 3 row carries exactly one source (Step 1 expands to one source per
    row and Step 2 groups by it), and its title may contain ", ". Splitting it
    on commas would break the title into fragments that the sorted() union
    during merging then reorders, so the title is kept whole.
    """
    if pd.isna(cell) or str(cell).strip() == "":
        return []
    return [str(cell).strip()]


list_cols = ["Suppliers", "Mills", "PIOConcessions", "Grievance_List", "Source"]
for col in list_cols:
    if col == "Source":
        df3[col] = df3[col].apply(_single_source)
    else:
        df3[col] = df3[col].apply(to_list)

# Parse Date_Filed into datetimes; unparseable values become NaT
df3["Date_Filed"] = pd.to_datetime(df3["Date_Filed"], errors='coerce')

# =========================================
# Step 4 – Merge events by shared entities + issue category + time window
# =========================================
merged_events = []   # final result, filled one category at a time
mhid_id = 1          # running counter for the MHID_<n> identifiers

# Time window per Issue Category
def in_time_window(issue_cat, date_new, date_latest):
    """Tell whether a new filing date is close enough to an event's reference date.

    "Environmental" events must fall in the same calendar year; every other
    category allows at most 60 days between the two dates.
    A missing date (NaT/NaN) never matches.
    """
    if pd.isna(date_new) or pd.isna(date_latest):
        return False
    if issue_cat != "Environmental":
        # 2 months range
        return abs((date_new - date_latest).days) <= 60
    return date_new.year == date_latest.year

# Process each Issue Category on its own
for cat, group in df3.groupby("Issue_Category"):
    # Oldest filing date first
    chronological = group.sort_values("Date_Filed").reset_index(drop=True)

    # Merged events built so far for this category
    active_events = []

    for _, row in chronological.iterrows():
        filed = row["Date_Filed"]

        # Find the first existing event that this row may join (greedy, first match wins).
        host = None
        for candidate in active_events:
            # Number of shared entities, summed over suppliers, mills and concessions
            shared = sum(
                len(set(row[field]) & set(candidate[field]))
                for field in ("Suppliers", "Mills", "PIOConcessions")
            )
            # Merge condition as coded: at least 1 shared entity, and a date within
            # the window of the candidate's Latest_Date.
            # NOTE(review): the earlier comment here said "minimum 2 entities";
            # the code uses >= 1. Confirm which threshold is intended.
            if shared >= 1 and in_time_window(cat, filed, candidate["Latest_Date"]):
                host = candidate
                break

        # Nothing to merge into → this row starts a new event
        if host is None:
            active_events.append({
                "MHID": f"MHID_{mhid_id}",
                "Issue_Category": cat,
                "Suppliers": row["Suppliers"],
                "Mills": row["Mills"],
                "PIOConcessions": row["PIOConcessions"],
                "Source": row["Source"],
                "Grievance_List": row["Grievance_List"],
                "Grievance_Count": row["Grievance_Count"],
                "Earliest_Date": filed,
                "Latest_Date": filed,
            })
            mhid_id += 1
            continue

        # Merge: every list field becomes the sorted union of both sides
        for field in ("Suppliers", "Mills", "PIOConcessions", "Source", "Grievance_List"):
            host[field] = sorted(set(host[field]) | set(row[field]))
        host["Grievance_Count"] = len(host["Grievance_List"])
        # Widen the event's date range
        host["Earliest_Date"] = min(host["Earliest_Date"], filed)
        host["Latest_Date"] = max(host["Latest_Date"], filed)

    # Add this category's events to the overall result
    merged_events.extend(active_events)

# Turn the merged events into a DataFrame
df_step4 = pd.DataFrame(merged_events)

# Flatten list columns into ", "-separated strings for the CSV
for col in ["Suppliers", "Mills", "PIOConcessions", "Source", "Grievance_List"]:
    df_step4[col] = [", ".join(items) for items in df_step4[col]]

# Format both date columns as YYYY-MM-DD text
for date_col in ("Earliest_Date", "Latest_Date"):
    df_step4[date_col] = df_step4[date_col].dt.strftime("%Y-%m-%d")

# Save output
df_step4.to_csv("Step4_MergedEvents_Fix.csv", index=False)

print("Step 4 selesai. Total merged events:", len(df_step4))
df_step4.head(20)

Step 4 selesai. Total merged events: 818


Unnamed: 0,MHID,Issue_Category,Suppliers,Mills,PIOConcessions,Source,Grievance_List,Grievance_Count,Earliest_Date,Latest_Date
0,MHID_1,Environmental,IOI,,,Burning Down the House,Bunge 54,1,2009-12-20,2009-12-20
1,MHID_2,Environmental,"AAA, AAK, ASI, Apical, Bunge, CRS, FORTIUS COR...","PO1000000109, PO1000003833, PO1000004300, PO10...","13498, 3858, 3943, 3949, 3950, 4521, 4615, 461...","Aidenvironment Complaint - RSPO, Dirty Bankers...","ADM 43, Apical 46, Bunge 127, Bunge 3, Bunge 4...",24,2010-03-01,2010-12-01
2,MHID_3,Environmental,HSA,,,"Rapid Response 10, Rapid Response 11, Rapid Re...",Bunge 2,1,2010-12-01,2010-12-01
3,MHID_4,Environmental,"BLD, Palmco Oil Mill",,13061,"Rapid Response 11, Rapid Response 4, Rapid Res...","AAK 3, ADM 10, BLD Plantation 1, Bunge 10, IOI...",8,2010-12-01,2010-12-01
4,MHID_5,Environmental,"Aditya Agroindo, DTK Opportunity, Kemilau Inda...",,"4852, 6837, 7350",Final Countdown : Now or Never to Reform the P...,"Apical 24, Bunge 14, Cargill 22, GAR 3, Wilmar 92",5,2010-12-01,2010-12-01
5,MHID_6,Environmental,NOBLE,,,Dirty Bankers - How HSBC is Financing Forest D...,Bunge 4,1,2010-12-01,2010-12-01
6,MHID_7,Environmental,Gagah Putera Satria (GPS),,6536,Burning Down the House,"Bunge 17, GAR 7",2,2010-12-01,2010-12-01
7,MHID_8,Environmental,,,4564,International Animal Rescue Indonesia (RSPO Co...,FR 13,1,2011-02-01,2011-02-01
8,MHID_9,Environmental,DARMEX AGRO,,,Dirty Business : How a leading RSPO palm oil p...,Bunge 31,1,2012-12-01,2012-12-01
9,MHID_10,Environmental,"AMS-GANDA, KPN",,"4042, 4057, 4534, 4799, 4966, 5075, 5085, 6741...","""Palm Oil row erupts in Sumatra, Bentangan: Je...","Apical 26, Cargill 28, Fuji 54, GAR 26, Musim ...",5,2013-10-07,2013-10-07


Step 4 variant - entity overlap of at least 2: at least 1 shared supplier AND at least 1 shared plot (PIO concession) or mill

In [None]:
import pandas as pd
from datetime import datetime, timedelta

# =========================================
# Load the Step 3 output (Step3.csv), all columns as text
# =========================================
df3 = pd.read_csv("Step3.csv", dtype=str)
# Trim stray whitespace around the header names.
df3.columns = df3.columns.map(str.strip)

# Convert a comma-separated cell into a Python list
def to_list(cell):
    """Split a comma-separated cell into a list of trimmed, non-empty items.

    Args:
        cell: Cell value; NaN/None means "no items". Any other value is
            converted with str() first, so a non-string scalar no longer
            raises AttributeError.

    Returns:
        list[str]: Items in their original order (duplicates kept). Missing or
        blank cells give an empty list.
    """
    if pd.isna(cell):
        return []
    # A blank string splits into empty pieces, which the filter removes,
    # so no separate blank check is needed.
    return [x.strip() for x in str(cell).split(",") if x.strip()]

def _single_source(cell):
    """Wrap a Step 3 Source cell in a one-item list ([] when missing or blank).

    Each Step 3 row carries exactly one source (Step 1 expands to one source per
    row and Step 2 groups by it), and its title may contain ", ". Splitting it
    on commas would break the title into fragments that the sorted() union
    during merging then reorders, so the title is kept whole.
    """
    if pd.isna(cell) or str(cell).strip() == "":
        return []
    return [str(cell).strip()]


list_cols = ["Suppliers", "Mills", "PIOConcessions", "Grievance_List", "Source"]
for col in list_cols:
    if col == "Source":
        df3[col] = df3[col].apply(_single_source)
    else:
        df3[col] = df3[col].apply(to_list)

# Parse Date_Filed into datetimes; unparseable values become NaT
df3["Date_Filed"] = pd.to_datetime(df3["Date_Filed"], errors='coerce')

# =========================================
# Step 4 – New merge logic
# =========================================
merged_events = []   # final result, filled one category at a time
mhid_id = 1          # running counter for the MHID_<n> identifiers

# ⏱ Time window rule
def in_time_window(issue_cat, date_new, date_latest):
    """Tell whether a new filing date is close enough to an event's reference date.

    The allowed gap is 90 days (3 months) for "Environmental" and 60 days
    (2 months) for every other category. A missing date (NaT/NaN) never matches.
    """
    if pd.isna(date_new) or pd.isna(date_latest):
        return False

    limit_days = 90 if issue_cat == "Environmental" else 60
    return abs((date_new - date_latest).days) <= limit_days

# Process each Issue Category on its own
for cat, group in df3.groupby("Issue_Category"):
    # Oldest filing date first
    chronological = group.sort_values("Date_Filed").reset_index(drop=True)
    active_events = []

    for _, row in chronological.iterrows():
        filed = row["Date_Filed"]

        # Find the first existing event that this row may join (greedy, first match wins).
        host = None
        for candidate in active_events:
            shared_suppliers = set(row["Suppliers"]) & set(candidate["Suppliers"])
            shared_mills = set(row["Mills"]) & set(candidate["Mills"])
            shared_plots = set(row["PIOConcessions"]) & set(candidate["PIOConcessions"])

            # ✅ New rule: at least one shared supplier AND at least one shared
            # asset (mill or concession) ...
            if not shared_suppliers:
                continue
            if not (shared_mills or shared_plots):
                continue
            # ... and a date within the window of the candidate's Latest_Date.
            if in_time_window(cat, filed, candidate["Latest_Date"]):
                host = candidate
                break

        # Nothing to merge into → this row starts a new event
        if host is None:
            active_events.append({
                "MHID": f"MHID_{mhid_id}",
                "Issue_Category": cat,
                "Suppliers": row["Suppliers"],
                "Mills": row["Mills"],
                "PIOConcessions": row["PIOConcessions"],
                "Source": row["Source"],
                "Grievance_List": row["Grievance_List"],
                "Grievance_Count": row["Grievance_Count"],
                "Earliest_Date": filed,
                "Latest_Date": filed,
            })
            mhid_id += 1
            continue

        # Merge: every list field becomes the sorted union of both sides
        for field in ("Suppliers", "Mills", "PIOConcessions", "Source", "Grievance_List"):
            host[field] = sorted(set(host[field]) | set(row[field]))
        host["Grievance_Count"] = len(host["Grievance_List"])

        # Widen the event's date range
        host["Earliest_Date"] = min(host["Earliest_Date"], filed)
        host["Latest_Date"] = max(host["Latest_Date"], filed)

    merged_events.extend(active_events)

# ========================================
# Turn the merged events into a DataFrame
# ========================================
df_step4 = pd.DataFrame(merged_events)

# Flatten list columns into ", "-separated strings
for col in ["Suppliers", "Mills", "PIOConcessions", "Source", "Grievance_List"]:
    df_step4[col] = [", ".join(items) for items in df_step4[col]]

# Format both date columns as YYYY-MM-DD text
for date_col in ("Earliest_Date", "Latest_Date"):
    df_step4[date_col] = df_step4[date_col].dt.strftime("%Y-%m-%d")

df_step4.to_csv("Step4_MergedEvents_NewLogic.csv", index=False)

print("Step 4 selesai dengan logic baru ✅")
print("Total events:", len(df_step4))

df_step4.head(20)

Step 4 selesai dengan logic baru ✅
Total events: 1189


Unnamed: 0,MHID,Issue_Category,Suppliers,Mills,PIOConcessions,Source,Grievance_List,Grievance_Count,Earliest_Date,Latest_Date
0,MHID_1,Environmental,IOI,,,Burning Down the House,Bunge 54,1,2009-12-20,2009-12-20
1,MHID_2,Environmental,IOI,PO1000000109,"3943, 4521, 7004","Aidenvironment Complaint - RSPO, RSPO Complain...",Sime Darby 22,1,2010-03-01,2010-03-01
2,MHID_3,Environmental,"Bunge, IOI",PO1000003833,13498,Dirty Bankers - How HSBC is Financing Forest D...,"Cargill 5, Unilever 12",2,2010-03-01,2010-03-01
3,MHID_4,Environmental,HSA,,,Rapid Response 7,Bunge 2,1,2010-12-01,2010-12-01
4,MHID_5,Environmental,"AAA, AAK, ASI, Apical, Bunge, CRS, FORTIUS COR...","PO1000004345, PO1000004367, PO1000004392, PO10...","4615, 4616, 5709, 6427",Investigative Report: Enough is Enough,"Bunge 46, Bunge 75, Bunge 76, Cargill 30, Carg...",8,2010-12-01,2010-12-01
5,MHID_6,Environmental,"BLD, Palmco Oil Mill",,13061,"Rapid Response 11, Rapid Response 4, Rapid Res...","AAK 3, ADM 10, BLD Plantation 1, Bunge 10, IOI...",8,2010-12-01,2010-12-01
6,MHID_7,Environmental,HSA,,,Rapid Response 9,Bunge 2,1,2010-12-01,2010-12-01
7,MHID_8,Environmental,"Aditya Agroindo, DTK Opportunity, Kemilau Inda...",,"4852, 6837, 7350",Final Countdown : Now or Never to Reform the P...,"Apical 24, Bunge 14, Cargill 22, GAR 3, Wilmar 92",5,2010-12-01,2010-12-01
8,MHID_9,Environmental,NOBLE,,,Dirty Bankers - How HSBC is Financing Forest D...,Bunge 4,1,2010-12-01,2010-12-01
9,MHID_10,Environmental,"AAK, Apical, Fuji Oil Group, Syaukath Sejahter...",PO1000004583,6771,Royal Golden Eagle Group Links Global Brands a...,"Apical 46, Bunge 127, Cargill 80, Musim Mas 56...",5,2010-12-01,2010-12-01
