<a href="https://colab.research.google.com/github/intanelaqsha/Grievances-Event/blob/main/Scenario_C_fixx.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **A. SCENARIO C STEP 1**

Collect every plot and mill mentioned in the grievances, and build one grievance group per plot/mill and issue category (ENV or SOC).

In [None]:
import pandas as pd

# Read every column as text so IDs are never reinterpreted as numbers, then
# replace missing cells with "" so the string helpers below never receive NaN.
df = pd.read_csv("Grievances-Grid view.csv", dtype=str)
df = df.fillna("")

# -------------------------
# Step 1: Load & normalize
# -------------------------

def clean_array(x):
    """Split a comma-separated string into a de-duplicated list.

    Blank input (empty or whitespace-only) gives ``[]``. Pieces that are empty
    or whitespace-only are dropped; the remaining pieces are returned exactly
    as they appear (they are NOT trimmed), in first-seen order.
    """
    if not x.strip():
        return []
    ordered = {}
    for piece in x.split(","):
        if piece.strip():
            # dict keys keep insertion order, which gives an ordered "set"
            ordered.setdefault(piece, None)
    return list(ordered)

# Remove every space before splitting so "1, 2" and "1,2" produce the same IDs.
for raw_col, array_col in (("PIOConcessions-v2", "plots_array"), ("Mills", "mills_array")):
    df[array_col] = df[raw_col].map(lambda cell: clean_array(cell.replace(" ", "")))

# Source: reproduce DuckDB logic exactly
def split_source(src):
    """Split a ``Source`` cell on commas that are not followed by a space.

    A comma followed by a space is treated as part of one source name and is
    kept inside the item. Each item is trimmed, blank items are dropped, and
    duplicates are removed while keeping first-seen order.
    """
    if not src.strip():
        return []
    placeholder = "<COMMA_SPACE>"
    items = []
    # Hide ", " behind a placeholder so that only bare commas act as separators.
    for chunk in src.replace(", ", placeholder).split(","):
        chunk = chunk.strip()
        if chunk == "":
            continue
        items.append(chunk.replace(placeholder, ", "))
    return list(dict.fromkeys(items))

df["source_array"] = df["Source"].map(split_source)

# Issues are plain comma-separated labels: trim each one, drop blanks and
# keep the first occurrence of every label.
df["issues_array"] = df["Issues"].map(
    lambda cell: list(dict.fromkeys(label.strip() for label in cell.split(",") if label.strip()))
)

# -------------------------
# Step 2: Unnest arrays
# -------------------------

# Long-format tables: one row per (grievance ID, list element).
g_plots = df[["ID", "plots_array"]].explode("plots_array").rename(columns={"plots_array": "plot"})
g_mills = df[["ID", "mills_array"]].explode("mills_array").rename(columns={"mills_array": "mill"})
g_issues = df[["ID", "issues_array"]].explode("issues_array").rename(columns={"issues_array": "issue"})

# -------------------------
# Step 3: issue category
# -------------------------

# Lookup table mapping each issue label to its category code.
_ENV_ISSUES = [
    'Biodiversity loss', 'Deforestation', 'Fires',
    'Illegal Infrastructure', 'Infrastructure Damage',
    'Peatland Loss', 'Riparian Issues', 'Environmental Pollution',
]
_SOC_ISSUES = [
    'Corruption', 'Forced Labor and/or Child Labor',
    'Gender and Ethnic Disparities', 'Human Rights Violation',
    'Indigenous Peoples Conflict', 'Labor Rights Violations',
    'Land Dispute', 'Land Grabbing', 'Labor Disputes',
    'Limited Access to Services', 'Violence and/or Coercion',
    'Wage Dispute',
]

issue_category = pd.DataFrame(
    [(label, 'ENV') for label in _ENV_ISSUES] + [(label, 'SOC') for label in _SOC_ISSUES],
    columns=["issue", "category"],
)

# -------------------------
# Step 4: generate group_key
# -------------------------

def _attach_group_key(entities, entity_col):
    """Join an entity table to its grievances' issues and add ``group_key``.

    ``group_key`` is "<entity>_<category in lower case>", e.g. "10060_env".
    It is NaN when the entity value is missing or the issue label is not in
    ``issue_category`` (left join).
    """
    keyed = (
        entities
        .merge(g_issues, on="ID", how="inner")
        .merge(issue_category, on="issue", how="left")
    )
    keyed["group_key"] = keyed[entity_col] + "_" + keyed["category"].str.lower()
    return keyed


plot_issue = _attach_group_key(g_plots, "plot")
mill_issue = _attach_group_key(g_mills, "mill")

# -------------------------
# Step 5: union
# -------------------------

# Stack the plot-based and mill-based (group_key, ID) pairs into one table.
combined = pd.concat(
    [keyed[["group_key", "ID"]] for keyed in (plot_issue, mill_issue)],
    ignore_index=True,
)

# -------------------------
# Step 6: aggregation (match DuckDB exactly)
# -------------------------

def flat_unique(series):
    """Flatten an iterable of lists into one list of distinct items.

    Entries that are not lists are ignored. Items keep the order in which
    they are first seen.
    """
    merged = {}
    for cell in series:
        if not isinstance(cell, list):
            continue
        for item in cell:
            merged.setdefault(item, None)
    return list(merged)

def _sorted_distinct(ids):
    """LIST(DISTINCT ID ORDER BY ID): the distinct IDs in ascending order.

    IDs were loaded as strings, so the ordering is lexicographic.
    """
    return sorted(set(ids))


# Bring every grievance's arrays back in, then collapse to one row per group_key.
grouped = combined.merge(df, on="ID", how="left").groupby("group_key")

result = grouped.agg({
    "ID": _sorted_distinct,
    "plots_array": flat_unique,
    "mills_array": flat_unique,
    "source_array": flat_unique,
    "issues_array": flat_unique,
}).reset_index()

result = result.rename(columns={"group_key": "MHGID"})

# -------------------------
# Step 7: format output
# -------------------------

def fmt(x):
    """Render a list as a ", "-separated string; return anything else unchanged.

    An empty list becomes the empty string.
    """
    if not isinstance(x, list):
        return x
    return ", ".join(map(str, x))

# Turn every list column into a ", "-joined string for CSV export.
for list_col in ("ID", "plots_array", "mills_array", "source_array", "issues_array"):
    result[list_col] = result[list_col].map(fmt)

result = result.rename(columns={
    "ID": "grievance_IDs",
    "plots_array": "plots",
    "mills_array": "mills",
    "source_array": "sources",
    "issues_array": "issues",
})


def _count_ids(joined):
    """Number of comma-separated IDs in ``joined``; 0 for the empty string."""
    return len(joined.split(",")) if joined != "" else 0


# -------------------------
# NEW: grievance_count
# -------------------------
result["grievance_count"] = result["grievance_IDs"].map(_count_ids)

# NOTE(review): the final merge cell of this notebook loads "MHG_Newest_fixx.csv"
# (double "x"), which is not the name written here — confirm which name is intended.
result.to_csv("MHG_Newest_fix.csv", index=False)



# **B. GROUPING PLOT BY NAME AND AREA PROXIMITY**

In [None]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.14.3


# USE THIS

STEP 2. Check plot proximity (50 km)

This cell already combines Steps 1 and 2.

In [None]:
# ======================================================
# COMBINED STEP 1 & 2: NAME GROUPING + PROXIMITY REFINE
# ======================================================
import pandas as pd
import geopandas as gpd
from rapidfuzz import fuzz
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components
import re

# ======================================================
# CONFIG
# ======================================================
# Input table of plots/concessions (read with pandas).
INPUT_FILE = "Concessions-v2-Grid view (5).csv"
# GeoPackage holding one geometry per plot ID (read with geopandas).
GPKG_FILE = "plots_v2_20251129.gpkg"
# Destination CSV for the grouped plots.
OUTPUT_FILE = "Concessions-with-group-proximity.csv"

# Minimum rapidfuzz token_sort_ratio score for two names to share a group.
FUZZY_THRESHOLD = 90
# Two plots of the same name group are linked when their centroids are at most
# this many kilometres apart.
MAX_DIST_KM = 50

# Column names used for the plot name and plot ID in both input files.
name_col = "Name"
id_col = "ID"

# ======================================================
# HELPER FUNCTIONS
# ======================================================

def clean_id(x):
    """Normalise a raw ID cell to a canonical string.

    * Missing values (``None`` / NaN) become ``"UNKNOWN_ID"``.
    * Integer-looking values are returned as plain digits with every digit
      preserved (``"9033"``, ``9033`` -> ``"9033"``).
    * Float-looking values are truncated to an integer (``9033.0`` -> ``"9033"``).
    * Anything else is returned as its stripped string form.
    """
    if pd.isna(x):
        return "UNKNOWN_ID"

    text = str(x).strip()
    try:
        # Exact path: never round-trip integers through float, otherwise IDs
        # above 2**53 silently lose their low-order digits.
        return str(int(text))
    except ValueError:
        pass

    try:
        return str(int(float(x)))
    except (TypeError, ValueError, OverflowError):
        # Not numeric (or infinite): keep the text form unchanged.
        return text

def normalize_name(x):
    """Trim a plot name and collapse placeholder names to sentinels.

    Missing, blank, or "unknown" (any casing) names become ``"UNKNOWN"``;
    names starting with "no name" (any casing) become ``"NO_NAME"``;
    every other name is returned trimmed.
    """
    if pd.isna(x):
        return "UNKNOWN"
    text = str(x).strip()
    lowered = text.lower()
    if lowered in ("", "unknown"):
        return "UNKNOWN"
    if lowered.startswith("no name"):
        return "NO_NAME"
    return text

def word_difference(a, b):
    """Count the distinct words that occur in exactly one of the two names.

    Words are whitespace-separated and case-sensitive; word order and repeated
    words are ignored, so 0 means "same set of words".
    """
    return len(set(a.split()).symmetric_difference(b.split()))

def extract_base_name(name):
    """
    Return ``name`` without a trailing whitespace-separated number.

    'Palong 4' -> 'Palong'
    'Estate A 12' -> 'Estate A'
    'FELDA Bukit Jalor 1' -> 'FELDA Bukit Jalor'

    Only one trailing " <digits>" suffix is removed; a name without such a
    suffix is returned stripped but otherwise unchanged.
    """
    trailing_number = r'\s+\d+$'
    without_number = re.sub(trailing_number, '', name)
    return without_number.strip()

def dist_km(a, b):
    """Distance between two geometries in kilometres.

    ``a.distance(b)`` is divided by 1000, so the geometries are assumed to be
    in a CRS whose unit is the metre (the caller reprojects before calling).

    A missing geometry (``None``) — which happens for plots that found no
    match in the geometry file — is treated as infinitely far away, so a
    ``<= threshold`` test is simply false instead of raising AttributeError.
    """
    if a is None or b is None:
        return float("inf")
    return a.distance(b) / 1000.0

# ======================================================
# STEP 1: LOAD & NAME GROUPING
# ======================================================
df = pd.read_csv(INPUT_FILE)

# Canonicalise the two columns the grouping rules rely on.
df[id_col] = df[id_col].map(clean_id)
df[name_col] = df[name_col].map(normalize_name)

# Name with a trailing " <number>" removed, e.g. "Palong 4" -> "Palong".
df["base_name"] = df[name_col].map(extract_base_name)

# Grouping by name
# `groups` holds one dict per name group created so far
# ({"name", "base_name", "root_id"} of the first plot that opened the group).
# `group_ids` gets exactly one entry per row of `df`, in row order, and
# becomes the "GroupID" column ("<root plot ID>G").
#
# Every rule is first-match-wins over `groups`, so the result depends on
# the row order of the input file.
groups = []
group_ids = []

for idx, row in df.iterrows():
    name = row[name_col]
    base_name = row["base_name"]
    pid = row[id_col]

    # RULE 1: placeholder names (UNKNOWN / NO_NAME) always form their own
    # single-plot group and are never registered in `groups`.
    if name == "UNKNOWN" or name == "NO_NAME":
        group_ids.append(f"{pid}G")
        continue

    # RULE 2: "PT Perkebunan Nusantara ..." names are only grouped with another
    # such name made of the same set of words (word_difference == 0 ignores
    # word order and repeated words); fuzzy matching is never applied to them.
    if name.lower().startswith("pt perkebunan nusantara"):
        assigned = False
        for g in groups:
            if g["name"].lower().startswith("pt perkebunan nusantara"):
                if word_difference(name, g["name"]) == 0:
                    group_ids.append(f"{g['root_id']}G")
                    assigned = True
                    break
        if not assigned:
            groups.append({"name": name, "base_name": base_name, "root_id": pid})
            group_ids.append(f"{pid}G")
        continue

    # RULE 3: "Name + Number" pattern — checked before fuzzy matching.
    assigned = False

    for g in groups:
        # Defensive guard: Rule 1 never adds these names to `groups`.
        if g["name"] in ["UNKNOWN", "NO_NAME"]:
            continue
        if g["name"].lower().startswith("pt perkebunan nusantara"):
            continue

        # Applies only when the current name really ends in a number
        # (base_name != name) and its base equals the base of the group's root.
        if base_name == g["base_name"] and base_name != name:
            # Same base name (e.g. both "Palong") -> same group.
            group_ids.append(f"{g['root_id']}G")
            assigned = True
            break

    if assigned:
        continue

    # RULE 4: fuzzy grouping against each group's root name (only reached
    # when Rule 3 found no base-name match).
    for g in groups:
        # Defensive guard: Rule 1 never adds these names to `groups`.
        if g["name"] in ["UNKNOWN", "NO_NAME"]:
            continue
        if g["name"].lower().startswith("pt perkebunan nusantara"):
            continue

        sim = fuzz.token_sort_ratio(name, g["name"])
        if sim >= FUZZY_THRESHOLD:
            group_ids.append(f"{g['root_id']}G")
            assigned = True
            break

    # No rule matched: this plot opens a new group and becomes its root.
    if not assigned:
        groups.append({"name": name, "base_name": base_name, "root_id": pid})
        group_ids.append(f"{pid}G")

df["GroupID"] = group_ids

print(f"Step 1 complete: {len(groups)} name-based groups created")

# Spot check: show how every plot whose base name mentions "Palong" was grouped.
is_palong = df["base_name"].str.contains("Palong", na=False, case=False)
palong_df = df[is_palong]
if not palong_df.empty:
    print("\nPalong plots grouping:")
    print(palong_df[["ID", "Name", "base_name", "GroupID"]])

# ======================================================
# STEP 2: LOAD GEOMETRY & PROXIMITY REFINE
# ======================================================
gdf = gpd.read_file(GPKG_FILE)
# NOTE(review): the CSV IDs were normalised with clean_id(), the GeoPackage IDs
# only with astype(str). If the GeoPackage stores IDs as floats (e.g. 9033.0)
# the two will not match in the merge below — confirm the column's dtype.
gdf[id_col] = gdf[id_col].astype(str)

# Attach geometry to every plot. The left join keeps plots that have no
# geometry (their geometry is missing); an ID that occurs more than once in
# the GeoPackage duplicates the plot row.
merged = df.merge(gdf[[id_col, "geometry"]], on=id_col, how="left")
geo = gpd.GeoDataFrame(merged, geometry="geometry")

# Reproject to EPSG:3857 so centroid distances can be taken as metres.
# NOTE(review): Web-Mercator distances grow with distance from the equator;
# presumably acceptable for the region covered — confirm.
geo = geo.to_crs(3857)
geo["centroid"] = geo.geometry.centroid

# ======================================================
# PROXIMITY REFINE (ONLY FOR ELIGIBLE GROUPS)
# ======================================================
# Collects (row index, final group id) pairs for every row of `geo`.
refined_group = []

for group_id, sub in geo.groupby("GroupID"):

    # A name group with a single plot cannot be split: keep it as is.
    if len(sub) == 1:
        refined_group.append((sub.index[0], group_id))
        continue

    # A multi-row group made only of placeholder names: every plot becomes
    # its own group.
    names = sub[name_col].unique()
    if len(names) == 1 and names[0] in ["UNKNOWN", "NO_NAME"]:
        for idx, pid in zip(sub.index, sub[id_col]):
            refined_group.append((idx, f"{pid}G"))
        continue

    # Normal proximity refinement
    ids = sub[id_col].tolist()
    cents = sub["centroid"].tolist()
    indices = sub.index.tolist()
    n = len(ids)

    # Dense n x n adjacency matrix: 1 when two centroids are at most
    # MAX_DIST_KM apart (and on the diagonal), otherwise 0.
    adj = []
    for i in range(n):
        row = []
        for j in range(n):
            if i == j:
                row.append(1)
            else:
                d = dist_km(cents[i], cents[j])
                row.append(1 if d <= MAX_DIST_KM else 0)
        adj.append(row)

    # Connected components of that graph. Linking is transitive: two plots
    # further apart than MAX_DIST_KM still share a component when a chain of
    # nearby plots connects them.
    graph = csr_matrix(adj)
    n_components, labels = connected_components(graph, directed=False)

    if n_components == 1:
        # Everything is connected: keep the name-based GroupID.
        for idx in indices:
            refined_group.append((idx, group_id))
    else:
        # The name group splits: each component gets a group id derived from
        # the ID of its first member.
        for comp_id in range(n_components):
            comp_indices = [indices[i] for i in range(n) if labels[i] == comp_id]

            if len(comp_indices) == 1:
                idx = comp_indices[0]
                pid = geo.loc[idx, id_col]
                refined_group.append((idx, f"{pid}G"))
            else:
                first_id = geo.loc[comp_indices[0], id_col]
                new_group = f"{first_id}G"
                for idx in comp_indices:
                    refined_group.append((idx, new_group))

# Map each row index to its final group id and store it as a new column.
idx_map = {i: g for i, g in refined_group}
geo["GroupID_proximity"] = geo.index.map(idx_map)

print(f"\nStep 2 complete: Proximity refinement done")

# ======================================================
# STATISTICS & SAVE
# ======================================================
# The geometry and centroid columns are not wanted in the CSV export.
result = geo.drop(columns=["geometry", "centroid"])

# Number of plots in each final group.
group_sizes = result.groupby("GroupID_proximity").size()
print(f"\nFinal statistics:")
print(f"  Total plots: {len(result)}")
print(f"  Solo plots: {(group_sizes == 1).sum()}")
print(f"  Groups with 2+ plots: {(group_sizes > 1).sum()}")
print(f"  Largest group: {group_sizes.max()} plots")

# Spot check: final grouping of every plot whose base name mentions "Palong".
is_palong_final = result["base_name"].str.contains("Palong", na=False, case=False)
palong_result = result[is_palong_final]
if not palong_result.empty:
    print("\nPalong plots FINAL:")
    print(palong_result[["ID", "Name", "GroupID", "GroupID_proximity"]])

result.to_csv(OUTPUT_FILE, index=False)
print(f"\nSaved: {OUTPUT_FILE}")

Step 1 complete: 7119 name-based groups created

Palong plots grouping:
         ID                                               Name  \
4609   9033                        FGVPM Palong Timur 5 Estate   
4625   9049                                      Palong Estate   
5712  10136                                     Palong Timur 3   
5713  10137                        FGVPM Palong Timur 6 Estate   
5714  10138                                     Palong Timur 1   
5716  10140                                     Palong Timur 2   
5727  10151                                           Palong 3   
5949  10375                                           Palong 4   
5950  10376                                           Palong 6   
5951  10377                                           Palong 2   
5952  10378                                           Palong 1   
5954  10380  Lee Brothers Plantation & Realty (M) Sdn Bhd (...   
5965  10391                                    FGVPM Palong 17   
5966

  return ogr_read(



Step 2 complete: Proximity refinement done

Final statistics:
  Total plots: 10758
  Solo plots: 7835
  Groups with 2+ plots: 958
  Largest group: 54 plots

Palong plots FINAL:
         ID                                               Name GroupID  \
4609   9033                        FGVPM Palong Timur 5 Estate   9033G   
4625   9049                                      Palong Estate   9049G   
5712  10136                                     Palong Timur 3  10136G   
5713  10137                        FGVPM Palong Timur 6 Estate   9033G   
5714  10138                                     Palong Timur 1  10136G   
5716  10140                                     Palong Timur 2  10136G   
5727  10151                                           Palong 3  10151G   
5949  10375                                           Palong 4  10151G   
5950  10376                                           Palong 6  10151G   
5951  10377                                           Palong 2  10151G   
5952  10

# **C. COMBINE PLOT GROUP AND PLOT GRIEVANCES**

- Look up the group name and group ID for each plot and mill.

# FINAL MERGED WITH PLOT GROUP AND MILL GROUP LOOKUP.

In [None]:
import pandas as pd
import re

# ============================
# Load Files
# ============================
# All three tables are read as text with missing cells replaced by "".
# NOTE(review): the first cell of this notebook writes "MHG_Newest_fix.csv"
# (single "x"), not the name loaded here — confirm which name is intended.
mhg = pd.read_csv("MHG_Newest_fixx.csv", dtype=str).fillna("")
# Output of the name + proximity grouping cell.
plot = pd.read_csv("Concessions-with-group-proximity.csv", dtype=str).fillna("")
mill = pd.read_csv("Mills-Grid view (10).csv", dtype=str).fillna("")

# ============================
# Helper: Ambil angka sebelum "_"
# ============================
def extract_id(x):
    """Return the part of an MHGID that precedes the first underscore.

    Values starting with "PO100" are treated as mill IDs ("PO<digits>_...");
    everything else is treated as a plot ID ("<digits>_..."). Missing values
    give "", and a value that does not fit its pattern is returned unchanged.
    """
    if pd.isna(x):
        return ""
    # "PO100..." marks a mill ID; a plain number marks a plot ID.
    pattern = r"(PO\d+)_" if x.startswith("PO100") else r"(\d+)_"
    found = re.match(pattern, x)
    return found.group(1) if found else x

mhg["BaseID"] = mhg["MHGID"].map(extract_id)
# Mill rows are recognised purely by the "PO100" prefix of their MHGID.
mhg["is_mill"] = mhg["MHGID"].map(lambda mhgid: mhgid.startswith("PO100"))

# ============================
# Build mapping from PLOT file
# ============================
def _leading_digits(raw_id):
    """Digits at the start of ``raw_id``; ``raw_id`` itself if it has none."""
    found = re.match(r"(\d+)", raw_id)
    return found.group(1) if found else raw_id


plot["ID_clean"] = plot["ID"].map(_leading_digits)

# Lookups keyed by the cleaned plot ID (for a repeated ID the last row wins).
plots_by_clean_id = plot.set_index("ID_clean")
plot_group_map = plots_by_clean_id["GroupID_proximity"].to_dict()
plot_group_name_map = plots_by_clean_id["Group Name"].to_dict()
plot_group_airtable_map = plots_by_clean_id["GroupAirtableRecID"].to_dict()

# Reverse lookup: GroupID_proximity -> every plot ID in that group, in file order.
group_to_plots = {}
for grp, pid in zip(plot["GroupID_proximity"], plot["ID_clean"]):
    group_to_plots.setdefault(grp, []).append(pid)

# ============================
# Build mapping from MILL file
# ============================
def _leading_po_code(raw_id):
    """"PO<digits>" prefix of ``raw_id``; ``raw_id`` itself if it has none."""
    found = re.match(r"(PO\d+)", raw_id)
    return found.group(1) if found else raw_id


mill["UML_ID_clean"] = mill["UML_ID"].map(_leading_po_code)

# Lookups keyed by the cleaned mill ID (for a repeated ID the last row wins).
mills_by_clean_id = mill.set_index("UML_ID_clean")
mill_group_name_map = mills_by_clean_id["Group_Name"].to_dict()
mill_group_airtable_map = mills_by_clean_id["GroupAirtableRecID"].to_dict()

# ============================
# Determine GroupID_proximity per base ID
# ============================
def get_group_id(row):
    """Proximity group of the row's plot, or "" for mills and unknown plots.

    Reads ``row["is_mill"]`` and ``row["BaseID"]``; plots are looked up in
    ``plot_group_map``. Mills have no GroupID_proximity.
    """
    if not row["is_mill"]:
        return plot_group_map.get(row["BaseID"], "")
    return ""

mhg["GroupID_proximity"] = mhg.apply(get_group_id, axis=1)

# ============================
# Issue group from suffix
# ============================
# The issue group is the text between the first and second underscore of the
# MHGID (e.g. "env" in "10060_env"); "" when there is no underscore.
mhg["issue_group"] = mhg["MHGID"].map(
    lambda mhgid: mhgid.split("_")[1] if "_" in mhgid else ""
)

# ============================
# Build grouping dict
# ============================
# group key -> list of mhg row labels that will be merged into one output row
group_dict = {}

for idx, mhgid, base, grp, issue, is_mill in zip(
    mhg.index,
    mhg["MHGID"],
    mhg["BaseID"],
    mhg["GroupID_proximity"],
    mhg["issue_group"],
    mhg["is_mill"],
):
    if is_mill:
        # Mills are keyed by their full MHGID, so they are not pooled with other IDs.
        group_key = f"MILL-{mhgid}"
    elif grp != "":
        # Normal case: plots pool by proximity group and issue group.
        group_key = f"{grp}-{issue}"
    else:
        # Plot without a known proximity group: fall back to its own ID.
        group_key = f"{base}-{issue}"

    group_dict.setdefault(group_key, []).append(idx)

# ============================
# Merge Process
# ============================
# One dict per output row; `merge_count` counts the input rows absorbed
# into another row.
merged_rows = []
merge_count = 0

# Data columns whose values are pooled when rows merge (everything except
# the key/helper columns).
cols_to_merge = [
    c for c in mhg.columns
    if c not in ["MHGID", "BaseID", "GroupID_proximity", "issue_group", "is_mill"]
]

for group_key, indices in group_dict.items():

    # A single row in the group: nothing to merge, only add the lookup columns.
    if len(indices) == 1:
        row = mhg.loc[indices[0]].copy()
        base = row["BaseID"]
        is_mill = row["is_mill"]

        if is_mill:
            # Mill lookup
            row["Group Name"] = mill_group_name_map.get(base, "")
            row["GroupAirtableRecID"] = mill_group_airtable_map.get(base, "")
            row["Plot_group"] = ""  # Mills don't have plot groups
        else:
            # Plot lookup
            row["Group Name"] = plot_group_name_map.get(base, "")
            row["GroupAirtableRecID"] = plot_group_airtable_map.get(base, "")

            # Plot_group lists ALL plots of this GroupID_proximity, not only
            # the ones that have grievances. IDs are strings, so the sort is
            # lexicographic.
            grp = row["GroupID_proximity"]
            if grp and grp in group_to_plots:
                row["Plot_group"] = ", ".join(sorted(group_to_plots[grp]))
            else:
                row["Plot_group"] = base

        merged_rows.append(row.to_dict())
        continue

    # Several rows share the key: collapse them into one representative row.
    merge_count += (len(indices) - 1)
    # NOTE: this rebinds the name `df` used by earlier cells.
    df = mhg.loc[indices]

    # The first row supplies MHGID, BaseID and GroupID_proximity of the merged row.
    rep = df.iloc[0].copy()
    rep_base = rep["BaseID"]
    is_mill = rep["is_mill"]

    # Pool each data column: split every cell on ", ", take the sorted set of
    # non-empty pieces and join them again.
    # NOTE(review): a single value that itself contains ", " is split into
    # separate pieces here — confirm this is acceptable for the "sources" column.
    for col in cols_to_merge:
        vals = df[col].tolist()
        uniq = sorted(set(sum([
            v.split(", ") if ", " in v else [v] for v in vals
        ], [])))
        uniq = [v for v in uniq if v != ""]
        rep[col] = ", ".join(uniq)

    if is_mill:
        # Mill lookup
        rep["Group Name"] = mill_group_name_map.get(rep_base, "")
        rep["GroupAirtableRecID"] = mill_group_airtable_map.get(rep_base, "")
        rep["Plot_group"] = ""  # Mills don't have plot groups
    else:
        # Plot lookup (based on the representative row's plot ID)
        rep["Group Name"] = plot_group_name_map.get(rep_base, "")
        rep["GroupAirtableRecID"] = plot_group_airtable_map.get(rep_base, "")

        # Plot_group lists ALL plots of this GroupID_proximity.
        grp = rep["GroupID_proximity"]
        if grp and grp in group_to_plots:
            rep["Plot_group"] = ", ".join(sorted(group_to_plots[grp]))
        else:
            # Fallback: use the BaseIDs of the rows being merged.
            plot_ids = sorted(set(df["BaseID"].tolist()))
            rep["Plot_group"] = ", ".join(plot_ids)

    # The pooled grievance_count is meaningless, so recompute it from the
    # pooled grievance_IDs (stored as a string, like the other columns).
    if "grievance_IDs" in rep:
        gid_list = rep["grievance_IDs"].split(", ")
        gid_list = [x for x in gid_list if x != ""]
        rep["grievance_count"] = str(len(set(gid_list)))

    merged_rows.append(rep.to_dict())

# ============================
# Build final DF
# ============================
# Assemble the output table and drop the helper columns used only for merging.
result = pd.DataFrame(merged_rows).drop(
    columns=["BaseID", "issue_group", "is_mill"], errors="ignore"
)

print("Jumlah sebelum merge :", len(mhg))
print("Jumlah sesudah merge :", len(result))
print("Total rows merged    :", merge_count)

# Mill rows are the ones whose MHGID starts with "PO100"; the rest are plots.
is_mill_row = result["MHGID"].str.startswith("PO100", na=False)

# Show sample of mills
mill_results = result[is_mill_row]
if not mill_results.empty:
    print("\nSample Mill results:")
    print(mill_results[["MHGID", "Group Name", "GroupAirtableRecID"]].head())

# Show sample of plots with expanded plot_group
plot_results = result[~is_mill_row]
if not plot_results.empty:
    print("\nSample Plot results with expanded Plot_group:")
    print(plot_results[["MHGID", "Plot_group", "Group Name"]].head())

# ============================
# Export
# ============================
result.to_csv("MHG_Merged_Final.csv", index=False)
print("\nFile saved as MHG_Merged_Final.csv")

Jumlah sebelum merge : 1293
Jumlah sesudah merge : 1053
Total rows merged    : 240

Sample Mill results:
                MHGID       Group Name GroupAirtableRecID
716  PO1000000017_soc        AGROPALMA  recNkyyeESIxx6zeL
717  PO1000000099_env            SIPEF  rec440HVR0OrBALic
718  PO1000000099_soc            SIPEF  rec440HVR0OrBALic
719  PO1000000109_env  IOI CORPORATION  recJ3cFKfD2UMrzZg
720  PO1000000109_soc  IOI CORPORATION  recJ3cFKfD2UMrzZg

Sample Plot results with expanded Plot_group:
       MHGID           Plot_group               Group Name
0  10060_env                10060          YPJ PLANTATIONS
1  10136_env  10136, 10138, 10140                    FELDA
2  10137_env          10137, 9033                      FGV
3  10289_env                10289  Ladang Rakyat Trengganu
4  10681_env                10681             Puncak Niaga

File saved as MHG_Merged_Final.csv
