In [None]:
حتما! اینجا یک کد کامل پایتون هست که:

از فایل JSON ورودی، گراف رو می‌خونه (شامل نودها و یال‌ها)

ویژگی‌ها و برچسب‌ها رو از داده‌ها استخراج می‌کنه

با GraphSMOTE نمونه‌های جدید تولید می‌کنه

نودها و یال‌های اصلی و مصنوعی رو توی یه فایل JSON خروجی می‌نویسه



In [19]:
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors

class GraphSMOTE:
    def __init__(self, k=5):
        self.k = k

    def fit_resample(self, x, y, edge_index):
        unique_classes, counts = np.unique(y, return_counts=True)
        max_count = counts.max()

        target_count = int(max_count * 0.1)  # 40% of majority class

        x_res = x.copy()
        y_res = y.copy()
        edge_res = edge_index.copy()

        synthetic_samples = []
        synthetic_labels = []

        for cls, count in zip(unique_classes, counts):
            if count >= target_count:
                # Enough samples, skip
                continue

            idx_cls = np.where(y == cls)[0]
            x_minority = x[idx_cls]

            k_neighbors = min(self.k, len(x_minority) - 1)
            if k_neighbors < 1:
                print(f"Warning: minority class {cls} has {len(x_minority)} sample(s), skipping synthetic generation.")
                continue

            nn = NearestNeighbors(n_neighbors=k_neighbors + 1).fit(x_minority)
            neighbors = nn.kneighbors(x_minority, return_distance=False)

            n_synth_needed = target_count - count
            n_synth_per_sample = max(1, n_synth_needed // count)

            for i in range(len(x_minority)):
                for _ in range(n_synth_per_sample):
                    nn_idx = np.random.choice(neighbors[i][1:])
                    diff = x_minority[nn_idx] - x_minority[i]
                    gap = np.random.rand()
                    synthetic = x_minority[i] + gap * diff
                    synthetic_samples.append(synthetic)
                    synthetic_labels.append(cls)
                    if len(synthetic_samples) >= n_synth_needed:
                        break
                if len(synthetic_samples) >= n_synth_needed:
                    break

        if synthetic_samples:
            x_synth = np.vstack(synthetic_samples)
            y_synth = np.array(synthetic_labels)
            x_new = np.vstack([x_res, x_synth])
            y_new = np.hstack([y_res, y_synth])
        else:
            x_new, y_new = x_res, y_res

        return x_new, y_new, edge_res

def load_graph_from_json(filename):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    nodes = data["nodes"]
    edges = data["edges"]

    # کلاس در اندیس 3
    node_classes = [node[3] for node in nodes]
    unique_classes = list(sorted(set(node_classes)))
    class_to_int = {cls: i for i, cls in enumerate(unique_classes)}
    y = np.array([class_to_int[cls] for cls in node_classes])

    # ویژگی‌ها از اندیس 1 و 2
    def safe_float(val):
        try:
            return float(val)
        except:
            return 0.0

    x = np.array([[safe_float(node[1]), safe_float(node[2])] for node in nodes])

    edge_index = np.array(edges).T

    return x, y, edge_index, nodes, edges, class_to_int

def save_graph_with_synthetic(x_orig, y_orig, edge_orig, x_res, y_res, edge_res, nodes_orig, edges_orig, class_to_int, filename):
    n_orig = x_orig.shape[0]
    n_res = x_res.shape[0]
    int_to_class = {v: k for k, v in class_to_int.items()}

    nodes_synth = []
    for i in range(n_orig, n_res):
        idx = i
        node_class = int_to_class[y_res[i]]
        val1 = str(x_res[i, 0])
        val2 = str(x_res[i, 1])
        nodes_synth.append([idx, val1, val2, node_class])

    all_nodes = nodes_orig + nodes_synth
    all_edges = edges_orig  # اگر یال مصنوعی داری باید اضافه کنی اینجا

    out_data = {
        "nodes": all_nodes,
        "edges": all_edges
    }

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(out_data, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(all_nodes)} nodes and {len(all_edges)} edges to {filename}")

if __name__ == "__main__":
    input_file = "C:/Users/Leila/datasetpy/buggy/youtube-dl_4_jsinterp.json"
    output_file = "C:/Users/Leila/datasetpy/buggy/output_graph_with_synthetic.json"

    print("Loading graph...")
    x, y, edge_index, nodes, edges, class_to_int = load_graph_from_json(input_file)

    unique_classes = list(class_to_int.keys())
    counts = {cls: np.sum(y == idx) for cls, idx in class_to_int.items()}

    print("Classes found:", unique_classes)
    print("Number of samples per class:")
    for cls in unique_classes:
        print(f"  Class {class_to_int[cls]} ({cls}): {counts[cls]} samples")

    print("Running GraphSMOTE...")
    smote = GraphSMOTE(k=5)
    x_res, y_res, edge_res = smote.fit_resample(x, y, edge_index)

    n_synth = x_res.shape[0] - x.shape[0]
    print(f"Number of synthetic nodes generated: {n_synth}")

    print("Saving new graph with synthetic samples...")
    save_graph_with_synthetic(x, y, edge_index, x_res, y_res, edge_res, nodes, edges, class_to_int, output_file)


Loading graph...
Classes found: ['bug-free', 'insert-node', 'update']
Number of samples per class:
  Class 0 (bug-free): 2926 samples
  Class 1 (insert-node): 16 samples
  Class 2 (update): 2 samples
Running GraphSMOTE...
Number of synthetic nodes generated: 290
Saving new graph with synthetic samples...
Wrote 3234 nodes and 8554 edges to C:/Users/Leila/datasetpy/buggy/output_graph_with_synthetic.json


In [23]:
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors

class GraphSMOTE:
    def __init__(self, k=5):
        self.k = k

    def fit_resample(self, x, y, edge_index):
        unique_classes, counts = np.unique(y, return_counts=True)
        max_count = counts.max()

        target_count = int(max_count * 0.3)  # 40% of majority class

        x_res = x.copy()
        y_res = y.copy()
        edge_res = edge_index.copy()

        synthetic_samples = []
        synthetic_labels = []

        for cls, count in zip(unique_classes, counts):
            if count >= target_count:
                # Enough samples, skip
                continue

            idx_cls = np.where(y == cls)[0]
            x_minority = x[idx_cls]

            k_neighbors = min(self.k, len(x_minority) - 1)
            if k_neighbors < 1:
                print(f"Warning: minority class {cls} has {len(x_minority)} sample(s), skipping synthetic generation.")
                continue

            nn = NearestNeighbors(n_neighbors=k_neighbors + 1).fit(x_minority)
            neighbors = nn.kneighbors(x_minority, return_distance=False)

            n_synth_needed = target_count - count
            n_synth_per_sample = max(1, n_synth_needed // count)

            for i in range(len(x_minority)):
                for _ in range(n_synth_per_sample):
                    nn_idx = np.random.choice(neighbors[i][1:])
                    diff = x_minority[nn_idx] - x_minority[i]
                    gap = np.random.rand()
                    synthetic = x_minority[i] + gap * diff
                    synthetic_samples.append(synthetic)
                    synthetic_labels.append(cls)
                    if len(synthetic_samples) >= n_synth_needed:
                        break
                if len(synthetic_samples) >= n_synth_needed:
                    break

        if synthetic_samples:
            x_synth = np.vstack(synthetic_samples)
            y_synth = np.array(synthetic_labels)
            x_new = np.vstack([x_res, x_synth])
            y_new = np.hstack([y_res, y_synth])
        else:
            x_new, y_new = x_res, y_res

        return x_new, y_new, edge_res

def load_graph_from_json(filename):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    nodes = data["nodes"]
    edges = data["edges"]

    # استخراج نگاشت type و value از داده‌های متنی
    type_map = {}
    value_map = {}

    def encode_categorical(val, mapping):
        if val not in mapping:
            mapping[val] = len(mapping)
        return mapping[val]

    # کلاس‌ها را استخراج و نگاشت کن
    node_classes = [node[3] for node in nodes]
    unique_classes = list(sorted(set(node_classes)))
    class_to_int = {cls: i for i, cls in enumerate(unique_classes)}

    y = np.array([class_to_int[cls] for cls in node_classes])

    # ویژگی‌های type و value را به عدد نگاشت کن
    x = []
    for node in nodes:
        type_feat = encode_categorical(node[1], type_map)
        val_feat = encode_categorical(node[2], value_map)
        x.append([type_feat, val_feat])
    x = np.array(x)

    edge_index = np.array(edges).T

    return x, y, edge_index, nodes, edges, class_to_int, type_map, value_map

def save_graph_with_synthetic(x_orig, y_orig, edge_orig, x_res, y_res, edge_res, nodes_orig, edges_orig, class_to_int, type_map, value_map, filename):
    n_orig = x_orig.shape[0]
    n_res = x_res.shape[0]

    int_to_class = {v: k for k, v in class_to_int.items()}
    inv_type_map = {v: k for k, v in type_map.items()}
    inv_value_map = {v: k for k, v in value_map.items()}

    nodes_synth = []
    for i in range(n_orig, n_res):
        node_class = int_to_class[y_res[i]]

        synthetic_type_num = int(round(x_res[i, 0]))
        synthetic_value_num = int(round(x_res[i, 1]))

        synthetic_type_str = inv_type_map.get(synthetic_type_num, "Unknown")
        synthetic_value_str = inv_value_map.get(synthetic_value_num, "Unknown")

        nodes_synth.append([i, synthetic_type_str, synthetic_value_str, node_class])

    all_nodes = nodes_orig + nodes_synth
    all_edges = edges_orig  # اگر یال مصنوعی داری اضافه کن

    out_data = {
        "nodes": all_nodes,
        "edges": all_edges
    }

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(out_data, f, indent=2, ensure_ascii=False)

    print(f"Wrote {len(all_nodes)} nodes and {len(all_edges)} edges to {filename}")

if __name__ == "__main__":
    input_file = "C:/Users/Leila/datasetpy/buggy/youtube-dl_4_jsinterp.json"
    output_file = "C:/Users/Leila/datasetpy/buggy/output_graph_with_synthetic.json"

    print("Loading graph...")
    x, y, edge_index, nodes, edges, class_to_int, type_map, value_map = load_graph_from_json(input_file)

    unique_classes = list(class_to_int.keys())
    counts = {cls: np.sum(y == idx) for cls, idx in class_to_int.items()}

    print("Classes found:", unique_classes)
    print("Number of samples per class:")
    for cls in unique_classes:
        print(f"  Class {class_to_int[cls]} ({cls}): {counts[cls]} samples")

    print("Running GraphSMOTE...")
    smote = GraphSMOTE(k=5)
    x_res, y_res, edge_res = smote.fit_resample(x, y, edge_index)

    n_synth = x_res.shape[0] - x.shape[0]
    print(f"Number of synthetic nodes generated: {n_synth}")

    print("Saving new graph with synthetic samples...")
    save_graph_with_synthetic(x, y, edge_index, x_res, y_res, edge_res, nodes, edges, class_to_int, type_map, value_map, output_file)


Loading graph...
Classes found: ['bug-free', 'insert-node', 'update']
Number of samples per class:
  Class 0 (bug-free): 2926 samples
  Class 1 (insert-node): 16 samples
  Class 2 (update): 2 samples
Running GraphSMOTE...
Number of synthetic nodes generated: 875
Saving new graph with synthetic samples...
Wrote 3819 nodes and 8554 edges to C:/Users/Leila/datasetpy/buggy/output_graph_with_synthetic.json
