In [12]:
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

# Build a synthetic, class-imbalanced "district classification" dataset and
# write it to CSV.
#
# Steps:
#   1. Generate labels (and placeholder features) with make_classification.
#   2. Overwrite every feature column with uniformly drawn integers.
#   3. Down-sample class 0 so the classes are imbalanced.
#   4. Re-draw a small fraction of the labels as noise.
#   5. Assemble a DataFrame, map the labels to district names and save it.
#
# All randomness goes through a single seeded generator so that
# Restart Kernel -> Run All reproduces the same CSV every time.
SEED = 42
rng = np.random.default_rng(SEED)

n_samples = 3000
n_features = 7
n_classes = 3
# Fraction of the ORIGINAL n_samples kept for class 0. Because classes 1 and 2
# are kept whole, class 0's share of the final dataset is larger than this.
minority_class_proportion = 0.2

X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=4,
    n_redundant=2,
    n_classes=n_classes,
    n_clusters_per_class=2,
    random_state=SEED,
)

# Column name -> (low, high) integer range; `high` is exclusive.
# Column order here is the column order of X and of the final DataFrame.
feature_ranges = {
    "students_passed": (0, 1000),
    "num_teachers": (10, 100),
    "total_students": (500, 3000),
    "Total_budget": (5000, 20000),
    "dropout_rate": (0, 30),
    "avg_class_size": (20, 60),
    "school_distance": (1, 10),  # in km
}
assert len(feature_ranges) == n_features, "feature_ranges must describe every column of X"

# NOTE(review): every column produced by make_classification is replaced here,
# so the feature values are drawn independently of `y`. Only the labels from
# make_classification survive. Confirm this is intended.
for column_index, (low, high) in enumerate(feature_ranges.values()):
    X[:, column_index] = rng.integers(low, high, size=n_samples)

class_0_indices = np.where(y == 0)[0]
class_1_indices = np.where(y == 1)[0]
class_2_indices = np.where(y == 2)[0]

# Down-sample class 0. The count is clamped to the rows actually available,
# because sampling without replacement raises if more rows are requested than exist.
minority_class_count = min(
    int(minority_class_proportion * n_samples),
    len(class_0_indices),
)
class_0_indices = rng.choice(class_0_indices, size=minority_class_count, replace=False)

# Despite the name, the resulting selection is deliberately imbalanced:
# a reduced class 0 plus all of classes 1 and 2.
balanced_indices = np.concatenate((class_0_indices, class_1_indices, class_2_indices))
X = X[balanced_indices]
y = y[balanced_indices]

# Label noise: re-draw the label of a random 5% of the rows. The new label is
# drawn from all classes, so it may equal the old one; the share of labels that
# actually change is therefore somewhat below label_noise_fraction.
label_noise_fraction = 0.05
num_noisy_labels = int(label_noise_fraction * len(y))
noisy_indices = rng.choice(len(y), num_noisy_labels, replace=False)
y[noisy_indices] = rng.choice(n_classes, size=num_noisy_labels)

# X is a float array holding whole numbers; cast so the CSV contains integers.
df = pd.DataFrame(X, columns=list(feature_ranges))
df = df.astype(int)
district_mapping = {0: "Kicukiro", 1: "Gasabo", 2: "Nyarugenge"}
df['district_label'] = y
df['district_label'] = df['district_label'].map(district_mapping)

df.to_csv("district_classification_dataset_v4.csv", index=False)

print(df.head(50))


    students_passed  num_teachers  total_students  Total_budget  dropout_rate  \
0               449            31             690         18475            12   
1               513            64            2529          5130             8   
2               993            17             547          6902             7   
3               597            29            2866          7367            21   
4               202            39            2157         14033            10   
5               772            20             857         19874            21   
6               235            71             889         17088            17   
7               351            31            2679         10432            22   
8               890            98            1991          9394             8   
9               997            20             960         19565            29   
10              263            59            1284          9469            10   
11              254         