In [19]:
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

n_samples = 5000
n_features = 7  
n_classes = 3
minority_class_proportion = 0.2 

X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=5,  
    n_redundant=0,
    n_classes=n_classes,
    n_clusters_per_class=2,
    random_state=42
)

X[:, 0] = np.random.randint(0, 1000, size=n_samples)  # students_passed
X[:, 1] = np.random.randint(10, 100, size=n_samples)  # num_teachers
X[:, 2] = np.random.randint(500, 3000, size=n_samples)  # total_students
X[:, 3] = np.random.randint(5000, 20000, size=n_samples)  # Total_budget
X[:, 4] = np.random.randint(0, 30, size=n_samples)  # dropout_rate
X[:, 5] = np.random.randint(20, 60, size=n_samples)  # avg_class_size
X[:, 6] = np.random.randint(1, 10, size=n_samples)  # school_distance (in km)


class_0_indices = np.where(y == 0)[0]
class_1_indices = np.where(y == 1)[0]
class_2_indices = np.where(y == 2)[0]

minority_class_count = int(minority_class_proportion * n_samples)
class_0_indices = np.random.choice(class_0_indices, size=minority_class_count, replace=False)

balanced_indices = np.concatenate((class_0_indices, class_1_indices, class_2_indices))
X = X[balanced_indices]
y = y[balanced_indices]

label_noise_fraction = 0.05
num_noisy_labels = int(label_noise_fraction * len(y))
noisy_indices = np.random.choice(len(y), num_noisy_labels, replace=False)
y[noisy_indices] = np.random.choice(n_classes, size=num_noisy_labels)

df = pd.DataFrame(X, columns=[
    "students_passed", "num_teachers", "total_students",
    "Total_budget", "dropout_rate", "avg_class_size", "school_distance"
])
df = df.astype(int)
district_mapping = {0: "Kicukiro", 1: "Gasabo", 2: "Nyarugenge"}
df['district_label'] = y
df['district_label'] = df['district_label'].map(district_mapping)

df.to_csv("../data/district_classification_dataset_v4.csv", index=False)

print(df.head(50))


    students_passed  num_teachers  total_students  Total_budget  dropout_rate  \
0               802            44             925          7190            15   
1               211            87            1543         12076             2   
2               753            13            1059         10727            23   
3               700            26            2372         17893             2   
4                 8            73            1475         13998            10   
5               599            15            1924         11474             0   
6               892            70             638          9486            22   
7               317            97            2304         11368            22   
8               931            66             905         14777            26   
9               891            10            1122         19971            17   
10              974            11            2077          7700             7   
11              286         

In [35]:
from sklearn.datasets import make_classification
import pandas as pd
import numpy as np

n_samples = 10000
n_features = 7
n_classes = 3
label_noise_fraction = 0.01  

# Create a classification dataset with clear separation between classes
X, y = make_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_informative=5,  # Use 5 informative features
    n_redundant=0,    # No redundant features
    n_classes=n_classes,
    n_clusters_per_class=1,  # One cluster per class
    class_sep=10.0,           # Increase the class separation
    random_state=42
)

# Modify features to ensure they are distinct across classes
X[:, 0] = np.random.randint(200, 1000, size=n_samples)  # students_passed (int)
X[:, 1] = np.random.randint(10, 100, size=n_samples)  # num_teachers (int)
X[:, 2] = np.random.randint(500, 3000, size=n_samples)  # total_students (int)
X[:, 3] = np.random.randint(5000, 20000, size=n_samples)  # Total_budget (int)

# Calculate dropout_rate as a float between 0 and 0.3
dropout_rate = np.random.uniform(0.05, 0.3, size=n_samples)  # Random floats between 0.05 and 0.3
X[:, 4] = np.round(dropout_rate, 2)  # Ensure two decimal places

# Calculate avg_class_size and ensure it stays as a float
X[:, 5] = np.round(X[:, 2] / X[:, 1], 2)  # avg_class_size (float)

# Add school_distance as an integer
X[:, 6] = np.random.randint(1, 10, size=n_samples)  # school_distance (int)

# Create balanced classes by selecting equal numbers of samples for each class
class_0_indices = np.where(y == 0)[0]
class_1_indices = np.where(y == 1)[0]
class_2_indices = np.where(y == 2)[0]

class_counts = [len(class_0_indices), len(class_1_indices), len(class_2_indices)]
min_class_count = min(class_counts)

class_0_indices = np.random.choice(class_0_indices, size=min_class_count, replace=False)
class_1_indices = np.random.choice(class_1_indices, size=min_class_count, replace=False)
class_2_indices = np.random.choice(class_2_indices, size=min_class_count, replace=False)

balanced_indices = np.concatenate((class_0_indices, class_1_indices, class_2_indices))
X = X[balanced_indices]
y = y[balanced_indices]

# Introduce noise in labels as specified
num_noisy_labels = int(label_noise_fraction * len(y))
noisy_indices = np.random.choice(len(y), num_noisy_labels, replace=False)
y[noisy_indices] = np.random.choice(n_classes, size=num_noisy_labels)

# Create a DataFrame for better visualization
df = pd.DataFrame(X, columns=[
    "students_passed", "num_teachers", "total_students",
    "Total_budget", "dropout_rate", "avg_class_size", "school_distance"
])

# Convert specific columns to int and keep others as float
int_columns = ["students_passed", "num_teachers", "total_students", "Total_budget", "school_distance"]
float_columns = ["dropout_rate", "avg_class_size"]

df[int_columns] = df[int_columns].astype(int)
df[float_columns] = df[float_columns].astype(float)

# Map class labels to district names
district_mapping = {0: "Kicukiro", 1: "Gasabo", 2: "Nyarugenge"}
df['district_label'] = y
df['district_label'] = df['district_label'].map(district_mapping)

# Save the dataset to CSV
df.to_csv("../data/district_classification_dataset_v5.csv", index=False)

print(df.head(10))


   students_passed  num_teachers  total_students  Total_budget  dropout_rate  \
0              354            77             757         10531          0.22   
1              611            42            2630          9088          0.15   
2              471            16            2295         18900          0.17   
3              762            21            1995          8804          0.05   
4              927            93            1440          9201          0.27   
5              741            37            2138          6995          0.15   
6              565            33             663         17132          0.26   
7              795            22            2553         13191          0.17   
8              959            33             772          7423          0.18   
9              578            34            1493          8877          0.08   

   avg_class_size  school_distance district_label  
0            9.83                1       Kicukiro  
1           62.