In [2]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that files
# stored on Drive can be read and written through the local filesystem.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
# Switch the working directory to the MLP folder on the mounted Drive;
# relative paths used later in the notebook resolve against this directory.
%cd /content/gdrive/MyDrive/MLP

/content/gdrive/MyDrive/MLP


In [None]:
import pandas as pd
import numpy as np

# Generate a synthetic South African school-dropout dataset, then deliberately
# degrade it with missing values, outliers and label inconsistencies so that it
# can be used to practise data cleaning. The result is written to
# 'school_dropout_sa_with_noise.csv' in the current working directory.

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

# Number of learners
n = 2000

# Fraction of rows blanked out in each column that receives missing values
MISSING_FRAC = 0.07

# Generate synthetic data
age = np.random.randint(12, 20, n)  # ages 12 to 19 (upper bound is exclusive)
gender = np.random.choice(["Male", "Female"], n)
province = np.random.choice(
    [
        "Gauteng",
        "KwaZulu-Natal",
        "Western Cape",
        "Eastern Cape",
        "Limpopo",
        "Mpumalanga",
        "North West",
        "Northern Cape",
        "Free State",
    ],
    n,
)
urban_rural = np.random.choice(["Urban", "Rural"], n, p=[0.6, 0.4])
# Normally distributed income (mean 8000, sd 4000), floored at 0
household_income = np.random.normal(8000, 4000, n).clip(0)  # rand income
parent_education = np.random.choice(
    ["None", "Primary", "Secondary", "Tertiary"], n, p=[0.1, 0.3, 0.4, 0.2]
)
attendance = np.random.choice(["Yes", "No"], n, p=[0.85, 0.15])
# randint's upper bound is exclusive: attendees get grades 1-11, non-attendees 1-9.
# NOTE(review): grade 12 is therefore never generated — confirm this is intended.
highest_grade = np.where(
    attendance == "Yes", np.random.randint(1, 12, n), np.random.randint(1, 10, n)
)

# Define dropout label: dropout = not attending school but age < 18.
# The label is derived from the clean values, *before* any noise is injected
# below, so later NaNs/outliers in 'age' and 'attendance' do not alter it.
dropout = np.where((attendance == "No") & (age < 18), 1, 0)

# Build DataFrame
df = pd.DataFrame(
    {
        "age": age,
        "gender": gender,
        "province": province,
        "urban_rural": urban_rural,
        "household_income": household_income.round(2),
        "parent_education": parent_education,
        "attendance": attendance,
        "highest_grade": highest_grade,
        "dropout": dropout,
    }
)

# Introduce missing values (about 7% per column).
# Each column gets its own seed: reusing a single random_state makes df.sample
# return the identical rows on every iteration, so the same learners would be
# missing all three fields at once instead of ~7% independently per column.
# Passing an explicit random_state leaves the global NumPy stream untouched,
# so the outlier draws below are unaffected by this step.
for seed_offset, col in enumerate(
    ["household_income", "parent_education", "attendance"]
):
    missing_idx = df.sample(
        frac=MISSING_FRAC, random_state=SEED + seed_offset
    ).index
    df.loc[missing_idx, col] = np.nan

# Add noise / outliers
# 1. Extreme household incomes (too high or negative)
outlier_idx = np.random.choice(
    df.index, size=int(0.02 * n), replace=False
)  # 2% outliers
df.loc[outlier_idx, "household_income"] = np.random.choice(
    [1e6, -5000], size=len(outlier_idx)
)

# 2. Unrealistic ages (too young or too old for school)
age_outliers = np.random.choice(
    df.index, size=int(0.01 * n), replace=False
)  # 1% outliers
df.loc[age_outliers, "age"] = np.random.choice([5, 30], size=len(age_outliers))

# 3. Attendance inconsistencies (e.g., dropout=1 but marked as 'Yes')
attend_outliers = np.random.choice(df.index, size=int(0.01 * n), replace=False)
df.loc[attend_outliers, "attendance"] = "Yes"
df.loc[attend_outliers, "dropout"] = 1  # force inconsistency

# Save CSV (relative path: lands in the current working directory)
df.to_csv("school_dropout_sa_with_noise.csv", index=False)

print(
    "Dataset generated with missing values + noise/outliers saved as 'school_dropout_sa_with_noise.csv'"
)
print(df.describe(include="all"))
print("\nMissing values per column:")
print(df.isnull().sum())

Dataset generated with missing values + noise/outliers saved as 'school_dropout_sa_with_noise.csv'
                age  gender       province urban_rural  household_income  \
count   2000.000000    2000           2000        2000       1861.000000   
unique          NaN       2              9           2               NaN   
top             NaN  Female  Northern Cape       Urban               NaN   
freq            NaN    1023            238        1227               NaN   
mean      15.414500     NaN            NaN         NaN      18924.217157   
std        2.563782     NaN            NaN         NaN     104920.226000   
min        5.000000     NaN            NaN         NaN      -5000.000000   
25%       13.000000     NaN            NaN         NaN       5066.930000   
50%       15.000000     NaN            NaN         NaN       7838.990000   
75%       17.000000     NaN            NaN         NaN      10568.100000   
max       30.000000     NaN            NaN         NaN    1000000

In [3]:
# Preview the first rows of the generated DataFrame (head() defaults to 5 rows)
df.head()

Unnamed: 0,age,gender,province,urban_rural,household_income,parent_education,attendance,highest_grade,dropout
0,18,Male,North West,Urban,8931.49,Secondary,Yes,2,0
1,15,Female,Western Cape,Urban,-5000.0,Primary,Yes,9,0
2,5,Female,North West,Rural,4164.1,Secondary,Yes,8,0
3,18,Female,Northern Cape,Urban,11491.67,Primary,No,7,0
4,14,Female,Mpumalanga,Rural,13799.41,Secondary,Yes,2,0
