In [94]:
from mirror.nodes import *
from mirror.edges import *
from mirror.generator import Mirror
import pandas as pd

# DAG that is modeled below

- It simulates a dataset with 11 columns (10 features and 1 label):
    - sex with values of 'M' and 'F'.
    - diversity with values of 'W', 'B', 'A', 'H', 'I', and 'O'.
    - legacy with values of 'Y' and 'N'.
    - SAT with values from a Gaussian distribution N(1500,300^2).
    - GPA with values from a Gaussian distribution N(2.7,0.6^2).
    - Extracurricular with uniform integer values in [0, 10).
    - TOEFL with values from a Gaussian distribution N(90,10^2).
    - APs with integer values in [0, 38). Note that the value has 0.5 probability of falling in [0,5),
      0.4 probability of falling in [5,10), and 0.1 probability of falling in [10,38). Within each
      range/bucket, integer values are uniform.
    - letters with values from a Uniform distribution U(0,1).
    - subjectTest with uniform integer values in [0, 5).
    - admission with values of 'Y' and 'N'.
- The relationships among the above columns are:
    - Sex affects SAT and GPA.
    - Diversity affects TOEFL.
    - admission is determined by sex, diversity, legacy, SAT, GPA, extracurricular, TOEFL, APs, letters, and 
      subjectTest.

![DAG](dag_admission.png)

In [95]:
# size of the data: every node below is created with sample_n=total_n
total_n = 30000

# --- demographic feature nodes (categorical: value -> probability) ---
sex_probs = {"M": 0.5, "F": 0.5}
diversity_probs = {"W": 0.4, "B": 0.1, "A": 0.2, "H": 0.1, "I": 0.05, "O": 0.15}
legacy_probs = {"Y": 0.2, "N": 0.8}

node_sex = CategoricalNode("sex", sex_probs, sample_n=total_n)
node_diversity = CategoricalNode("diversity", diversity_probs, sample_n=total_n)
node_legacy = CategoricalNode("legacy", legacy_probs, sample_n=total_n)

# --- application feature nodes ---
# Gaussian nodes are parameterised by mean (miu) and variance (var),
# hence the squared standard deviations.
node_sat = GaussianNode("SAT", miu=1500, var=300**2, sample_n=total_n)
node_gpa = GaussianNode("GPA", miu=2.7, var=0.6**2, sample_n=total_n)
node_extracurricular = OrdinalGlobalNode("extracurricular", min=0, max=10, sample_n=total_n)
node_toefl = GaussianNode("TOEFL", miu=90, var=10**2, sample_n=total_n)

# AP counts are bucketed: consecutive "bound" values delimit the buckets
# [0,5), [5,10), [10,38) and "probability" gives the weight of each bucket.
ap_buckets = {"bound": [0, 5, 10, 38], "probability": [0.5, 0.4, 0.1]}
node_numAPs = OrdinalLocalNode("AP", ap_buckets, sample_n=total_n)

node_letters = UniformNode("letterScore", min=0, max=1, sample_n=total_n)
node_subjectTest = OrdinalGlobalNode("subject", min=0, max=5, sample_n=total_n)

# --- label node: admission result ---
admission_probs = {"Y": 0.5, "N": 0.5}
node_admission = CategoricalNode("admission", admission_probs, sample_n=total_n)

In [96]:
# initialize edges in DAG

# --- categorical parent -> numeric child ---
# Each parent category maps to a spec for the child column
# (presumably [distribution name, mean, variance] -- confirm against mirror.edges.CtoN).
sat_given_sex = {
    "M": ["Gaussian", 1000, 300**2],
    "F": ["Gaussian", 1200, 300**2],
}
gpa_given_sex = {
    "M": ["Gaussian", 2.3, 0.6**2],
    "F": ["Gaussian", 2.7, 0.6**2],
}
toefl_given_diversity = {
    "W": ["Gaussian", 80, 20**2],
    "B": ["Gaussian", 90, 5**2],
    "A": ["Gaussian", 70, 5**2],
    "H": ["Gaussian", 100, 10**2],
    "I": ["Gaussian", 80, 10**2],
    "O": ["Gaussian", 60, 10**2],
}

edge_sex_SAT = CtoN("sex", "SAT", sat_given_sex)
edge_sex_GPA = CtoN("sex", "GPA", gpa_given_sex)
edge_diversity_toefl = CtoN("diversity", "TOEFL", toefl_given_diversity)

# --- categorical parent -> categorical child (admission) ---
# Outer key: parent category; inner dict: admission outcome -> probability.
admission_given_sex = {
    "M": {"Y": 0.8, "N": 0.2},
    "F": {"Y": 0.2, "N": 0.8},
}
admission_given_diversity = {
    "W": {"Y": 0.3, "N": 0.7},
    "B": {"Y": 0.7, "N": 0.3},
    "A": {"Y": 0.2, "N": 0.8},
    "H": {"Y": 0.5, "N": 0.5},
    "I": {"Y": 0.4, "N": 0.6},
    "O": {"Y": 0.9, "N": 0.1},
}
admission_given_legacy = {
    "Y": {"Y": 0.9, "N": 0.1},
    "N": {"Y": 0.4, "N": 0.6},
}

edge_sex_admission = CtoC("sex", "admission", admission_given_sex)
edge_diversity_admission = CtoC("diversity", "admission", admission_given_diversity)
edge_legacy_admission = CtoC("legacy", "admission", admission_given_legacy)

# --- numeric parent -> categorical child (admission) ---
# Arguments: parent, child, list of cut points, list of outcome distributions.
# One cut point comes with two distributions, presumably one per side of the
# threshold -- NOTE(review): confirm which side each distribution applies to.
edge_SAT_admission = NtoC(
    "SAT", "admission", [2000],
    [{"Y": 0.7, "N": 0.3}, {"Y": 0.3, "N": 0.7}],
)
edge_GPA_admission = NtoC(
    "GPA", "admission", [3],
    [{"Y": 0.8, "N": 0.2}, {"Y": 0.2, "N": 0.8}],
)
edge_extra_admission = NtoC(
    "extracurricular", "admission", [5],
    [{"Y": 0.8, "N": 0.2}, {"Y": 0.2, "N": 0.8}],
)
edge_toefl_admission = NtoC(
    "TOEFL", "admission", [100],
    [{"Y": 0.6, "N": 0.4}, {"Y": 0.4, "N": 0.6}],
)
edge_aps_admission = NtoC(
    "AP", "admission", [8],
    [{"Y": 0.8, "N": 0.2}, {"Y": 0.4, "N": 0.6}],
)
edge_letters_admission = NtoC(
    "letterScore", "admission", [0.9],
    [{"Y": 0.9, "N": 0.1}, {"Y": 0.3, "N": 0.7}],
)
edge_subject_admission = NtoC(
    "subject", "admission", [3],
    [{"Y": 0.7, "N": 0.3}, {"Y": 0.5, "N": 0.5}],
)

In [97]:
# Create DAG

# All nodes of the graph, listed in the order they are handed to the generator.
nodes = [
    node_sex,
    node_diversity,
    node_legacy,
    node_sat,
    node_gpa,
    node_extracurricular,
    node_toefl,
    node_numAPs,
    node_letters,
    node_subjectTest,
    node_admission,
]

# Incoming edges of "admission" and their weights. There are ten edges and
# ten weights (adding up to 1.0); presumably they are matched by position.
admission_edges = [
    edge_sex_admission,
    edge_diversity_admission,
    edge_legacy_admission,
    edge_SAT_admission,
    edge_GPA_admission,
    edge_extra_admission,
    edge_toefl_admission,
    edge_aps_admission,
    edge_letters_admission,
    edge_subject_admission,
]
admission_weights = [0.2, 0.11, 0.2, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07]

# Child column name -> its incoming edge (single parent) or an
# (edges, weights) tuple (several parents).
edge_relation = {
    "SAT": edge_sex_SAT,
    "GPA": edge_sex_GPA,
    "TOEFL": edge_diversity_toefl,
    "admission": (admission_edges, admission_weights),
}

In [98]:
# generate data

# Columns passed as excluded_cols, i.e. left out of the saved file.
excluded_columns = [
    "group",
    "C_SAT",
    "C_GPA",
    "C_extracurricular",
    "C_TOEFL",
    "C_AP",
    "C_letterScore",
    "C_subject",
]

# Fixed seed so the run can be repeated.
mirror = Mirror(seed=0)
mirror.generate_csv(nodes, edge_relation)
mirror.save_to_disc("admission0.csv", excluded_cols=excluded_columns)

sex independet ['sex']
----------------------------------------

diversity independet ['sex', 'diversity']
----------------------------------------

legacy independet ['sex', 'diversity', 'legacy']
----------------------------------------

SAT with parents
One parent <mirror.edges.CtoN object at 0x7fb1aab6f790> ['sex', 'diversity', 'legacy', 'SAT']
----------------------------------------

GPA with parents
One parent <mirror.edges.CtoN object at 0x7fb1aab6f750> ['sex', 'diversity', 'legacy', 'SAT', 'GPA']
----------------------------------------

extracurricular independet ['sex', 'diversity', 'legacy', 'SAT', 'GPA', 'extracurricular']
----------------------------------------

TOEFL with parents
One parent <mirror.edges.CtoN object at 0x7fb1ac5663d0> ['sex', 'diversity', 'legacy', 'SAT', 'GPA', 'extracurricular', 'TOEFL']
----------------------------------------

AP independet ['sex', 'diversity', 'legacy', 'SAT', 'GPA', 'extracurricular', 'TOEFL', 'AP']
-------------------------------