# Data

In [2]:
import os
from pathlib import Path
import pandas as pd
from ast import literal_eval

import warnings

# --- Notebook-wide configuration -------------------------------------------

# Seed handed to every `random_state=` argument so sampled rows are stable.
SEED = 59

# Font used by the table styles defined further down.
font_family = "Calibri"

# Paths are resolved relative to the directory the kernel was started in;
# the data folder lives next to (not inside) that directory.
CURR_PATH = Path.cwd()
data_path = CURR_PATH.parents[0] / 'data'

# Silence all warnings for the rest of the session.
warnings.filterwarnings(action='ignore')

In [21]:
# Table styling rules: one entry per CSS selector, each carrying a list of
# (property, value) pairs. Presumably consumed by a pandas Styler
# (`set_table_styles`) elsewhere in the notebook -- usage is not shown here.
styles = [
    # Header cells: centred text on a white background with a thin grey border.
    {
        "selector": "th",
        "props": [
            ("font-size", "11pt"),
            ("text-align", "center"),
            ('font-family', font_family),
            ('background-color', 'white'),
            ('border-style', 'solid'),
            ('border-color', 'grey'),
            ('border-width', 'thin'),
        ],
    },
    # Data cells: same font and border as the headers; text alignment and
    # background colour are deliberately left unset here.
    {
        "selector": "td",
        "props": [
            ("font-size", "11pt"),
            ('font-family', font_family),
            ('border-style', 'solid'),
            ('border-color', 'grey'),
            ('border-width', 'thin'),
        ],
    },
    # Captions are rendered below the table.
    {
        "selector": "caption",
        "props": [("caption-side", "bottom")],
    },
]

## Best Dropout Risk Models
Link to {Download}`best models data<../data/best_models.csv>`.

In [4]:
# Load the best-models table; the file is semicolon-separated.
best_models = pd.read_csv(data_path.joinpath('best_models.csv'), sep=';')

# Last expression of the cell: render the whole (small) table.
best_models

Unnamed: 0,PS,P,S,BACC,Train_Risk,Test_Risk,P1,#Feat,CO,DT,BAL,C,REC
0,AR2,AR,2,0.865881,0.5,0.37069,0.353448,38,0,0.35,S,RF,0.813953
1,AR3,AR,3,0.935011,0.5,0.37069,0.336207,32,4,0.45,R,RF,0.883721
2,CM2,CM,2,0.920015,0.365796,0.632075,0.556604,36,1,0.3,N,SVC,0.865672
3,CM3,CM,3,0.927478,0.5,0.632075,0.566038,74,0,0.45,S,RF,0.880597
4,PT2,PT,2,0.912946,0.5,0.396226,0.358491,16,3,0.3,S,LSVC,0.857143
5,PT3,PT,3,0.881696,0.5,0.396226,0.396226,47,3,0.3,S,LSVC,0.857143


## Risk Data

Link to {Download}`aggregated risk data<../data/risk_data.csv>`.

In [5]:
# Load the aggregated risk table; the file is semicolon-separated.
risk_data = pd.read_csv(data_path.joinpath('risk_data.csv'), sep=';')

# Quick sanity check: table dimensions and the distinct values of column 'T'.
print(risk_data.shape, risk_data['T'].unique())

# Show a reproducible 5-row sample instead of the full table.
risk_data.sample(n=5, random_state=SEED)

(72, 10) ['AN' 'AN1' 'AN2' 'AN3' 'AN4' 'GN']


Unnamed: 0,ST,PS,0_P1,1_P1,P1,T,0_P2,1_P2,P2,P2-P1
56,G,CM3,38,1,0.025641,AN2,37,2,0.051282,0.025641
62,G,PT2,31,1,0.03125,AN2,20,12,0.375,0.34375
13,D,CM2,9,58,0.865672,AN1,16,51,0.761194,-0.104478
55,G,CM3,38,1,0.025641,AN1,37,2,0.051282,0.025641
36,G,AR2,67,6,0.082192,AN,72,1,0.013699,-0.068493


## Risk Predictions
Link to {Download}`prediction data<../data/pred_data.csv>`.

In [6]:
# Load the per-prediction table; the file is semicolon-separated.
pred_data = pd.read_csv(data_path.joinpath('pred_data.csv'), sep=';')

# Quick sanity check: table dimensions and the distinct values of column 'T'.
print(pred_data.shape, pred_data['T'].unique())

# Show a reproducible 5-row sample instead of the full table.
pred_data.sample(n=5, random_state=SEED)

(3300, 14) ['AN' 'AN1' 'AN2' 'AN3' 'AN4' 'GN']


Unnamed: 0,P,S,PS,ST,ST_Num,BAL,CO,DT,CLF,PR1,P1,T,PR2,P2
3092,CM,3,CM3,D,1,S,0,0.45,RF,0.96,1,GN,0.83,1
2665,PT,2,PT2,G,0,S,3,0.3,LSVC,0.020917,0,AN4,0.19357,0
2971,AR,3,AR3,D,1,R,4,0.45,RF,0.11,0,GN,0.21,0
2565,CM,3,CM3,G,0,S,0,0.45,RF,0.04,0,AN4,0.02,0
3152,CM,3,CM3,G,0,S,0,0.45,RF,0.05,0,GN,0.02,0


## Recommendations Data
Link to {Download}`recommendation data<../data/recom_data.csv>`.

In [7]:
# Load the recommendation table; the file is semicolon-separated.
recoms = pd.read_csv(data_path / 'recom_data.csv', sep=';')

# Columns whose cells are stored in the CSV as Python literals and must be
# parsed back into Python objects after loading.
set_cols = [
    'Passed_Past', 'Passed_Next',
    'ID_TO', 'D_List', 'Count_Next_TO',
    'Nbrs_Passed_Next_TO', 'AC', 'No_Recom_Rule', 'R', 'C', 'M', 'W', 'NoR',
    'NR'
]


def _parse_literal_cell(cell):
    """Turn one CSV cell holding a Python literal into the Python object.

    Parameters
    ----------
    cell : str or missing value
        Raw cell as delivered by ``pd.read_csv``.

    Returns
    -------
    object
        ``{}`` for the empty markers ``'set()'`` and ``'None'`` and for
        missing values (NaN); otherwise the result of ``ast.literal_eval``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when the cell is not a valid literal.
    """
    # Recent pandas versions treat the text "None" (and empty cells) as NA
    # while reading, so such cells arrive here as NaN rather than as the
    # string 'None'. A plain string replace would miss them and literal_eval
    # would then fail on the float NaN.
    is_missing = (
        not isinstance(cell, str)
        and pd.api.types.is_scalar(cell)
        and pd.isna(cell)
    )
    if is_missing or cell in ('set()', 'None'):
        # NOTE(review): '{}' is an empty *dict*, not an empty set. Kept as-is
        # for backward compatibility -- confirm what downstream code expects.
        return {}
    return literal_eval(cell)


for c in set_cols:
    recoms[c] = recoms[c].apply(_parse_literal_cell)

# Quick sanity check: table dimensions and the distinct values of column 'T'.
print(recoms.shape, recoms['T'].unique())

# Show a reproducible 5-row sample instead of the full table.
recoms.sample(n=5, random_state=SEED)

(16392, 64) ['AN' 'AN1' 'AN2' 'AN3' 'AN4' 'GN']


Unnamed: 0,P,ID,ST,#E_1,#E_2,#E_3,#F_1,#F_2,#F_3,#NE_1,...,ACC,REC_1,REC_0,BACC,PREC_1,F1,MCC,NT,MPT,PS
8464,CM,1974,G,5.0,6.0,6.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.5,1.0,1.0,0.0,AN,P4,CM2
7264,CM,421,D,5.0,5.0,7.0,1.0,0.0,2.0,0.0,...,1.0,1.0,0.0,0.5,1.0,1.0,0.0,AN,P4,CM2
9111,CM,2788,D,5.0,6.0,6.0,0.0,1.0,2.0,0.0,...,0.666667,0.8,0.0,0.4,0.8,0.8,-0.2,AN,P3,CM2
1898,AR,1937,G,5.0,6.0,6.0,0.0,0.0,0.0,0.0,...,0.666667,1.0,0.0,0.5,0.666667,0.8,0.0,AN,P2,AR2
1373,AR,1477,G,5.0,6.0,6.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.5,1.0,1.0,0.0,GN,P0,AR2
