# Phase 2: Sample Construction

## Setup

First, let's import a few common modules.

In [30]:
# Python ≥3.6 is required (later cells use f-strings, which Python 3.5 cannot parse)
import sys
assert sys.version_info >= (3, 6), "This notebook requires Python 3.6 or newer"

# Common imports
import numpy as np
import pandas as pd
import os

# Make this notebook's output stable across runs by seeding the global RNGs.
# NOTE(review): the train/test split further down passes its own `seed = 42`
# to scikit-learn, so `seed_val` only affects code that draws from the global
# `random` / `numpy.random` generators.
import random
seed_val = 43
np.random.seed(seed_val)
random.seed(seed_val)

## Data Loading

In [31]:
# Seed used by the stratified split and embedded in the output file names below.
seed = 42
# Load the source dataset that is split into train and test sets below
# (path is relative to the notebook's working directory).
df = pd.read_csv("datasets/aca_21-23.csv")

## Separating Test-set from Train-set

Stratified sampling (train-set : test-set = 80:20)

In [32]:
from sklearn.model_selection import StratifiedShuffleSplit

seed = 42

# One stratified shuffle split (80% train / 20% test), stratified on the
# `is_drop` label so both parts keep the same class proportions.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)

# split() yields *positional* row numbers, so they are applied with .iloc;
# .loc is label-based and would only be correct while `df` happens to carry
# a default 0..n-1 index. With n_splits=1 there is exactly one pair to take.
train_index, test_index = next(split.split(df, df['is_drop']))
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

In [33]:
# Move the original row labels out of the index into a regular column
# (the next cell reads that column back as 'index').
train_set = train_set.reset_index(drop=False)

In [34]:
# Put the original row labels back as the index (consuming the helper 'index'
# column), name that index 'index_o', and write the training split to disk.
train_set = train_set.set_index('index').rename_axis('index_o')
train_set.to_csv(f'datasets/train_set_{seed}.csv', na_rep='NULL', encoding='utf-8-sig')

In [35]:
# Name the index (the row labels carried over from `df`) 'index_o' and write
# the test split to disk.
test_set = test_set.rename_axis('index_o')
test_set.to_csv(f'datasets/test_set_{seed}.csv', na_rep='NULL', encoding='utf-8-sig')

## Filtering out

A function that filters out invalid samples:
* Horizon: from [t_begin] to [t_end] (remove t=0 dropout samples)

In [36]:
def prepare_dataset(df, t_begin, t_end, drop_t0):
    """Filter out invalid samples and (re)label dropout over a horizon.

    The input frame is never modified; a new, re-indexed frame is returned.

    Steps, in order:
      1. If ``drop_t0 == 1``, remove rows whose ``state_now`` is 1 or 2.
      2. If ``t_end >= 1``, remove rows that have no ``t_end``-th following
         semester inside the data (the cut-off is hard-coded to the last
         semester being 2023-2).
      3. If ``t_end >= 1``, remove rows on leave (state code 3) at the first
         step of the horizon: ``state_now`` when ``t_begin == 0``, otherwise
         ``state_next_1``.
      4. Set ``is_drop`` to 1 when any state column inside the horizon equals
         2, otherwise 0. The horizon covers ``state_next_1`` ..
         ``state_next_{t_end}``, plus ``state_now`` when ``t_begin == 0``.

    NOTE(review): state codes are interpreted from the variable names
    (2 = dropout, 3 = leave); the meaning of code 1 is not visible here.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``semester``, ``state_now`` and the
        ``state_next_k`` columns required by the chosen horizon.
    t_begin : int
        First step of the horizon. Only "0" versus "not 0" is distinguished.
    t_end : int
        Last step of the horizon; one of 0, 1, 2, 3.
    drop_t0 : int
        1 to discard rows with ``state_now`` in {1, 2}; anything else keeps them.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with a fresh 0..n-1 index and an updated ``is_drop`` column.

    Raises
    ------
    ValueError
        If ``t_end`` is not 0-3, if ``drop_t0 == 1`` while ``t_begin == 0``,
        or if ``t_end == 0`` while ``t_begin != 0``.
    """
    # Validate the argument combination up front so that an unsupported value
    # fails with a clear ValueError instead of an unbound-variable error later.
    if t_end not in (0, 1, 2, 3):
        raise ValueError(f"t_end must be 0, 1, 2 or 3, got {t_end!r}")
    if drop_t0 == 1 and t_begin == 0:
        raise ValueError("drop_t0=1 is incompatible with t_begin=0")
    if t_end == 0 and t_begin != 0:
        raise ValueError("t_end=0 requires t_begin=0")
    horizon = int(t_end)

    if drop_t0 == 1:
        df = df.loc[~df['state_now'].isin([1, 2]), :]

    if horizon == 0:
        # Copy so that labelling below never writes into the caller's frame.
        dataset = df.copy()
    else:
        # Rows too close to the end of the data have no state `horizon`
        # semesters ahead and cannot be labelled.
        in_2023 = df['year'] == 2023
        if horizon == 1:
            no_next = in_2023 & (df['semester'] == 2)
        elif horizon == 2:
            no_next = in_2023
        else:
            no_next = in_2023 | ((df['year'] == 2022) & (df['semester'] == 2))
        df = df.loc[~no_next, :]

        # Only the first step of the horizon is checked for leave.
        leave_col = 'state_now' if t_begin == 0 else 'state_next_1'
        dataset = df.loc[~df[leave_col].isin([3])].copy()

    # State columns that fall inside the horizon.
    horizon_cols = [f'state_next_{k}' for k in range(1, horizon + 1)]
    if t_begin == 0:
        horizon_cols.insert(0, 'state_now')

    # A sample is a dropout if any state inside the horizon equals 2
    # (missing states compare as "not 2").
    is_drop = dataset[horizon_cols[0]].isin([2])
    for col in horizon_cols[1:]:
        is_drop = is_drop | dataset[col].isin([2])

    dataset.loc[~is_drop.values, 'is_drop'] = 0
    dataset.loc[is_drop.values, 'is_drop'] = 1

    return dataset.reset_index(drop=True)

For $t = 1, 2, 3$, filter out invalid samples and save the final train sets.

In [37]:
# Seed value that identifies which saved split file to reload.
seed = 42
# Reload the saved training split from disk; this rebinds `df`, which until
# now held the full dataset.
df = pd.read_csv(f"datasets/train_set_{seed}.csv")

In [38]:
# Build, report and save one filtered training set per horizon end t_end.
# A tuple (not a set literal) is iterated so the processing and printing
# order is guaranteed to be 1, 2, 3.
train_set = {}
for t_end in (1, 2, 3):
    train_set[t_end] = (
        prepare_dataset(df, t_begin=1, t_end=t_end, drop_t0=1)
        # The state columns were only needed to derive the `is_drop` label.
        .drop(columns=['state_now', 'state_next_1', 'state_next_2', 'state_next_3'])
        .reset_index(drop=True)
    )
    print(f"t_end={t_end}, train_set size: {train_set[t_end].shape}")
    train_set[t_end].to_csv(f"datasets/train_set_{seed}_t_{t_end}.csv", index=False)

t_end=1, train_set size: (31038, 16)
t_end=2, train_set size: (25406, 16)
t_end=3, train_set size: (18384, 16)


For $t = 1, 2, 3$, filter out invalid samples and save the final test sets.

In [39]:
# Reload the saved test split from disk (rebinds `df`); `seed` is the value
# assigned in the cell that reloaded the training split.
df = pd.read_csv(f"datasets/test_set_{seed}.csv")

In [40]:
# Inspect column dtypes and non-null counts of the reloaded test split.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14815 entries, 0 to 14814
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index_o            14815 non-null  int64  
 1   year               14815 non-null  int64  
 2   semester           14815 non-null  int64  
 3   grade              14815 non-null  int64  
 4   sex                14815 non-null  object 
 5   gpa_last_seme      14815 non-null  float64
 6   credits_last_seme  14815 non-null  float64
 7   credits_tot        14815 non-null  float64
 8   n_seme             14815 non-null  int64  
 9   years_since        14815 non-null  int64  
 10  state_next_1       11016 non-null  float64
 11  state_next_2       7984 non-null   float64
 12  state_next_3       4778 non-null   float64
 13  state_now          14815 non-null  int64  
 14  college            14815 non-null  object 
 15  adm_unit           14815 non-null  int64  
 16  nation             148

In [41]:
# Build, report and save one filtered test set per horizon end t_end.
# A tuple (not a set literal) is iterated so the processing and printing
# order is guaranteed to be 1, 2, 3.
test_set = {}
for t_end in (1, 2, 3):
    test_set[t_end] = (
        prepare_dataset(df, t_begin=1, t_end=t_end, drop_t0=1)
        # The state columns were only needed to derive the `is_drop` label.
        .drop(columns=['state_now', 'state_next_1', 'state_next_2', 'state_next_3'])
        .reset_index(drop=True)
    )
    print(f"t_end={t_end}, test_set size: {test_set[t_end].shape}")
    test_set[t_end].to_csv(f"datasets/test_set_{seed}_t_{t_end}.csv", index=False)

t_end=1, test_set size: (7776, 16)
t_end=2, test_set size: (6363, 16)
t_end=3, test_set size: (4637, 16)
