# Building Datasets (Temporal Splits)

## Setup

First, let's import a few common modules.

In [11]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random generator to make this notebook's output stable across runs
np.random.seed(42)

## Data Loading

In [12]:
# Load the raw records; the path is relative to the notebook's working directory
df = pd.read_csv("datasets/aca_21-23.csv")

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74073 entries, 0 to 74072
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               74073 non-null  int64  
 1   semester           74073 non-null  int64  
 2   grade              74073 non-null  int64  
 3   sex                74073 non-null  object 
 4   gpa_last_seme      74073 non-null  float64
 5   credits_last_seme  74073 non-null  float64
 6   credits_tot        74073 non-null  float64
 7   n_seme             74073 non-null  int64  
 8   years_since        74073 non-null  int64  
 9   state_next_1       54687 non-null  float64
 10  state_next_2       39623 non-null  float64
 11  state_next_3       23588 non-null  float64
 12  state_now          74073 non-null  int64  
 13  college            74073 non-null  object 
 14  adm_unit           74073 non-null  int64  
 15  nation             74073 non-null  int64  
 16  in_capa            740

## Filtering out

`prepare_dataset` filters out invalid samples and adds a binary `is_drop` label.
* Horizon: from `t_begin` to `t_end` (with `drop_t0=1`, samples that have already dropped out at t=0 are removed)

In [16]:
def prepare_dataset(df, t_begin, t_end, drop_t0):
    """Filter out unusable samples and attach a binary ``is_drop`` label.

    The input frame is never modified; a new, re-indexed frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``year``, ``semester``, ``state_now`` and the
        ``state_next_k`` columns for every ``k`` in ``1..t_end``.
    t_begin : int
        ``0`` to include the current semester (``state_now``) in the label
        horizon; any other value starts the horizon at ``state_next_1``.
    t_end : int
        Last look-ahead step of the horizon; one of ``0, 1, 2, 3``.
    drop_t0 : int
        When ``1``, rows whose ``state_now`` is 1 or 2 are discarded first.
        NOTE(review): state 2 is treated as "dropped out" and 3 as "on leave"
        throughout this function; the meaning of state 1 is not visible
        here -- confirm against the data dictionary.

    Returns
    -------
    pandas.DataFrame
        Filtered copy with a fresh 0..n-1 index and a float ``is_drop`` column
        (1.0 if any state inside the horizon equals 2, else 0.0).

    Raises
    ------
    ValueError
        If ``t_end`` is not in ``0..3``, if ``drop_t0 == 1`` together with
        ``t_begin == 0``, or if ``t_end == 0`` together with ``t_begin != 0``.
    """
    # Validate up front so a bad horizon fails with a clear message instead of
    # an unbound-variable error half-way through the filtering.
    if t_end not in (0, 1, 2, 3):
        raise ValueError(f"t_end must be 0, 1, 2 or 3 (got {t_end!r})")
    if drop_t0 == 1 and t_begin == 0:
        raise ValueError("drop_t0=1 cannot be combined with t_begin=0")
    if t_end == 0 and t_begin != 0:
        raise ValueError("t_end=0 requires t_begin=0")
    horizon = int(t_end)

    if drop_t0 == 1:
        df = df.loc[~df['state_now'].isin([1, 2])]

    if horizon >= 1:
        # Discard rows whose look-ahead window is not fully observed.
        # The cut-offs are hard-coded and presumably assume the data ends
        # at 2023, semester 2 -- confirm before using another extract.
        year, semester = df['year'], df['semester']
        if horizon == 1:
            no_next = (year == 2023) & (semester == 2)
        elif horizon == 2:
            no_next = (year == 2023)
        else:
            no_next = (year == 2023) | ((year == 2022) & (semester == 2))
        df = df.loc[~no_next]

        # Discard rows that are on leave (state 3) at the first step of the
        # horizon; later steps are deliberately not checked.
        leave_col = 'state_now' if t_begin == 0 else 'state_next_1'
        df = df.loc[~df[leave_col].isin([3])]

    # Always work on a copy so the caller's frame is left untouched.
    dataset = df.copy()

    # Columns that make up the label horizon. The list is never empty:
    # t_end == 0 is only accepted together with t_begin == 0.
    label_cols = [f'state_next_{step}' for step in range(1, horizon + 1)]
    if t_begin == 0:
        label_cols.insert(0, 'state_now')

    is_drop = dataset[label_cols].isin([2]).any(axis=1)
    # Stored as float (0.0 / 1.0) to keep the exported CSV format unchanged.
    dataset['is_drop'] = is_drop.astype('float64')

    return dataset.reset_index(drop=True)

In [17]:
# Build the t=2 dataset: the label covers the next two semesters (t_begin=1, t_end=2)
# and rows whose current state is already 1 or 2 are discarded (drop_t0=1).
# Note: this rebinds `df`, so the raw frame is no longer reachable afterwards.
df = prepare_dataset(df, t_begin=1, t_end=2, drop_t0=1)

## Separating Test-set from Train-set

In [18]:
# Temporal split for the t=2 horizon
#   train: 2021-1 and 2021-2   /   test: 2022-2
tempo_train = df.loc[df['year'].eq(2021) & df['semester'].isin([1, 2])].copy()
tempo_test = df.loc[df['year'].eq(2022) & df['semester'].eq(2)].copy()

print("Train size:", tempo_train.shape[0])
print("Test size :", tempo_test.shape[0])

Train size: 15764
Test size : 8748


In [19]:
# Number of positive (is_drop == 1) rows per split, followed by the split sizes
print(tempo_train['is_drop'].eq(1).sum())
print(tempo_test['is_drop'].eq(1).sum())
print(tempo_train.shape[0])
print(tempo_test.shape[0])

773
533
15764
8748


In [20]:
# Move the current row labels into an 'index' column; the frame gets a fresh 0..n-1 index
tempo_train.reset_index(inplace=True, drop=False)

In [21]:
# Promote the 'index' column back to the row index (the column is consumed),
# label it 'index_o', and write the training split to disk.
tempo_train.set_index('index', inplace=True)
tempo_train.index = tempo_train.index.rename('index_o')
tempo_train.to_csv('datasets/train_set_t_2_tempo.csv', na_rep='NULL', encoding='utf-8-sig')

In [22]:
# Label the row index 'index_o' and write the test split to disk
tempo_test.index = tempo_test.index.rename('index_o')
tempo_test.to_csv('datasets/test_set_t_2_tempo.csv', na_rep='NULL', encoding='utf-8-sig')