In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import os

from os import listdir
from os.path import isfile, join

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit 
from sklearn.model_selection import StratifiedGroupKFold

# Functions used:

In [2]:
def stratified_split_train_test(df, 
                                x_name, 
                                y_name,
                                test_size=0.3,
                                random_state=42,
                                sort_by='sample_name'):
    
    """
    Split a dataset with stratified sampling and tag every row with the
    subset it belongs to in a column called 'dataset'.
    
    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        pandas DataFrame where each row corresponds to one sample of the dataset
    x_name : str
        Name in df of the column holding x values or ids
    y_name : str
        Name in df of the column holding y values or annotations. This column
        is used to stratify the split
    test_size : float or int, default 0.3
        Forwarded to sklearn's train_test_split as the size of the test subset
    random_state : int or None, default 42
        Forwarded to sklearn's train_test_split to make the split reproducible
    sort_by : str, default 'sample_name'
        Column of df used to order the rows of the returned DataFrame

    Returns
    -------
    df_compiled : pandas.core.frame.DataFrame
        All rows selected for train and test, with an added (or overwritten)
        'dataset' column equal to 'train' or 'test', sorted by `sort_by` and
        re-indexed from 0. The input df is left untouched.
    """

    X = df[x_name]
    y = df[y_name]
    # Only the index of each subset is needed to pick the rows back out of df.
    X_train, X_test, _, _ = train_test_split(X, y,
                                             test_size=test_size,
                                             random_state=random_state,
                                             stratify=y)

    # .assign builds new frames instead of writing into a .loc slice, which
    # is what triggers pandas' SettingWithCopyWarning.
    df_train = df.loc[X_train.index, :].assign(dataset='train')
    df_test = df.loc[X_test.index, :].assign(dataset='test')

    df_compiled = pd.concat([df_train, df_test], axis=0)

    return df_compiled.sort_values(sort_by, ignore_index=True)


In [3]:
def detect_group_leakage(df, group_name, split_name):
    """
    Tell whether at least one group appears in more than one split.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Table with one row per sample
    group_name : str
        Column of df identifying the group of each row
    split_name : str
        Column of df holding the split assigned to each row

    Returns
    -------
    bool
        True if any value of `group_name` is associated with more than one
        distinct value of `split_name`, False otherwise.
    """
    splits_per_group = df.groupby([group_name])[split_name].nunique()
    for n_distinct_splits in splits_per_group:
        if n_distinct_splits > 1:
            return True
    return False

def plot_split_size(df, split_name, labels_names):
    """
    Display, per split, the fraction of each label's total that falls in it.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        Table with one row per sample
    split_name : str
        Column of df holding the split assigned to each row
    labels_names : list of str
        Columns of df that are summed within each split

    Returns
    -------
    None
        The table (rounded to 2 decimals) is shown with `display`.
    """
    label_sums = df.groupby([split_name])[labels_names].sum()
    label_totals = label_sums.sum()
    label_shares = label_sums / label_totals
    display(label_shares.round(2))
    

In [4]:
# Replace with your data: set the CHORUS_WORKDIR environment variable to the
# directory that contains `chorus_experiments/`, or edit the fallback path below.
workdir = os.environ.get(
    'CHORUS_WORKDIR',
    '/mnt/batch/tasks/shared/LS_root/mounts/clusters/gpu-baseline/code/Users/jscanass',
)
os.chdir(workdir)
# Relative to the working directory set just above.
path_annotations = 'chorus_experiments/data/datasetv1/multiclass_1/df_train_test_files.csv'

In [5]:
# Load the annotation table; ',' is already read_csv's default separator.
df_annotations = pd.read_csv(path_annotations)

FileNotFoundError: [Errno 2] No such file or directory: 'chorus_experiments/data/datasetv1/multiclass_1/df_train_test_files.csv'

In [None]:
# Preview the first rows of the loaded annotation table.
df_annotations.head()

In [None]:
# True if any 'fname' value is associated with more than one distinct value
# of the 'dataset' column read from the CSV.
detect_group_leakage(df=df_annotations, 
                     group_name='fname',
                     split_name='dataset')

In [None]:
# Share of each label column that falls in every value of 'dataset'.
plot_split_size(
    df=df_annotations,
    split_name='dataset',
    labels_names=['ABSENCE', 'BOAFAB_C', 'BOAFAB_M', 'PHYCUV_M'],
)

# GroupShuffleSplit

In [None]:
# Single 70/30 split that keeps all rows sharing an 'fname' on the same side.
splitter = GroupShuffleSplit(test_size=.3, n_splits=1, random_state = 42)
split = splitter.split(df_annotations, groups=df_annotations['fname'])
train_inds, test_inds = next(split)

# train_inds holds row positions, so assign by position in one step instead of
# testing `i in train_inds` for every row (a linear scan of the array per row).
split_labels = np.full(df_annotations.shape[0], 'test', dtype=object)
split_labels[train_inds] = 'train'
df_annotations['dataset_groupshufflesplit'] = split_labels


In [None]:
# True if any 'fname' value is associated with more than one distinct value
# of the 'dataset_groupshufflesplit' column.
detect_group_leakage(df=df_annotations, 
                     group_name='fname',
                     split_name='dataset_groupshufflesplit')

In [None]:
# Share of each label column that falls in every value of
# 'dataset_groupshufflesplit'.
plot_split_size(
    df=df_annotations,
    split_name='dataset_groupshufflesplit',
    labels_names=['ABSENCE', 'BOAFAB_C', 'BOAFAB_M', 'PHYCUV_M'],
)

# StratifiedGroupKFold:

In [None]:
label_columns = ['ABSENCE', 'BOAFAB_C', 'BOAFAB_M', 'PHYCUV_M']

# Long format: one row per (sample, label column); keep only the rows whose
# flag equals 1 so that 'label' holds the name of the active column.
df_annotations_f = df_annotations[['sample_name', 'fname'] + label_columns]
df_annotations_f = df_annotations_f.melt(id_vars=['sample_name', 'fname'], var_name='label')
df_annotations_f = df_annotations_f[df_annotations_f['value'] == 1]
df_annotations_f = df_annotations_f[['sample_name', 'label']]

# Drop a 'label' column left over from a previous run of this cell; otherwise
# the merge would produce 'label_x' / 'label_y' and later lookups of
# df_annotations['label'] would fail.
df_annotations = pd.merge(df_annotations.drop(columns='label', errors='ignore'),
                          df_annotations_f,
                          on='sample_name',
                          how='left')

In [None]:
X = df_annotations['sample_name']
y = df_annotations['label']
groups = df_annotations['fname']
sgkf = StratifiedGroupKFold(n_splits=3)
label_columns = ['ABSENCE', 'BOAFAB_C', 'BOAFAB_M', 'PHYCUV_M']

# Each fold overwrites 'dataset_stratifiedgroup', so the report below is printed
# per fold but only the last fold's assignment remains in df_annotations.
for train_inds, test_inds in sgkf.split(X, y, groups=groups):
    # train_inds holds row positions: assign by position rather than testing
    # `i in train_inds` for every row (a linear scan of the array per row).
    fold_labels = np.full(df_annotations.shape[0], 'test', dtype=object)
    fold_labels[train_inds] = 'train'
    df_annotations['dataset_stratifiedgroup'] = fold_labels

    print('Group Leakage:', detect_group_leakage(df=df_annotations,
                                                 group_name='fname',
                                                 split_name='dataset_stratifiedgroup'))
    plot_split_size(df=df_annotations,
                    split_name='dataset_stratifiedgroup',
                    labels_names=label_columns)

In [None]:
# Preview the annotation table after the earlier cells added their columns.
df_annotations.head()

In [None]:
#df_annotations.to_csv('df_train_test_files_3splits.csv',index=False)