# K-FOLDS
* Data is read from `data/`
* Model weights are saved in `weights/`
* Result CSV files are saved in `results/`
* Confusion matrices are saved in `results/`
## Files to make before running this notebook:
* 10 train CSVs, one per fold (`train_{k}.csv`)
* 10 validation CSVs, one per fold (`val_{k}.csv`)
* 10 test CSVs, one per fold (`test_{k}.csv`)
* 10 test CSVs with labels, one per fold (`test_id_labels_{k}.csv`)
* 1 csv which contains all paths and labels (all_data.csv)
  
Note: Each fold should be balanced with respect to both label and gender.

In [None]:
import pandas as pd
import os
import os.path
import sys
import torchaudio
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import Audio, display
import numpy as np
import time
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model, Wav2Vec2ForCTC, Wav2Vec2Tokenizer, AutoModelForCTC, Wav2Vec2Processor
import seaborn as sns
from itertools import chain
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

sys.path.append(os.path.abspath('../preprocessing'))
sys.path.append(os.path.abspath('../helpfunctions'))
sys.path.append(os.path.abspath('../models'))
from CustomDatasets import CustomDataset1, CustomDataset2
from mergesplits import mergesplits
from padding import padding
from splitdataset import split_audios
from splitpadsave import splitpad
from wav2vecClassefier_jonatasgrosman import wav2vecClassefier_jonatasgrosman
from Wav2Vec2Classifier_librispeech import Wav2Vec2Classifier_librispeech

In [None]:
# K-fold driver: for every fold, build the datasets from the fold's ID CSVs,
# train a fresh Wav2Vec2Classifier_librispeech and evaluate it on the fold's
# test split.

# ---- Run configuration (single source of truth for repeated values) ----
RUN_NAME = 'pcgita1_kfolds_64conv_8batch_librispeech'
FOLDS_DIR = 'pcgita1_kfolds'
N_FOLDS = 10
# NOTE(review): the loop starts at fold 2, so fold 1 is never trained or tested
# by this cell. Presumably fold 1 was run separately / this is a resumed run;
# set FIRST_FOLD = 1 to process all folds.
FIRST_FOLD = 2
BATCH_SIZE = 8
# Passed to both model.fit and model.test; kept in one place so the two calls
# cannot drift apart.
MAX_EPOCHS = 15
LEARNING_RATE = 0.0001

# where the weights are to be saved
weight_folder = f'weights/{RUN_NAME}'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(weight_folder, exist_ok=True)
# where the results (csvs and confusion matrices) are to be saved
results_folder = f'results/{RUN_NAME}'
os.makedirs(results_folder, exist_ok=True)
# a csv with all data (id, path to recording, label)
data = f'{FOLDS_DIR}/all_data.csv'

for k in range(FIRST_FOLD, N_FOLDS + 1):
    print(f'K-fold {k} / {N_FOLDS}')
    # Release cached GPU memory left over from the previous fold's model.
    torch.cuda.empty_cache()

    # CSVs listing the IDs that belong to each split of this fold
    train_ids = f'{FOLDS_DIR}/train_{k}.csv'
    val_ids = f'{FOLDS_DIR}/val_{k}.csv'
    test_ids = f'{FOLDS_DIR}/test_{k}.csv'
    test_id_labels = f'{FOLDS_DIR}/test_id_labels_{k}.csv'

    # Create data (splitpad is a project helper; the arguments request
    # reflect padding, no augmentation, length 10 and no saving to disk)
    X_train, y_train, X_val, y_val, X_test, y_test, n_test = splitpad(
        csv_path=data,
        TRAIN_ID=train_ids,
        VAL_ID=val_ids,
        TEST_ID=test_ids,
        padding_type='reflect',
        augment=0,
        length=10,
        save='no',
    )

    # Wrap the arrays in dataset objects; the test dataset additionally
    # carries n_test alongside each sample.
    train = CustomDataset1(X=X_train, y=y_train)
    val = CustomDataset1(X=X_val, y=y_val)
    test = CustomDataset2(X=X_test, y=y_test, n=n_test)

    # Load data with DataLoader
    trainloader = DataLoader(train, batch_size=BATCH_SIZE, drop_last=True, num_workers=0, shuffle=True)
    valloader = DataLoader(val, batch_size=BATCH_SIZE, drop_last=True, num_workers=0, shuffle=False)
    # NOTE(review): shuffle=True together with drop_last=True means a random
    # incomplete final batch (up to BATCH_SIZE - 1 samples) is excluded from
    # evaluation on every run. Kept as-is because model.test may rely on full
    # batches; confirm and consider shuffle=False, drop_last=False.
    testloader = DataLoader(test, batch_size=BATCH_SIZE, drop_last=True, num_workers=0, shuffle=True)

    # Define model (a fresh instance per fold, so no weights leak across folds)
    model = Wav2Vec2Classifier_librispeech()

    # Train
    print('Start training')
    model.fit(
        k=k,
        train_dataloader=trainloader,
        val_dataloader=valloader,
        max_epochs=MAX_EPOCHS,
        lr=LEARNING_RATE,
        weights_folder=weight_folder,
        results_folder=results_folder,
    )

    # Test
    print('Start testing')
    model.test(
        test_dataloader=testloader,
        weights_folder=weight_folder,
        test_id=test_id_labels,
        k=k,
        max_epochs=MAX_EPOCHS,
        results_folder=results_folder,
    )