# Data Augmentation with MIXUP

## Libraries

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import pickle # for loading data

## Data loading

In [2]:
# Load the full dataframe.
# NOTE: unpickling can execute arbitrary code -- only load files you created/trust.
# The context manager closes the handle even if unpickling fails.
with open('df.pickle', 'rb') as f_df:
    df = pickle.load(f_df)

In [3]:
# Load the (train, test) split.
# NOTE: unpickling can execute arbitrary code -- only load files you created/trust.
# The context manager closes the handle even if unpickling fails.
with open('train_test.pickle', 'rb') as f_train_test:
    (train, test) = pickle.load(f_train_test)

In [4]:
# Load the nine training subsamples (the pickle holds a 9-item sequence).
# NOTE: unpickling can execute arbitrary code -- only load files you created/trust.
# The context manager closes the handle even if unpickling fails.
with open('train_samples.pickle', 'rb') as f_train_samples:
    (train10, train20, train30, train40, train50,
     train60, train70, train80, train90) = pickle.load(f_train_samples)

## Data Augmentation 

In [5]:
def MIXUP_augment_df(df,sc,scl,pr=0.5,samples=100,seed=79068588):
    """Augment a lyrics dataframe with MIXUP-style synthetic rows.

    For each of the four level labels ("H", "MH", "ML", "L") in column
    ``scl``, synthetic rows are built by mixing two randomly chosen rows of
    the same level: a random ``1 - pr`` share of the words of the first lyric
    is combined with a random ``pr`` share of the words of the second lyric,
    the words are shuffled, and the score is the matching interpolation
    ``score_i * (1 - pr) + score_j * pr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'lyrics'`` (whitespace-separated text),
        ``sc`` and ``scl``. It is not modified.
    sc : str
        Name of the numeric score column (e.g. ``'energy'``).
    scl : str
        Name of the level column holding "H" / "MH" / "ML" / "L".
    pr : float, default 0.5
        Mixing weight (lambda) given to the second row of every pair;
        expected to lie in [0, 1].
    samples : int, default 100
        Total number of synthetic rows requested. Three levels receive
        ``samples // 4`` rows each and the last one (in shuffled order)
        receives the remainder. Levels with no rows in ``df`` are skipped,
        so fewer rows are produced in that case. Non-positive values add
        nothing.
    seed : int, default 79068588
        Seed of the private random generator; the same inputs and seed
        always give the same output.

    Returns
    -------
    pandas.DataFrame
        The original rows plus the synthetic ones, shuffled, with a fresh
        0..n-1 index.
    """
    # One private generator, seeded once: reproducible, and it neither
    # disturbs the global `random`/NumPy state nor restarts the sequence on
    # every draw (which would pick the same partner row each time).
    rng = random.Random(seed)

    #sts = ["HALV","HAHV","LALV","LAHV"]
    sts = ["H","MH","ML","L"]
    rng.shuffle(sts)

    total = max(int(samples), 0)
    base_quota = total // len(sts)

    new_frames = []

    for pos, st in enumerate(sts, start=1):
        # The last level absorbs the remainder so the quotas sum to `total`.
        if pos == len(sts):
            n_samples = total - base_quota * (len(sts) - 1)
        else:
            n_samples = base_quota

        df_n = df[df[scl] == st]
        if n_samples <= 0 or df_n.empty:
            continue

        # Plain lists are far cheaper to index than repeated `.iloc` lookups.
        lyrics = df_n['lyrics'].tolist()
        scores = df_n[sc].tolist()
        n_rows = len(lyrics)

        new_text = []
        sc_list = []

        ## data augmentation loop
        for _ in tqdm(range(n_samples)):
            # randrange's upper bound is exclusive, so every row is eligible.
            i = rng.randrange(n_rows)
            j = rng.randrange(n_rows)

            text1 = lyrics[i].split()
            text2 = lyrics[j].split()

            aug_text_l = (rng.sample(text1, int(len(text1) * (1 - pr)))
                          + rng.sample(text2, int(len(text2) * pr)))
            rng.shuffle(aug_text_l)

            new_text.append(' '.join(aug_text_l))
            sc_list.append(scores[i] * (1 - pr) + scores[j] * pr)

        ## dataframe (the level goes into `scl`, the column that was filtered on)
        new_frames.append(pd.DataFrame({'lyrics': new_text, scl: st, sc: sc_list}))

    # Single concat instead of DataFrame.append (removed in pandas 2.0).
    augmented = pd.concat([df] + new_frames, ignore_index=True)

    # Seed the final shuffle from the same generator so it is reproducible too.
    return augmented.sample(frac=1, random_state=rng.randrange(2**32)).reset_index(drop=True)

### Energy

#### MIXUP (λ=0.5)

In [6]:
# Energy, default mixing weight (pr=0.5): for every training subsample request
# as many synthetic rows as it is short of the full training set.
n_train = len(train)
energy_cols = ['lyrics', 'energy', 'energy level']

(train10_energy_aug, train20_energy_aug, train30_energy_aug,
 train40_energy_aug, train50_energy_aug, train60_energy_aug,
 train70_energy_aug, train80_energy_aug, train90_energy_aug) = [
    MIXUP_augment_df(subset[energy_cols], sc='energy', scl='energy level',
                     samples=(n_train - len(subset)))
    for subset in (train10, train20, train30, train40, train50,
                   train60, train70, train80, train90)
]

# The full training set is kept without synthetic rows.
train_energy_aug = train[energy_cols]

100%|██████████| 5101/5101 [00:03<00:00, 1460.75it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1600.74it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1621.79it/s]
100%|██████████| 5101/5101 [00:04<00:00, 1077.58it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1183.01it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1162.68it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1300.39it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1212.90it/s]
100%|██████████| 3968/3968 [00:04<00:00, 953.19it/s] 
100%|██████████| 3968/3968 [00:02<00:00, 1478.30it/s]
100%|██████████| 3968/3968 [00:02<00:00, 1368.18it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1317.32it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1388.10it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1363.89it/s]
100%|██████████| 3401/3401 [00:03<00:00, 1126.48it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1450.60it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1337.28it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1140.64it/s]
100%|██████████| 2834/2834 [

In [7]:
# Save train augmented samples dfs, lambda=0.5
# The context manager flushes and closes the file even if pickling raises.
with open('train_energy_samples_aug.pickle', 'wb') as f_train_energy_samples_aug:
    pickle.dump(
        [train10_energy_aug, train20_energy_aug, train30_energy_aug,
         train40_energy_aug, train50_energy_aug, train60_energy_aug,
         train70_energy_aug, train80_energy_aug, train90_energy_aug,
         train_energy_aug],
        f_train_energy_samples_aug,
    )

#### MIXUP (λ=0.25)

In [8]:
# Energy, mixing weight pr=0.25: same procedure for each of the nine subsamples.
n_train = len(train)
energy_cols = ['lyrics', 'energy', 'energy level']

(train10_energy_aug2, train20_energy_aug2, train30_energy_aug2,
 train40_energy_aug2, train50_energy_aug2, train60_energy_aug2,
 train70_energy_aug2, train80_energy_aug2, train90_energy_aug2) = [
    MIXUP_augment_df(subset[energy_cols], sc='energy', scl='energy level',
                     pr=0.25, samples=(n_train - len(subset)))
    for subset in (train10, train20, train30, train40, train50,
                   train60, train70, train80, train90)
]

100%|██████████| 5101/5101 [00:04<00:00, 1037.48it/s]
100%|██████████| 5101/5101 [00:04<00:00, 1227.77it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1464.47it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1544.03it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1401.70it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1362.75it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1581.86it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1592.45it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1313.82it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1216.23it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1272.05it/s]
100%|██████████| 3968/3968 [00:02<00:00, 1455.02it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1386.44it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1306.68it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1362.36it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1366.83it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1265.94it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1043.30it/s]
100%|██████████| 2834/2834 [

In [9]:
# Save train augmented samples dfs, lambda=0.25
# (the last item is the un-augmented full training set, `train_energy_aug`)
# The context manager flushes and closes the file even if pickling raises.
with open('train_energy_samples_aug_2.pickle', 'wb') as f_train_energy_samples_aug2:
    pickle.dump(
        [train10_energy_aug2, train20_energy_aug2, train30_energy_aug2,
         train40_energy_aug2, train50_energy_aug2, train60_energy_aug2,
         train70_energy_aug2, train80_energy_aug2, train90_energy_aug2,
         train_energy_aug],
        f_train_energy_samples_aug2,
    )

#### MIXUP for different λ values (with 20%, 50% and 80% of training set)

In [10]:
# Energy, sweep over mixing weights for the 20%, 50% and 80% subsamples.
# Suffix _l0 .. _l5 corresponds to pr = 0.05, 0.1, 0.2, 0.3, 0.4, 0.5.
n_train = len(train)
energy_cols = ['lyrics', 'energy', 'energy level']
lambdas = (0.05, 0.1, 0.2, 0.3, 0.4, 0.5)

(train20_energy_aug_l0, train20_energy_aug_l1, train20_energy_aug_l2,
 train20_energy_aug_l3, train20_energy_aug_l4, train20_energy_aug_l5) = [
    MIXUP_augment_df(train20[energy_cols], sc='energy', scl='energy level',
                     pr=lam, samples=(n_train - len(train20)))
    for lam in lambdas
]

(train50_energy_aug_l0, train50_energy_aug_l1, train50_energy_aug_l2,
 train50_energy_aug_l3, train50_energy_aug_l4, train50_energy_aug_l5) = [
    MIXUP_augment_df(train50[energy_cols], sc='energy', scl='energy level',
                     pr=lam, samples=(n_train - len(train50)))
    for lam in lambdas
]

(train80_energy_aug_l0, train80_energy_aug_l1, train80_energy_aug_l2,
 train80_energy_aug_l3, train80_energy_aug_l4, train80_energy_aug_l5) = [
    MIXUP_augment_df(train80[energy_cols], sc='energy', scl='energy level',
                     pr=lam, samples=(n_train - len(train80)))
    for lam in lambdas
]


100%|██████████| 4534/4534 [00:03<00:00, 1353.41it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1350.15it/s]
100%|██████████| 4534/4534 [00:04<00:00, 1051.31it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1152.62it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1211.08it/s]
100%|██████████| 4534/4534 [00:04<00:00, 984.15it/s] 
100%|██████████| 4534/4534 [00:03<00:00, 1166.13it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1349.70it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1224.80it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1257.03it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1290.79it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1303.29it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1259.14it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1201.20it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1292.22it/s]
100%|██████████| 4534/4534 [00:04<00:00, 1043.96it/s]
100%|██████████| 4534/4534 [00:04<00:00, 987.78it/s] 
100%|██████████| 4534/4534 [00:04<00:00, 1026.19it/s]
100%|██████████| 4534/4534 [

In [11]:
# Save train augmented samples dfs, different lambdas
# Order: train20 (l0..l5), train50 (l0..l5), train80 (l0..l5).
# The context manager flushes and closes the file even if pickling raises.
with open('train_energy_samples_aug_l.pickle', 'wb') as f_train_energy_samples_aug_l:
    pickle.dump(
        [train20_energy_aug_l0, train20_energy_aug_l1, train20_energy_aug_l2,
         train20_energy_aug_l3, train20_energy_aug_l4, train20_energy_aug_l5,
         train50_energy_aug_l0, train50_energy_aug_l1, train50_energy_aug_l2,
         train50_energy_aug_l3, train50_energy_aug_l4, train50_energy_aug_l5,
         train80_energy_aug_l0, train80_energy_aug_l1, train80_energy_aug_l2,
         train80_energy_aug_l3, train80_energy_aug_l4, train80_energy_aug_l5],
        f_train_energy_samples_aug_l,
    )

### Valence

#### MIXUP (λ=0.5)

In [12]:
# Valence, default mixing weight (pr=0.5): for every training subsample request
# as many synthetic rows as it is short of the full training set.
n_train = len(train)
valence_cols = ['lyrics', 'valence', 'valence level']

(train10_valence_aug, train20_valence_aug, train30_valence_aug,
 train40_valence_aug, train50_valence_aug, train60_valence_aug,
 train70_valence_aug, train80_valence_aug, train90_valence_aug) = [
    MIXUP_augment_df(subset[valence_cols], sc='valence', scl='valence level',
                     samples=(n_train - len(subset)))
    for subset in (train10, train20, train30, train40, train50,
                   train60, train70, train80, train90)
]

# The full training set is kept without synthetic rows.
train_valence_aug = train[valence_cols]

100%|██████████| 5101/5101 [00:03<00:00, 1468.91it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1443.32it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1434.14it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1411.91it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1275.46it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1408.59it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1186.26it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1332.55it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1306.43it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1232.36it/s]
100%|██████████| 3968/3968 [00:04<00:00, 984.87it/s] 
100%|██████████| 3968/3968 [00:02<00:00, 1345.02it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1356.46it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1312.35it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1251.94it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1554.81it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1274.14it/s]
100%|██████████| 2834/2834 [00:01<00:00, 1455.46it/s]
100%|██████████| 2834/2834 [

In [13]:
# Save train augmented samples dfs, lambda=0.5
# The context manager flushes and closes the file even if pickling raises.
with open('train_valence_samples_aug.pickle', 'wb') as f_train_valence_samples_aug:
    pickle.dump(
        [train10_valence_aug, train20_valence_aug, train30_valence_aug,
         train40_valence_aug, train50_valence_aug, train60_valence_aug,
         train70_valence_aug, train80_valence_aug, train90_valence_aug,
         train_valence_aug],
        f_train_valence_samples_aug,
    )

#### MIXUP (λ=0.25)

In [14]:
# Valence, mixing weight pr=0.25: same procedure for each of the nine subsamples.
n_train = len(train)
valence_cols = ['lyrics', 'valence', 'valence level']

(train10_valence_aug2, train20_valence_aug2, train30_valence_aug2,
 train40_valence_aug2, train50_valence_aug2, train60_valence_aug2,
 train70_valence_aug2, train80_valence_aug2, train90_valence_aug2) = [
    MIXUP_augment_df(subset[valence_cols], sc='valence', scl='valence level',
                     pr=0.25, samples=(n_train - len(subset)))
    for subset in (train10, train20, train30, train40, train50,
                   train60, train70, train80, train90)
]

100%|██████████| 5101/5101 [00:03<00:00, 1413.59it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1347.84it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1378.50it/s]
100%|██████████| 5101/5101 [00:03<00:00, 1307.56it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1339.28it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1225.76it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1354.46it/s]
100%|██████████| 4534/4534 [00:04<00:00, 1070.55it/s]
100%|██████████| 3968/3968 [00:02<00:00, 1324.71it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1313.08it/s]
100%|██████████| 3968/3968 [00:03<00:00, 1236.53it/s]
100%|██████████| 3968/3968 [00:02<00:00, 1422.82it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1220.49it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1345.54it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1261.61it/s]
100%|██████████| 3401/3401 [00:02<00:00, 1251.51it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1158.30it/s]
100%|██████████| 2834/2834 [00:02<00:00, 1354.97it/s]
100%|██████████| 2834/2834 [

In [15]:
# Save train augmented samples dfs, lambda=0.25
# (the last item is the un-augmented full training set, `train_valence_aug`)
# The context manager flushes and closes the file even if pickling raises.
with open('train_valence_samples_aug_2.pickle', 'wb') as f_train_valence_samples_aug2:
    pickle.dump(
        [train10_valence_aug2, train20_valence_aug2, train30_valence_aug2,
         train40_valence_aug2, train50_valence_aug2, train60_valence_aug2,
         train70_valence_aug2, train80_valence_aug2, train90_valence_aug2,
         train_valence_aug],
        f_train_valence_samples_aug2,
    )

#### MIXUP for different λ values (with 20%, 50% and 80% of training set)

In [16]:
# Valence, sweep over mixing weights for the 20%, 50% and 80% subsamples.
# Suffix _l0 .. _l5 corresponds to pr = 0.05, 0.1, 0.2, 0.3, 0.4, 0.5.
n_train = len(train)
valence_cols = ['lyrics', 'valence', 'valence level']
lambdas = (0.05, 0.1, 0.2, 0.3, 0.4, 0.5)

(train20_valence_aug_l0, train20_valence_aug_l1, train20_valence_aug_l2,
 train20_valence_aug_l3, train20_valence_aug_l4, train20_valence_aug_l5) = [
    MIXUP_augment_df(train20[valence_cols], sc='valence', scl='valence level',
                     pr=lam, samples=(n_train - len(train20)))
    for lam in lambdas
]

(train50_valence_aug_l0, train50_valence_aug_l1, train50_valence_aug_l2,
 train50_valence_aug_l3, train50_valence_aug_l4, train50_valence_aug_l5) = [
    MIXUP_augment_df(train50[valence_cols], sc='valence', scl='valence level',
                     pr=lam, samples=(n_train - len(train50)))
    for lam in lambdas
]

(train80_valence_aug_l0, train80_valence_aug_l1, train80_valence_aug_l2,
 train80_valence_aug_l3, train80_valence_aug_l4, train80_valence_aug_l5) = [
    MIXUP_augment_df(train80[valence_cols], sc='valence', scl='valence level',
                     pr=lam, samples=(n_train - len(train80)))
    for lam in lambdas
]


100%|██████████| 4534/4534 [00:03<00:00, 1294.41it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1345.53it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1384.78it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1303.77it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1236.73it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1335.58it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1338.11it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1322.17it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1369.99it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1246.45it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1278.84it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1363.88it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1285.42it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1182.58it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1507.54it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1405.63it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1245.07it/s]
100%|██████████| 4534/4534 [00:03<00:00, 1256.41it/s]
100%|██████████| 4534/4534 [

In [17]:
# Save train augmented samples dfs, different lambdas
# Order: train20 (l0..l5), train50 (l0..l5), train80 (l0..l5).
# The context manager flushes and closes the file even if pickling raises.
with open('train_valence_samples_aug_l.pickle', 'wb') as f_train_valence_samples_aug_l:
    pickle.dump(
        [train20_valence_aug_l0, train20_valence_aug_l1, train20_valence_aug_l2,
         train20_valence_aug_l3, train20_valence_aug_l4, train20_valence_aug_l5,
         train50_valence_aug_l0, train50_valence_aug_l1, train50_valence_aug_l2,
         train50_valence_aug_l3, train50_valence_aug_l4, train50_valence_aug_l5,
         train80_valence_aug_l0, train80_valence_aug_l1, train80_valence_aug_l2,
         train80_valence_aug_l3, train80_valence_aug_l4, train80_valence_aug_l5],
        f_train_valence_samples_aug_l,
    )