# Data Augmentation with Noise Injection

## Libraries

In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import math
from textblob import TextBlob as tb
import nltk
#nltk.download('punkt')
from tqdm import tqdm
import pickle # for loading data

## Data loading

In [2]:
# Load the full dataset frame.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# that were produced by this project.
# `with` guarantees the handle is closed even if unpickling raises.
with open('df.pickle', 'rb') as f_df:
    df = pickle.load(f_df)

In [3]:
# Load the (train, test) pair stored in a single pickle.
# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# that were produced by this project.
# `with` guarantees the handle is closed even if unpickling raises.
with open('train_test.pickle', 'rb') as f_train_test:
    (train, test) = pickle.load(f_train_test)

In [4]:
# Load the nine training subsets stored together in one pickle
# (presumably 10%..90% samples of the training set, going by their names).
# NOTE(review): pickle.load can execute arbitrary code; only load pickles
# that were produced by this project.
# `with` guarantees the handle is closed even if unpickling raises.
with open('train_samples.pickle', 'rb') as f_train_samples:
    (train10, train20, train30, train40, train50,
     train60, train70, train80, train90) = pickle.load(f_train_samples)

## Data Augmentation 

In [5]:
def TopNWords(df,scl,n=10):
    """Return, for each level "H", "MH", "ML", "L", the n highest-scoring words.

    A TF-IDF vocabulary (min_df=3, at most 5000 terms, English stop words
    removed) is fitted on ``df['lyrics']``. A chi-squared filter then keeps
    the (at most) 1000 terms most associated with the labels in ``df[scl]``.
    For every level, the TF-IDF weights of the kept terms are summed over the
    documents of that level and the n terms with the largest sums are
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'lyrics' text column and the label column ``scl``.
    scl : str
        Name of the column holding the level labels.
    n : int, default 10
        Number of words to return per level.

    Returns
    -------
    list of list
        Four lists of words, ordered as ["H", "MH", "ML", "L"]. A level with
        no documents yields an empty list.
    """
    levels = ["H", "MH", "ML", "L"]

    tfidf = TfidfVectorizer(min_df=3,
                            max_features=5000,
                            stop_words='english')

    corpus_tfidf = tfidf.fit_transform(df['lyrics'].values)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(tfidf, "get_feature_names_out"):
        vocabulary = np.asarray(tfidf.get_feature_names_out())
    else:
        vocabulary = np.asarray(tfidf.get_feature_names())

    # Never ask for more features than the vocabulary actually holds.
    n_selected = min(1000, corpus_tfidf.shape[1])
    ch2 = SelectKBest(chi2, k=n_selected)
    # Pass the labels as a 1-D array (a single-column 2-D array triggers a
    # conversion warning in scikit-learn).
    ch2.fit(corpus_tfidf, df[scl].values)

    # ch2.transform() keeps only the selected columns, so column j of the
    # reduced matrix corresponds to the j-th *selected* term, not to the j-th
    # term of the full vocabulary. Build the matching name array once.
    selected_words = vocabulary[ch2.get_support()]

    l_topN_words = []

    for st in tqdm(levels):

        level_lyrics = df.loc[df[scl] == st, 'lyrics'].values

        if len(level_lyrics) == 0:
            # No documents for this level: there is nothing to rank.
            l_topN_words.append([])
            continue

        # Stay sparse: the reduced matrix is only summed column-wise.
        level_selected = ch2.transform(tfidf.transform(level_lyrics))

        column_totals = np.asarray(level_selected.sum(axis=0)).ravel()
        ranking = np.argsort(column_totals)[::-1]

        l_topN_words.append(list(selected_words[ranking[:n]]))

    return l_topN_words

In [6]:
def NOISE_augment_df(df,sc,scl,w_l,n=5,samples=100,seed=79068588):
    """Append noise-injected copies of lyrics to ``df`` and shuffle the result.

    For each level in ["H", "MH", "ML", "L"], rows of that level are drawn
    with replacement. Each drawn lyric is split on whitespace, ``n`` words
    sampled (without replacement) from that level's word list are added, the
    tokens are shuffled, and the result is stored as a new row carrying the
    source row's score and level.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'lyrics', ``sc`` and ``scl``. Not modified.
    sc : str
        Name of the score column copied onto each synthetic row.
    scl : str
        Name of the level column used to split ``df`` and to label the
        synthetic rows.
    w_l : sequence of sequences
        Word lists indexed in the same order as the levels
        (``w_l[0]`` for "H", ..., ``w_l[3]`` for "L").
    n : int, default 5
        Number of words injected into every synthetic lyric.
    samples : int, default 100
        Total number of synthetic rows requested. Each of the first three
        levels receives ``samples // 4`` rows and the last level receives the
        remainder. Levels with no rows in ``df`` are skipped, so fewer rows
        are produced in that case.
    seed : int, default 79068588
        Seed for the private random generators, making the output
        reproducible without touching the global RNG state.

    Returns
    -------
    pandas.DataFrame
        The original rows plus the synthetic rows, shuffled, with a fresh
        0..N-1 index.

    Raises
    ------
    ValueError
        If ``n`` exceeds the size of a used word list (from ``random.sample``).
    """
    levels = ["H", "MH", "ML", "L"]

    # Private generators, seeded once: the draws are reproducible, and the
    # injected words differ from one synthetic row to the next.
    py_rng = random.Random(seed)
    np_rng = np.random.RandomState(seed)

    base_quota = samples // len(levels)
    last_position = len(levels) - 1

    frames = [df]

    for position, st in enumerate(levels):

        df_n = df[df[scl] == st].reset_index(drop=True)

        # The last level absorbs the remainder so the quotas sum to `samples`.
        if position == last_position:
            n_samples = samples - base_quota * last_position
        else:
            n_samples = base_quota

        # Nothing to draw, or nothing to draw from.
        if n_samples <= 0 or len(df_n) == 0:
            continue

        # Word list belonging to *this* level.
        level_words = list(w_l[position])

        new_text = []
        sc_list = []

        ## data augmentation loop
        for i in tqdm(np_rng.randint(0, len(df_n), n_samples)):
            source_row = df_n.iloc[i]

            tokens = source_row['lyrics'].split()
            injected = py_rng.sample(level_words, n)

            augmented_tokens = tokens + injected
            py_rng.shuffle(augmented_tokens)

            new_text.append(' '.join(map(str, augmented_tokens)))
            sc_list.append(source_row[sc])

        ## dataframe
        frames.append(pd.DataFrame({'lyrics': new_text, scl: st, sc: sc_list}))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    augmented = pd.concat(frames, ignore_index=True)

    return augmented.sample(frac=1, random_state=np_rng).reset_index(drop=True)

### Noise injection (n=5)

In [7]:
# Top 20 words per level for each target, computed on the training split.
energy_top_words, valence_top_words = [
    TopNWords(train, scl=level_column, n=20)
    for level_column in ('energy level', 'valence level')
]

100%|██████████| 4/4 [00:03<00:00,  1.29it/s]
100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


In [8]:
n_train = len(train)
energy_cols = ['lyrics', 'energy', 'energy level']

# For every training subset, request n_train - len(subset) synthetic rows
# (default n=5 injected words). Calls run in the order train10 .. train90.
(train10_energy_aug2, train20_energy_aug2, train30_energy_aug2,
 train40_energy_aug2, train50_energy_aug2, train60_energy_aug2,
 train70_energy_aug2, train80_energy_aug2, train90_energy_aug2) = [
    NOISE_augment_df(subset[energy_cols],
                     sc='energy',
                     scl='energy level',
                     w_l=energy_top_words,
                     samples=(n_train - len(subset)))
    for subset in (train10, train20, train30, train40, train50,
                   train60, train70, train80, train90)
]

# The full training set is kept as-is, restricted to the same columns.
train_energy_aug2 = train[energy_cols]

100%|██████████| 5101/5101 [00:01<00:00, 2664.39it/s]
100%|██████████| 5101/5101 [00:02<00:00, 2276.89it/s]
100%|██████████| 5101/5101 [00:01<00:00, 2559.57it/s]
100%|██████████| 5101/5101 [00:01<00:00, 2553.46it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2441.92it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2655.82it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2610.03it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2733.06it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2599.91it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2302.76it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2688.99it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2734.85it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2329.24it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2521.15it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2610.57it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2759.54it/s]
100%|██████████| 2834/2834 [00:01<00:00, 2470.73it/s]
100%|██████████| 2834/2834 [00:01<00:00, 2537.11it/s]
100%|██████████| 2834/2834 [

In [9]:
# Save train augmented samples dfs, n=5
# `with` guarantees the file is flushed and closed even if pickling raises.
with open('train_energy_samples_aug2.pickle', 'wb') as f_train_energy_samples_aug2:
    pickle.dump([train10_energy_aug2,train20_energy_aug2,train30_energy_aug2,train40_energy_aug2,train50_energy_aug2,train60_energy_aug2,train70_energy_aug2,train80_energy_aug2,train90_energy_aug2,train_energy_aug2], f_train_energy_samples_aug2)

In [10]:
n_train = len(train)
valence_cols = ['lyrics', 'valence', 'valence level']

# For every training subset, request n_train - len(subset) synthetic rows
# (default n=5 injected words). Calls run in the order train10 .. train90.
(train10_valence_aug2, train20_valence_aug2, train30_valence_aug2,
 train40_valence_aug2, train50_valence_aug2, train60_valence_aug2,
 train70_valence_aug2, train80_valence_aug2, train90_valence_aug2) = [
    NOISE_augment_df(subset[valence_cols],
                     sc='valence',
                     scl='valence level',
                     w_l=valence_top_words,
                     samples=(n_train - len(subset)))
    for subset in (train10, train20, train30, train40, train50,
                   train60, train70, train80, train90)
]

# The full training set is kept as-is, restricted to the same columns.
train_valence_aug2 = train[valence_cols]

100%|██████████| 5101/5101 [00:01<00:00, 2685.10it/s]
100%|██████████| 5101/5101 [00:01<00:00, 2796.24it/s]
100%|██████████| 5101/5101 [00:02<00:00, 2255.32it/s]
100%|██████████| 5101/5101 [00:01<00:00, 2565.91it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1935.64it/s]
100%|██████████| 4534/4534 [00:02<00:00, 2045.73it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2358.53it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1921.22it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2618.43it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2316.87it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2584.24it/s]
100%|██████████| 3968/3968 [00:01<00:00, 2836.47it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2813.11it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2717.21it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2553.61it/s]
100%|██████████| 3401/3401 [00:01<00:00, 2324.32it/s]
100%|██████████| 2834/2834 [00:01<00:00, 2053.47it/s]
100%|██████████| 2834/2834 [00:01<00:00, 2345.87it/s]
100%|██████████| 2834/2834 [

In [11]:
# Save train augmented samples dfs, n=5
# `with` guarantees the file is flushed and closed even if pickling raises.
with open('train_valence_samples_aug2.pickle', 'wb') as f_train_valence_samples_aug2:
    pickle.dump([train10_valence_aug2,train20_valence_aug2,train30_valence_aug2,train40_valence_aug2,train50_valence_aug2,train60_valence_aug2,train70_valence_aug2,train80_valence_aug2,train90_valence_aug2,train_valence_aug2], f_train_valence_samples_aug2)

### Noise injection for different n values (with 20%, 50% and 80% of training set)

#### Energy

In [12]:
# Recompute the per-level word lists with 50 words each, so that up to
# n=20 words can be sampled from them in the cells below.
energy_top_words, valence_top_words = [
    TopNWords(train, scl=level_column, n=50)
    for level_column in ('energy level', 'valence level')
]

100%|██████████| 4/4 [00:03<00:00,  1.27it/s]
100%|██████████| 4/4 [00:02<00:00,  1.45it/s]


In [13]:
n_train = len(train)
energy_cols = ['lyrics', 'energy', 'energy level']

# Sweep the number of injected words for the train20, train50 and train80
# subsets. The subset varies in the outer loop and n in the inner loop, which
# matches the order of the names on the left-hand side.
(train20_energy_aug_n1, train20_energy_aug_n2, train20_energy_aug_n3,
 train20_energy_aug_n4, train20_energy_aug_n5,
 train50_energy_aug_n1, train50_energy_aug_n2, train50_energy_aug_n3,
 train50_energy_aug_n4, train50_energy_aug_n5,
 train80_energy_aug_n1, train80_energy_aug_n2, train80_energy_aug_n3,
 train80_energy_aug_n4, train80_energy_aug_n5) = [
    NOISE_augment_df(subset[energy_cols],
                     sc='energy',
                     scl='energy level',
                     w_l=energy_top_words,
                     n=n_words,
                     samples=(n_train - len(subset)))
    for subset in (train20, train50, train80)
    for n_words in (1, 5, 10, 15, 20)
]


100%|██████████| 4534/4534 [00:01<00:00, 2638.66it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2520.09it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2828.37it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2915.58it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2352.36it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2600.49it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2615.00it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2974.75it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2750.56it/s]
100%|██████████| 4534/4534 [00:02<00:00, 2103.43it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1707.49it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2355.17it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2343.27it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2554.80it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2615.69it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2532.39it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2504.04it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2499.34it/s]
100%|██████████| 4534/4534 [

In [14]:
# Save train augmented samples dfs, different n values
# `with` guarantees the file is flushed and closed even if pickling raises.
with open('train_energy_samples_aug_n.pickle', 'wb') as f_train_energy_samples_aug_n:
    pickle.dump([train20_energy_aug_n1,train20_energy_aug_n2,train20_energy_aug_n3,train20_energy_aug_n4,train20_energy_aug_n5,train50_energy_aug_n1,train50_energy_aug_n2,train50_energy_aug_n3,train50_energy_aug_n4,train50_energy_aug_n5,train80_energy_aug_n1,train80_energy_aug_n2,train80_energy_aug_n3,train80_energy_aug_n4,train80_energy_aug_n5], f_train_energy_samples_aug_n)

#### Valence

In [15]:
n_train = len(train)
valence_cols = ['lyrics', 'valence', 'valence level']

# Sweep the number of injected words for the train20, train50 and train80
# subsets. The subset varies in the outer loop and n in the inner loop, which
# matches the order of the names on the left-hand side.
(train20_valence_aug_n1, train20_valence_aug_n2, train20_valence_aug_n3,
 train20_valence_aug_n4, train20_valence_aug_n5,
 train50_valence_aug_n1, train50_valence_aug_n2, train50_valence_aug_n3,
 train50_valence_aug_n4, train50_valence_aug_n5,
 train80_valence_aug_n1, train80_valence_aug_n2, train80_valence_aug_n3,
 train80_valence_aug_n4, train80_valence_aug_n5) = [
    NOISE_augment_df(subset[valence_cols],
                     sc='valence',
                     scl='valence level',
                     w_l=valence_top_words,
                     n=n_words,
                     samples=(n_train - len(subset)))
    for subset in (train20, train50, train80)
    for n_words in (1, 5, 10, 15, 20)
]


100%|██████████| 4534/4534 [00:01<00:00, 2846.05it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1961.40it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2294.65it/s]
100%|██████████| 4534/4534 [00:02<00:00, 2168.01it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2704.59it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2739.94it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2799.80it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2840.94it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2459.09it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2634.55it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2663.69it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2716.01it/s]
100%|██████████| 4534/4534 [00:02<00:00, 2136.84it/s]
100%|██████████| 4534/4534 [00:02<00:00, 1993.69it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2526.70it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2506.34it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2501.34it/s]
100%|██████████| 4534/4534 [00:01<00:00, 2513.08it/s]
100%|██████████| 4534/4534 [

In [16]:
# Save train augmented samples dfs, different n values
# `with` guarantees the file is flushed and closed even if pickling raises.
with open('train_valence_samples_aug_n.pickle', 'wb') as f_train_valence_samples_aug_n:
    pickle.dump([train20_valence_aug_n1,train20_valence_aug_n2,train20_valence_aug_n3,train20_valence_aug_n4,train20_valence_aug_n5,train50_valence_aug_n1,train50_valence_aug_n2,train50_valence_aug_n3,train50_valence_aug_n4,train50_valence_aug_n5,train80_valence_aug_n1,train80_valence_aug_n2,train80_valence_aug_n3,train80_valence_aug_n4,train80_valence_aug_n5], f_train_valence_samples_aug_n)