In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
random.seed(24)
np.random.seed(32)

In [3]:
from ast import literal_eval

In [6]:
tcc = pd.read_csv("data/MUSIC4ALL/processed/tcc_music4all.csv", index_col=0)

In [7]:
tcc.genres = tcc.genres.apply(literal_eval)

In [8]:
tcc.columns

Index(['lyrics', 'artist', 'song', 'album_name', 'genres', 'spotify_id',
       'popularity', 'danceability', 'energy', 'key', 'mode', 'valence',
       'tempo', 'duration_ms', 'acousticness', 'instrumentalness', 'liveness',
       'loudness', 'speechiness', 'time_signature', 'explicit', 'nb_genres',
       'year'],
      dtype='object')

In [9]:
tcc = tcc.drop(columns=["spotify_id", "instrumentalness", "genres"])

In [10]:
tcc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6513 entries, 35983 to 41006
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   lyrics          6513 non-null   object 
 1   artist          6513 non-null   object 
 2   song            6513 non-null   object 
 3   album_name      6513 non-null   object 
 4   popularity      6513 non-null   int64  
 5   danceability    6513 non-null   float64
 6   energy          6513 non-null   float64
 7   key             6513 non-null   int64  
 8   mode            6513 non-null   int64  
 9   valence         6513 non-null   float64
 10  tempo           6513 non-null   float64
 11  duration_ms     6513 non-null   int64  
 12  acousticness    6513 non-null   float64
 13  liveness        6513 non-null   float64
 14  loudness        6513 non-null   float64
 15  speechiness     6513 non-null   float64
 16  time_signature  6513 non-null   int64  
 17  explicit        6513 non-nul

In [11]:
tcc = tcc.sort_values("year")

# OoT Separation

In [12]:
tcc.groupby("explicit").size()/len(tcc)

explicit
0    0.970674
1    0.029326
dtype: float64

In [13]:
tcc.year.unique()

array([1957, 1958, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [14]:
for y in range(2012, 2021):
    print(y)
    filtered = tcc[tcc.year >= y]
    print(filtered.groupby("explicit").size()/len(filtered))
    print(len(filtered))
    print(len(filtered)/len(tcc))
    print("--------")

2012
explicit
0    0.944199
1    0.055801
dtype: float64
3172
0.4870259481037924
--------
2013
explicit
0    0.938489
1    0.061511
dtype: float64
2845
0.4368186703516045
--------
2014
explicit
0    0.931529
1    0.068471
dtype: float64
2512
0.385690158145248
--------
2015
explicit
0    0.923621
1    0.076379
dtype: float64
2121
0.3256563795485951
--------
2016
explicit
0    0.910329
1    0.089671
dtype: float64
1762
0.27053585137417474
--------
2017
explicit
0    0.897866
1    0.102134
dtype: float64
1312
0.20144326731153078
--------
2018
explicit
0    0.899865
1    0.100135
dtype: float64
739
0.11346537693843083
--------
2019
explicit
0    0.878947
1    0.121053
dtype: float64
190
0.02917242438200522
--------
2020
explicit
0    1.0
dtype: float64
14
0.0021495470597267005
--------


In [15]:
chosen_year = 2018

In [16]:
oot_testing_set = tcc[tcc.year >= chosen_year].drop(columns="year")

In [17]:
print(oot_testing_set.groupby("explicit").size()/len(oot_testing_set))
print(len(oot_testing_set))
print(len(oot_testing_set)/len(tcc))

explicit
0    0.899865
1    0.100135
dtype: float64
739
0.11346537693843083


In [18]:
oot_dev_set = tcc[tcc.year < chosen_year].drop(columns="year")

In [19]:
print(oot_dev_set.groupby("explicit").size()/len(oot_dev_set))
print(len(oot_dev_set))
print(len(oot_dev_set)/len(tcc))

explicit
0    0.979737
1    0.020263
dtype: float64
5774
0.8865346230615692


In [20]:
oot_dev_set.to_csv("data/dataset/oot_dev_set.csv", index=True)

In [21]:
oot_testing_set.to_csv("data/dataset/oot_test_set.csv", index=True)

# OoS Separation

In [22]:
len(tcc[tcc.explicit==1].artist.unique())

101

In [23]:
pickable = tcc.copy(True)

In [24]:
oos_test_set_idx = set()
explicit = 0

In [25]:
while len(pickable) > 0:
    if len(oos_test_set_idx) >= 700:
        break
    
    if explicit == 0 or explicit/len(oos_test_set_idx) <= 0.03:
        art = random.choice(pickable[pickable.explicit==1].artist.unique())
        explicit += len(pickable[(pickable.explicit==1) & (pickable.artist==art)])
    else:
        art = random.choice(pickable[pickable.explicit==0].artist.unique())
    
    idxs = pickable[pickable.artist==art].index
    pickable = pickable.drop(index=idxs)
    oos_test_set_idx.update(idxs)

In [26]:
oos_test_set = tcc[tcc.index.isin(oos_test_set_idx)]

In [27]:
oos_dev_set = tcc[~tcc.index.isin(oos_test_set_idx)]

In [30]:
oos_dev_set.to_csv("data/dataset/oos_dev_set.csv", index=True)

In [4]:
oos_dev_set = pd.read_csv("data/dataset/oos_dev_set.csv", index_col=0)

In [8]:
print(oos_dev_set.groupby("explicit").size()/len(oos_dev_set))
print(oos_dev_set.groupby("explicit").size())
print(len(oos_dev_set))
print(len(oos_dev_set)/len(tcc))

explicit
0    0.972299
1    0.027701
dtype: float64
explicit
0    5651
1     161
dtype: int64
5812
0.8923691079379702


In [31]:
oos_test_set.to_csv("data/dataset/oos_test_set.csv", index=True)

In [11]:
oos_test_set = pd.read_csv("data/dataset/oos_test_set.csv", index_col=0)

In [12]:
print(oos_test_set.groupby("explicit").size()/len(oos_dev_set))
print(oos_test_set.groupby("explicit").size())
print(len(oos_test_set))
print(len(oos_test_set)/len(tcc))

explicit
0    0.957204
1    0.042796
dtype: float64
explicit
0    671
1     30
dtype: int64
701
0.10763089206202979
