In [1]:
import pandas as pd

# Raw Data

In [2]:
# All four raw MUSIC4ALL tables are read with a tab separator, so the same
# reader call is applied to each path in turn (in the original load order).
langs, information, genres, metadata = (
    pd.read_csv(raw_csv_path, sep='\t')
    for raw_csv_path in (
        "data/MUSIC4ALL/raw/id_lang.csv",
        "data/MUSIC4ALL/raw/id_information.csv",
        "data/MUSIC4ALL/raw/id_genres.csv",
        "data/MUSIC4ALL/raw/id_metadata.csv",
    )
)

In [3]:
# Metadata tables that are joined onto the lyrics table, listed in join order.
metadata_list = [
    langs,
    information,
    genres,
    metadata,
]

# Processed Data

In [4]:
# Load the processed lyrics table, using the first CSV column as the row index.
lyrics = pd.read_csv(
    "data/MUSIC4ALL/processed/id_lyrics.csv",
    index_col=0,
)

# Merge Data

In [5]:
# Start from a copy so the result never aliases the loaded `lyrics` frame.
merged_data = lyrics.copy()

# Fold each metadata table into the running result, matching rows on `id`.
# pd.merge defaults to an inner join, so only ids present in every table remain.
for metadata_df in metadata_list:
    merged_data = pd.merge(merged_data, metadata_df, on='id')

In [6]:
# Preview the first five merged rows.
merged_data.head(n=5)

Unnamed: 0,id,lyrics,lang,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
0,VWpSLG9I1Zp0NifI,"Tonight feels like we can do anything we like,...",en,Jennifer Lopez,Goin' In,Goin' In,pop,3FcuEYORBT54KSVDRhr4zC,48.0,2012,0.674,0.852,6.0,1.0,0.342,129.966,247880
1,pY1EjdIGhkZwlxje,Medicated to the point of apathy\nEach day ano...,en,Framing Hanley,Puzzle Pieces,Puzzle Pieces,"alternative rock,rock,post-hardcore",1fa5xpAkZOkOhEE4jJfbFk,56.0,2018,0.508,0.749,2.0,1.0,0.245,136.135,220387
2,8EGkyPDVZgJPOS5J,INSTRUMENTAL,INTRUMENTAL,Tim Hecker,In the Fog I,"Ravedeath, 1972","ambient,drone,electronic,experimental",59X2sWl3SBtUgkMEE1RlEx,25.0,2011,0.151,0.506,5.0,0.0,0.0726,105.305,292160
3,rHJ2S6bULIjzHN1O,If you wake up and don't want to smile\nIf it ...,en,Fleetwood Mac,Don't Stop,Rumours (Super Deluxe),"classic rock,rock,pop",3CPeWqqaHR0hmyfsWhMJQs,63.0,1977,0.672,0.72,9.0,1.0,0.828,118.731,193347
4,XQpW1rpTeT3bQWSd,If I was beautiful\nIf I had the time\nThey'd ...,en,Moby,Signs of Love,18 & 18 B-Sides,"electronic,ambient,electronica",3Mk4vWIMEBUcpVLR77NJtY,24.0,2002,0.412,0.537,2.0,1.0,0.0962,91.412,266200


# Filtering Out Instrumental Songs

In [22]:
# Keep every row whose lyrics text is not the placeholder "instrumental";
# case and surrounding whitespace are normalized before the comparison.
filtered_data_by_lyrics = merged_data.loc[
    merged_data['lyrics'].str.lower().str.strip().ne('instrumental')
]

In [23]:
# Same idea, but keyed on the language column. The literal is deliberately
# spelled without the first "s": the merged_data preview shows the marker
# as "INTRUMENTAL" in the `lang` column.
filtered_data_by_lang = merged_data.loc[
    merged_data['lang'].str.lower().str.strip().ne('intrumental')
]

In [24]:
# Do the two filters keep the same number of rows?
filtered_data_by_lyrics.shape[0] == filtered_data_by_lang.shape[0]

False

In [25]:
# Number of rows kept by the lyrics-based filter.
filtered_data_by_lyrics.shape[0]

99850

In [26]:
# Number of rows kept by the lang-based filter.
filtered_data_by_lang.shape[0]

99852

# Investigate the difference between the two filtered sets

In [27]:
# Row labels kept by the lang-based filter but removed by the lyrics-based one.
set(filtered_data_by_lang.index).difference(filtered_data_by_lyrics.index)

{5280, 101119}

In [28]:
# Row labels kept by the lyrics-based filter but removed by the lang-based one.
set(filtered_data_by_lyrics.index).difference(filtered_data_by_lang.index)

set()

In [29]:
# Show the rows that survive the lang-based filter but not the lyrics-based
# one. The row labels are derived from the two filtered frames rather than
# pasted in from an earlier cell's output, so this cell stays correct if the
# input data (and therefore the differing labels) ever changes.
filtered_data_by_lang[
    ~filtered_data_by_lang.index.isin(filtered_data_by_lyrics.index)
]

Unnamed: 0,id,lyrics,lang,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
5280,abwQ0BlKplV2YGSs,INSTRUMENTAL,id,Miles Davis,Right Off,The Complete Jack Johnson Sessions,"jazz,jazz fusion",1wv6WYD9bRXAQiLE45elk6,27.0,1971,0.405,0.68,5.0,1.0,0.355,123.457,1613693
101119,h5D4QLD9oGvxxXiS,INSTRUMENTAL,id,J Dilla,One for Ghost,Donuts,soul,7AD3F8BbgPCgW9Q46duca9,33.0,2006,0.591,0.543,2.0,1.0,0.366,85.012,78227


# Investigate tracks whose `lang` value is `id`

In [30]:
# Select the rows whose language code, after lower-casing and trimming, is "id".
data_id_lang = merged_data.loc[
    merged_data['lang'].str.lower().str.strip().eq('id')
]

In [31]:
# Preview the first five rows tagged with the "id" language code.
data_id_lang.head(n=5)

Unnamed: 0,id,lyrics,lang,artist,song,album_name,genres,spotify_id,popularity,release,danceability,energy,key,mode,valence,tempo,duration_ms
181,6M8HCxqRBktCdXpW,"Kendalikan massa, informasi palsu\nMereka ingi...",id,Seringai,Disinformasi,Seperti Api,"hardcore,stoner metal,stoner rock,rock",2oDf7bYldoLwPVE19A9Gn9,29.0,2018,0.51,0.911,1.0,0.0,0.511,169.979,192981
192,ZwWNb5Wu3bRfqvca,Ku berjalan arungi seribu malam\nKusembunyi di...,id,Barasuara,Seribu Racun,Pikiran dan Perjalanan,"rock,indie rock",39g77d5bDuayJ0HDYbOlqa,40.0,2019,0.553,0.853,0.0,0.0,0.485,126.056,211814
1385,s4nHLXhszRtXLGXp,Ma Amba Lalitha Devi\nParashakti Sundari\nNama...,id,Craig Pruess and Ananda,Devi Prayer,108 Sacred Names of Mother Divine - Sacred Cha...,yoga,3fujO0PLd0MoerzZeq1n2X,55.0,2002,0.182,0.0755,10.0,1.0,0.0425,135.333,1283747
2744,pdb6bluKWOUyz2GX,Terang masih saja milik malam\nBahkan malam ya...,id,Payung Teduh,Malam,Dunia Batas,"folk,jazz",33LlJCqwuyvSB78SYK8AWM,40.0,2014,0.467,0.464,2.0,0.0,0.381,174.075,290345
3849,A9PgCD4YhoQ8kLaE,"Ini masalah kuasa, alibimu berharga\nkalau kam...",id,Efek Rumah Kaca,Mosi Tidak Percaya,Kamar Gelap,"alternative rock,pop",19ydLYSYO4fcJ3RTADrLUN,35.0,2008,0.532,0.763,2.0,0.0,0.545,132.954,231551


In [32]:
# Persist the lyrics-based filter result. This is the stricter of the two
# filters: it also drops the two "INSTRUMENTAL" tracks whose lang is "id".
# The row index is written out as the first CSV column.
filtered_data_by_lyrics.to_csv(
    'data/MUSIC4ALL/processed/filtered_music4all_v1.csv',
    index=True,
)