# Script to combine musiXmatch lyrics data with the Million Song Dataset

### Import the required libraries

In [1]:
import sqlite3
import pandas as pd
import numpy as np

In [2]:
import findspark
# Locate the local Spark installation and add it to sys.path so that the
# `import pyspark` below can succeed when pyspark is not already importable.
findspark.init()
import pyspark

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
# NOTE(review): this wildcard import brings every public name of
# pyspark.sql.functions into the notebook namespace, which can shadow Python
# builtins of the same name (e.g. sum, min, max); prefer the `F` alias below.
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

### Create a Spark Session

In [4]:
# Start (or reuse) a Spark session for this notebook.
# "local[1]" runs Spark on this machine with a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("lyrics_dataset_creation")
    .getOrCreate()
)

In [5]:
# Teardown (left disabled): uncomment `spark.stop()` to shut down the Spark
# session when finished. `sc.stop()` applies only if a separate SparkContext
# named `sc` has been created.
# sc.stop()
# spark.stop()

## Read data

### Song Data from MSD

In [6]:
# Location of the Million Song Dataset 'track_metadata.db' SQLite file.
# Edit this path to match your machine. A raw string is used so that every
# backslash in the Windows path is taken literally (the previous literal
# contained the invalid escape sequence "\M").
track_metadata_db_path = r"C:\Users\mgandhi39\Downloads\DVA Project\Dataset\MSD\track_metadata.db"

In [7]:
# Open the MSD track-metadata SQLite database. `conn` and `cursor` are kept
# as notebook-level names because the following cells query through them.
conn = sqlite3.connect(database=track_metadata_db_path)
cursor = conn.cursor()

In [8]:
# List the tables in the track-metadata database to confirm its schema.
# The SQL string literal 'table' uses single quotes: SQLite accepts
# double-quoted string literals only as a legacy quirk that a build may
# disable, in which case "table" would be parsed as an identifier.
query = "SELECT name FROM sqlite_master WHERE type='table'"
cursor.execute(query)
print(cursor.fetchall())

[('songs',)]


In [9]:
# Load the entire `songs` table into a pandas DataFrame.
query = "SELECT * FROM songs"
pd_songs = pd.read_sql_query(sql=query, con=conn)

In [10]:
# Preview the first rows of the songs table to check its columns.
pd_songs.head()

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
0,TRMMMYQ128F932D901,Silent Night,SOQMMHC12AB0180CB8,Monster Ballads X-Mas,ARYZTJS1187B98C555,357ff05d-848a-44cf-b608-cb34b5701ae5,Faster Pussy cat,252.05506,0.649822,0.394032,2003,7032331,-1,0
1,TRMMMKD128F425225D,Tanssi vaan,SOVFVAK12A8C1350D9,Karkuteillä,ARMVN3U1187FB3A1EB,8d7ef530-a6fd-4f8f-b2e2-74aec765e0f9,Karkkiautomaatti,156.55138,0.439604,0.356992,1995,1514808,-1,0
2,TRMMMRX128F93187D9,No One Could Ever,SOGTUKN12AB017F4F1,Butter,ARGEKB01187FB50750,3d403d44-36ce-465c-ad43-ae877e65adc4,Hudson Mohawke,138.97098,0.643681,0.437504,2006,6945353,-1,0
3,TRMMMCH128F425532C,Si Vos Querés,SOBNYVR12A8C13558C,De Culo,ARNWYLR1187B9B2F9C,12be7648-7094-495f-90e6-df4189d68615,Yerba Brava,145.05751,0.448501,0.372349,2003,2168257,-1,0
4,TRMMMWA128F426B589,Tangle Of Aspens,SOHSBXH12A8C13B0DF,Rene Ablaze Presents Winter Sessions,AREQDTE1269FB37231,,Der Mystic,514.29832,0.0,0.0,0,2264873,-1,0


In [11]:
# Summary statistics for the numeric columns of the songs table.
# NOTE(review): the output shows `year` with a minimum and 25th percentile of
# 0, so 0 presumably marks a missing release year — confirm before using `year`.
pd_songs.describe()

Unnamed: 0,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,249.500755,0.556915,0.379797,1030.325652,4213995.0,940.437395,480.173895
std,126.229636,0.140207,0.12605,998.745002,2640600.0,9168.962303,5437.859242
min,0.31302,-1.0,-1.0,0.0,-1.0,-1.0,-938.0
25%,180.74077,0.478219,0.336321,0.0,1974300.0,-1.0,0.0
50%,228.85832,0.558878,0.386194,1969.0,3947503.0,-1.0,0.0
75%,289.93261,0.641392,0.444066,2002.0,6469678.0,-1.0,0.0
max,3034.90567,1.0,1.082503,2011.0,9091277.0,151082.0,117287.0


### Lyrics data from musiXmatch

In [12]:
# Location of the musiXmatch 'mxm_dataset.db' SQLite file.
# Edit this path to match your machine.
lyrics_db_path = r"C:\Users\mgandhi39\Downloads\DVA Project\Dataset\MusicxMatch\mxm_dataset.db"

In [13]:
# Open the lyrics SQLite database. `conn2` and `cursor2` are kept as
# notebook-level names because the following cells query through them.
conn2 = sqlite3.connect(database=lyrics_db_path)
cursor2 = conn2.cursor()

In [14]:
# List the tables in the lyrics database to confirm its schema.
# The SQL string literal 'table' uses single quotes: SQLite accepts
# double-quoted string literals only as a legacy quirk that a build may
# disable, in which case "table" would be parsed as an identifier.
query = "SELECT name FROM sqlite_master WHERE type='table'"
cursor2.execute(query)
print(cursor2.fetchall())

[('words',), ('lyrics',)]


In [15]:
# Load the entire `lyrics` table into a pandas DataFrame.
pd_lyrics = pd.read_sql_query(
    sql="SELECT * FROM lyrics",
    con=conn2,
)

In [16]:
# Preview the first rows of the lyrics table to check its columns.
pd_lyrics.head()

Unnamed: 0,track_id,mxm_tid,word,count,is_test
0,TRAAAAV128F421A322,4623710,i,6,0
1,TRAAAAV128F421A322,4623710,the,4,0
2,TRAAAAV128F421A322,4623710,you,2,0
3,TRAAAAV128F421A322,4623710,to,2,0
4,TRAAAAV128F421A322,4623710,and,5,0


In [17]:
# Dimensions of the lyrics table as (rows, columns).
pd_lyrics.shape

(19045332, 5)

In [18]:
# Number of distinct tracks in the lyrics table, shown as the shape of the
# array of unique track_id values.
pd_lyrics["track_id"].unique().shape

(237662,)

## Filter songs data to the tracks that have lyrics


In [19]:
# Keep only the songs whose track_id occurs in the lyrics table.
song_data = pd_songs.loc[
    pd_songs["track_id"].isin(pd_lyrics["track_id"])
]

In [20]:
# Dimensions of the filtered songs table as (rows, columns).
song_data.shape

(237662, 14)

In [21]:
# Preview the first rows of the filtered songs table.
song_data.head()

Unnamed: 0,track_id,title,song_id,release,artist_id,artist_mbid,artist_name,duration,artist_familiarity,artist_hotttnesss,year,track_7digitalid,shs_perf,shs_work
15,TRMMMKI128F931D80D,006,SOSDCFG12AB0184647,Lena 20 År,ARSB5591187B99A848,fba3e876-68f1-4a1f-99d9-c604480202ba,Lena Philipsson,262.26893,0.529819,0.410229,1998,6010886,-1,0
16,TRMMMUT128F42646E8,(Looking For) The Heart Of Saturday,SOBARPM12A8C133DFF,Cover Girl,ARDW5AW1187FB55708,42222090-c5e5-4243-8582-c29bc8b63ec6,Shawn Colvin,216.47628,0.685503,0.446733,1994,3156269,-1,0
17,TRMMMQY128F92F0EA3,Ethos of Coercion,SOKOVRQ12A8C142811,Descend Into Depravity,ARGWPP11187B9AEF43,f76167bb-c117-4022-8b6b-54c796edf5c9,Dying Fetus,196.0224,0.734471,0.511976,2009,6782293,-1,0
18,TRMMMTK128F424EF7C,Rock-N-Rule,SOIMMJJ12AF72AD643,I'm Only A Man (Bonus Track Version),ARDT9VH1187B999C0B,6b22de04-fb48-44aa-bd02-c1427f635477,Emery,217.57342,0.738996,0.563367,2007,1501464,-1,0
20,TRMMMQV12903CA201E,I Made It Over,SOOUESZ12AB0189AFD,Let's Celebrate (He Is Risen),ARVF2AD1187FB47580,fd711779-5524-4ed3-8d68-da0b867caa34,Rev. Timothy Wright,353.77587,0.37719,0.290242,0,6345139,-1,0


## Output songs and lyrics data to CSV

In [22]:
# Destination for the filtered songs CSV. Edit this path to match your machine.
# A raw string is used so that every backslash in the Windows path is taken
# literally (the previous literal contained the invalid escape sequence "\s").
song_data_output_path = r"C:\Users\mgandhi39\Downloads\DVA Project\Dataset\songs.csv"
# Write the filtered songs table to CSV, omitting the DataFrame index.
song_data.to_csv(path_or_buf=song_data_output_path, index=False)

In [23]:
# Destination for the lyrics CSV. Edit this path to match your machine.
# A raw string is used so that every backslash in the Windows path is taken
# literally (the previous literal contained the invalid escape sequence "\l").
lyrics_data_output_path = r"C:\Users\mgandhi39\Downloads\DVA Project\Dataset\lyrics.csv"
# Write the full lyrics table to CSV, omitting the DataFrame index.
pd_lyrics.to_csv(path_or_buf=lyrics_data_output_path, index=False)