## Spotify Train Test Splitting

This notebook splits the full data into training and test sets.

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

In [3]:
# Seed NumPy's global random number generator before any other work is done.
np.random.seed(seed=62)

# Read the full dataset that the rest of the notebook splits.
df = pd.read_csv(filepath_or_buffer='data/final_spotify_data.csv')

In [5]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,track_id,album_id,track_number,track_count,duration,explicit,track_pop,album_pop,comparative_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,47HusOVsGPUcsJtMq40aRd,3hl2uaqB2zBZdetkeATUBe,2,13,357812,0,37,29,8.666667,0.843,0.46,-13.512,0.054,0.00891,0.00247,0.132,0.84,109.123
1,74fXmxkP8507tIXYkVbKHM,3aqSysSJTyqHNyyiDiNGsI,2,14,356680,0,0,0,0.0,0.719,0.849,-8.704,0.0555,0.0542,0.93,0.702,0.0675,139.976
2,6cf6rLb8qcklvJv90W6HCW,1KlU96Hw9nlvqpBPlSqcTV,19,22,220320,0,50,69,-19.904762,0.73,0.587,-5.815,0.0283,0.306,0.0,0.143,0.649,125.07
3,1Ic1ugEU9PT8RvNippxDSG,2X8CnmJ7E4OgCQenKJSNrs,3,9,275360,0,16,12,4.5,0.74,0.956,-5.205,0.0861,0.342,0.122,0.63,0.193,112.004
4,4kln61xMRKk2bwgrCXY4cV,2G6chemqdiNHxEw1ucZ7pw,7,14,289373,0,33,48,-16.153846,0.577,0.458,-7.115,0.0338,0.022,0.00422,0.084,0.395,127.33


In [6]:
# Show the column labels of the loaded frame.
df.columns

Index(['track_id', 'album_id', 'track_number', 'track_count', 'duration',
       'explicit', 'track_pop', 'album_pop', 'comparative_pop', 'danceability',
       'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo'],
      dtype='object')

In [7]:
# Separate the target column from every other column before splitting.
features = df.drop('comparative_pop', axis=1)
target = df['comparative_pop']

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# split repeatable, and the split is stratified on the `explicit` column.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=62,
    stratify=df['explicit'],
)

In [14]:
# Re-attach the target to the training features so the set can be saved as a
# single table. assign() builds a new frame rather than writing into the object
# returned by train_test_split; in-place column assignment on that object can
# raise pandas' SettingWithCopyWarning on some pandas/scikit-learn versions.
# Values are aligned on the row index, and the column is appended last.
X_train = X_train.assign(comparative_pop=y_train)

In [16]:
# Re-attach the target to the test features so the set can be saved as a
# single table. assign() builds a new frame rather than writing into the object
# returned by train_test_split; in-place column assignment on that object can
# raise pandas' SettingWithCopyWarning on some pandas/scikit-learn versions.
# Values are aligned on the row index, and the column is appended last.
X_test = X_test.assign(comparative_pop=y_test)

In [15]:
# Preview the first five rows of the training set.
X_train.head(n=5)

Unnamed: 0,track_id,album_id,track_number,track_count,duration,explicit,track_pop,album_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,comparative_pop
144409,3FVunrxKMJKPvDxnRttGci,0V8UU4es3kdF1djFZKlduq,8,12,329175,1,42,53,0.819,0.529,-11.54,0.0714,0.0351,0.0517,0.183,0.562,114.987,-12.0
28077,4LQNFT3PncoZ9WgZasPSSo,0F9EVGAk5Pi7um5YRPNgsO,11,11,161200,0,15,22,0.486,0.497,-7.575,0.0403,0.646,0.0,0.165,0.664,84.113,-7.7
65677,5ylBnaUYbWixo1ptaaWP2P,2wV8aQLr32lVSeTHDaEC0I,12,13,272066,0,26,40,0.433,0.502,-6.157,0.0278,0.00401,4e-06,0.0725,0.038,68.009,-15.166667
148353,48gzGGNiaTawmhN6Gkhmp9,2xYfsxlu6Qg4dlC3ShGIsP,1,5,1310000,0,6,6,0.232,0.203,-21.255,0.0385,0.964,0.893,0.187,0.0727,151.676,0.0
60069,6zlSNFZjKhLi6KcMAd0X1s,19Gy0qg8XCbCvnRo5GtrNU,7,13,190893,0,11,17,0.543,0.158,-11.656,0.0328,0.923,0.876,0.0987,0.0587,80.041,-6.5


In [17]:
# Preview the first five rows of the test set.
X_test.head(n=5)

Unnamed: 0,track_id,album_id,track_number,track_count,duration,explicit,track_pop,album_pop,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,comparative_pop
40215,4HPYBv8o3tzRI4BDLhPMmX,0iZZwv2fp5uF8hTYFNWFbW,6,13,370426,0,2,10,0.495,0.164,-17.273,0.0284,0.819,0.00164,0.08,0.24,90.854,-8.666667
133056,4JQSMg83F8qYwSBt5xOXsQ,1ciAVKFdlpLi2eGDlXv6Bo,2,17,225146,0,61,57,0.561,0.707,-4.91,0.0465,0.00114,0.0,0.112,0.386,91.943,4.25
100015,0FU2aPc9u6Ae2klbFsgbkI,0YaeFHEYGpdzdFIxDRFvCv,12,15,371320,0,34,50,0.478,0.611,-5.929,0.0299,0.0195,0.0649,0.164,0.558,62.022,-17.142857
23930,2wgk0vMQCnwkETPJiqsQTt,2gDCBa5P6SnAaY0cadefQS,3,14,301813,0,43,54,0.528,0.808,-5.198,0.0504,0.0545,8e-06,0.073,0.649,154.948,-11.846154
19082,5gTuRkjPru8nt2wH2xToG4,6oYPDF3t8SJ8zjWva4CRAL,19,19,276600,0,7,18,0.587,0.629,-7.439,0.0424,0.183,0.0,0.0686,0.425,114.993,-11.611111


In [18]:
# Sanity check: the two splits together must account for every row of df.
# Asserting (instead of only displaying the boolean) stops a Run All before
# the CSVs are written if the row counts ever disagree.
split_is_complete = len(X_train) + len(X_test) == len(df)
assert split_is_complete, "train/test row counts do not add up to the full dataset"
split_is_complete

True

In [19]:
# Save each split to its own CSV file; index=False leaves the row labels out
# of the written files.
for split_frame, csv_path in ((X_train, "data/training.csv"), (X_test, "data/testing.csv")):
    split_frame.to_csv(csv_path, index=False)