In [1]:
import sys
import random

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import seaborn as sns

from datetime import date
from sklearn.cluster import KMeans

%matplotlib inline

# Model

In [2]:
def load_data(filename):
    """
    Loads given file as pandas dataframe.
    
    Argument:
        filename: string, name of file
        
    Return:
        pandas dataframe
    """
    return pd.read_csv(filename)

In [3]:
file_id = random.randint(0, 11)
filename = "./Songs_features/songs" + str(file_id) + ".csv"

songs = load_data(filename)

In [4]:
today = date(2019,11,19)
songs["release_day"] = pd.to_datetime(songs["release_date"], format='%Y-%m-%d', errors='coerce')
songs["release_day"] = today - songs["release_day"].dt.date.values
songs["release_day"] = [x.days for x in songs["release_day"]] 
del songs["release_date"]

In [5]:
params = ["duration_ms", "danceability", "acousticness", "energy", "key", "mode", "instrumentalness", "liveness", "valence", "tempo", "popularity", "release_day"]
data = songs[params]

In [6]:
def train(df, n_clusters, min_clusters=2, m=3):
    n_clusters = random.randint(n_clusters - m, n_clusters + m)
    if n_clusters < min_clusters:
        n_clusters = min_clusters
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++')
    kmeans.fit(df)
    return kmeans.labels_

In [7]:
def gen_playlist(df, min_features, max_features, divisor=10000):
    n_clusters = int(len(df)/divisor)
    if n_clusters > 6:
        n_clusters = 6
    
    best_score = -sys.maxsize - 1
    best_playlist = []
    
    df = df.dropna()
    
    for i in range(10):
        print(str(i) + "-th run...")
        output = train(df, n_clusters)
        df.loc[:, "clusters"] = output
        playlists = df.sort_values(["danceability"],ascending=False).groupby("clusters").head(10)
        lo_score = playlists.groupby("clusters")[min_features].agg("mean")
        hi_score = playlists.groupby("clusters")[max_features].agg("mean")
        score = hi_score - lo_score
        if best_score < score.max():
            best_score = score.max()
            best_playlist = playlists[playlists["clusters"] == score.idxmax()]
    
    return best_playlist, best_score

In [8]:
best_playlist, best_score = gen_playlist(data, "acousticness", "danceability")

0-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


1-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


2-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


3-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


4-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


5-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


6-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


7-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


8-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


9-th run...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
songs[songs.index.isin(best_playlist.index)]

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,...,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,popularity,release_day
3521,57,20,Vanilla Ice,spotify:track:11d9oUiwHuYt216EFA2tiz,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:1LHacvoBTd7o2d7wwQ9EZD,271466,To The Extreme,0.98,...,2.0,-16.035,1.0,0.027,2e-06,0.104,0.675,115.744,63.0,10914.0
5629,96,71,Tone-Loc,spotify:track:5YIF6HSOtHN9HdcE5IPzMe,spotify:artist:5Y8EphH8Vdqu5SLj6K5vjj,Funky Cold Medina,spotify:album:6xE6A0Vwd2LmopR6Mn8UFG,248160,Loc-ed After Dark,0.988,...,5.0,-15.012,0.0,0.0755,2e-06,0.0668,0.929,117.493,57.0,11279.0
10579,169,33,Vanilla Ice,spotify:track:11d9oUiwHuYt216EFA2tiz,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:1LHacvoBTd7o2d7wwQ9EZD,271466,To The Extreme,0.98,...,2.0,-16.035,1.0,0.027,2e-06,0.104,0.675,115.744,63.0,10914.0
18671,284,55,Vanilla Ice,spotify:track:3XVozq1aeqsJwpXrEZrDJ9,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:20O6lfaDAoMhUj5TAvVbb6,254466,Vanilla Ice Is Back! - Hip Hop Classics,0.978,...,2.0,-2.79,1.0,0.0023,5e-06,0.102,0.597,116.006,72.0,4032.0
18853,287,61,Vanilla Ice,spotify:track:3XVozq1aeqsJwpXrEZrDJ9,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:20O6lfaDAoMhUj5TAvVbb6,254466,Vanilla Ice Is Back! - Hip Hop Classics,0.978,...,2.0,-2.79,1.0,0.0023,5e-06,0.102,0.597,116.006,72.0,4032.0
26580,389,14,Vanilla Ice,spotify:track:3XVozq1aeqsJwpXrEZrDJ9,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:20O6lfaDAoMhUj5TAvVbb6,254466,Vanilla Ice Is Back! - Hip Hop Classics,0.978,...,2.0,-2.79,1.0,0.0023,5e-06,0.102,0.597,116.006,72.0,4032.0
34060,492,40,Vanilla Ice,spotify:track:11d9oUiwHuYt216EFA2tiz,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:1LHacvoBTd7o2d7wwQ9EZD,271466,To The Extreme,0.98,...,2.0,-16.035,1.0,0.027,2e-06,0.104,0.675,115.744,63.0,10914.0
35618,511,104,Vanilla Ice,spotify:track:11d9oUiwHuYt216EFA2tiz,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:1LHacvoBTd7o2d7wwQ9EZD,271466,To The Extreme,0.98,...,2.0,-16.035,1.0,0.027,2e-06,0.104,0.675,115.744,63.0,10914.0
55198,803,0,Vanilla Ice,spotify:track:3XVozq1aeqsJwpXrEZrDJ9,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:20O6lfaDAoMhUj5TAvVbb6,254466,Vanilla Ice Is Back! - Hip Hop Classics,0.978,...,2.0,-2.79,1.0,0.0023,5e-06,0.102,0.597,116.006,72.0,4032.0
57218,833,166,Vanilla Ice,spotify:track:3XVozq1aeqsJwpXrEZrDJ9,spotify:artist:7GXXMm3DB1VswVcuGyInUd,Ice Ice Baby,spotify:album:20O6lfaDAoMhUj5TAvVbb6,254466,Vanilla Ice Is Back! - Hip Hop Classics,0.978,...,2.0,-2.79,1.0,0.0023,5e-06,0.102,0.597,116.006,72.0,4032.0


In [10]:
hi_score = songs.groupby("pid")["danceability"].agg("mean")
lo_score = songs.groupby("pid")["acousticness"].agg("mean")
score = hi_score - lo_score

In [11]:
score = hi_score - lo_score

In [12]:
songs[songs["pid"] == score.idxmax()]

Unnamed: 0,pid,pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name,danceability,...,key,loudness,mode,acousticness,instrumentalness,liveness,valence,tempo,popularity,release_day
52862,756,0,LMFAO,spotify:track:1V4jC0vJ5525lEF1bFgPX2,spotify:artist:3sgFRtyBnxXD5ESfmbK4dl,Shots,spotify:album:7qtQYJc0H6s3CK4c7Gp8GR,222133,Party Rock,0.823,...,1.0,-2.055,1.0,0.0118,2e-06,0.205,0.187,127.996,65.0,3974.0
52863,756,1,Aqua,spotify:track:7atIHMnLUJ5ikdnosz1GwU,spotify:artist:6kBjAFKyd0he7LiA5GQ3Gz,Barbie Girl,spotify:album:2fMLZjqCrVeAknRbcPKwGz,197906,Aquarium,0.81,...,1.0,-5.753,0.0,0.0167,0.281,0.35,0.961,129.996,14.0,8357.0
52864,756,2,Lil Jon & The East Side Boyz,spotify:track:5KY7zgFeH2GWoL1zP9mME6,spotify:artist:3ciRvbBIVz9fBoPbtSYq4x,Get Low - Street,spotify:album:3JMQJnoGhwnwAwLGuJAUEJ,334093,Kings Of Crunk,0.763,...,1.0,-7.457,1.0,0.031,0.0,0.118,0.152,101.045,2.0,6166.0
52865,756,3,Technotronic,spotify:track:7BwwjDFcpn72BrxCCRqs7d,spotify:artist:2Cd98zHVdZeOCisc6Gi2sB,Pump Up The Jam - Edit,spotify:album:6nqizbBgkRug5YRnP1YWiO,215040,Best Of,0.867,...,10.0,-7.375,0.0,0.0239,0.0,0.0491,0.675,124.81,11.0,3244.0
52866,756,4,A$AP Ferg,spotify:track:54oACFHRfeOa7VD0FTQSkJ,spotify:artist:5dHt1vcEm9qb8fCyLcB3HL,Work,spotify:album:1TDGiNgZKFw38Jv3pli7xt,178506,Work,0.726,...,10.0,-5.564,1.0,0.152,0.683,0.0845,0.561,130.04,51.0,2492.0


In [13]:
score.max()

0.7507199999999999

In [14]:
best_score

0.9602999999999998