-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 2835bfd
Showing
3 changed files
with
334 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
# library() stops right away when a package is missing; require() would only
# warn and let the script fail later at the first ggplot()/melt() call.
library(ggplot2)
library(reshape2)

# To avoid scientific notation
options(scipen = 999)

# NOTE: everything named df.1 is me, and everything named df.2 is her
df.1 <- read.csv("me.csv")
df.2 <- read.csv("her.csv")

# Keep only the rows that have a value in every column
df.1 <- df.1[complete.cases(df.1), ]
df.2 <- df.2[complete.cases(df.2), ]

# Positional tag for each of the first 13 columns. The rows tagged 'i' are
# the ones selected further down under "Implicit features"; 'e' tags the
# remaining columns.
# NOTE(review): the tags are matched by column position, so they assume both
# CSV files share the same column order - confirm against the files.
feature.types <- c('i', 'i', 'e', 'i', 'i', 'i',
                   'e', 'i', 'e', 'e', 'e', 'i', 'e')

# Build the summary table (feature name, mean, sd, type tag) for the first
# 13 columns of df from the given per-column means and standard deviations.
feature.stats <- function(df, means, sds) {
  stats <- data.frame(feature = colnames(df[1:13]), mean = means,
                      sd = sds, type = feature.types)
  rownames(stats) <- NULL  # drop the row names taken from the named vectors
  stats
}

df.1.mean <- sapply(df.1[1:13], mean)
df.1.sd <- sapply(df.1[1:13], sd)
df.1.stats <- feature.stats(df.1, df.1.mean, df.1.sd)

df.2.mean <- sapply(df.2[1:13], mean)
df.2.sd <- sapply(df.2[1:13], sd)
df.2.stats <- feature.stats(df.2, df.2.mean, df.2.sd)

print(df.1.stats)
|
||
|
||
# Implicit features
# Mean of features

# Bar chart of the mean of every feature in `stats`, bars ordered from the
# highest mean to the lowest, drawn in the colour `bar.fill`.
feature.mean.chart <- function(stats, bar.fill, title) {
  ggplot(stats, aes(x = reorder(feature, -mean), y = mean)) +
    geom_bar(stat = 'identity', fill = bar.fill) +
    theme(axis.text = element_text(colour = 'black')) +
    ggtitle(title) +
    xlab("Features") +
    ylab("Mean") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

# Rows of the stats tables tagged as implicit ('i')
df.1.i <- df.1.stats[df.1.stats$type == 'i', ]
df.2.i <- df.2.stats[df.2.stats$type == 'i', ]

feature.mean.chart(df.1.i, '#00BFC4',
                   "Mean value of audio features of my playlist")

feature.mean.chart(df.2.i, '#F8766D',
                   "Mean value of audio features of her playlist")

# Per-feature gap between my mean and hers; `who` names the playlist with
# the larger mean (a gap of exactly zero is labelled 'her').
mean.gap <- df.1.i$mean - df.2.i$mean
df.mean.difference <- data.frame(feature = df.1.i$feature,
                                 difference = mean.gap)
df.mean.difference$who <- ifelse(mean.gap > 0, 'me', 'her')

ggplot(df.mean.difference,
       aes(x = reorder(feature, -difference), y = difference, fill = who)) +
  geom_bar(stat = 'identity') +
  ggtitle("Difference between audio features mean of my songs and hers") +
  xlab("Feature") +
  ylab("Difference") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
|
||
# Sparsity, or variety of playlists

# Bar chart of the standard deviation of every feature in `stats`, bars
# ordered from the highest value to the lowest, drawn in `bar.fill`.
feature.sd.chart <- function(stats, bar.fill, title) {
  ggplot(stats, aes(x = reorder(feature, -sd), y = sd)) +
    geom_bar(stat = 'identity', fill = bar.fill) +
    theme(axis.text = element_text(colour = 'black')) +
    ggtitle(title) +
    xlab("Feature") +
    ylab("Standard Deviation") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

# One box per feature of a melted (variable/value) data frame
feature.boxplot <- function(long.df) {
  ggplot(long.df, aes(factor(variable), value)) + geom_boxplot()
}

feature.sd.chart(df.1.i, '#00BFC4',
                 "Standard deviation of the audio features scores of my playlist")

sum(df.1.i$sd)

feature.sd.chart(df.2.i, '#F8766D',
                 "Standard deviation of the audio features scores of her playlist")

sum(df.2.i$sd)

# Columns reshaped to long format for the boxplots
boxplot.features <- c('energy', 'liveness', 'speechiness',
                      'acousticness', 'instrumentalness',
                      'danceability', 'valence')

df.1.long <- melt(df.1[, boxplot.features])

feature.boxplot(df.1.long)

df.2.long <- melt(df.2[, boxplot.features])

feature.boxplot(df.2.long)
|
||
## Correlations

# Column positions used for the correlation matrices and the overall
# mean/sd below. These are the same positions tagged 'i' in the stats tables.
# NOTE(review): presumably the seven implicit features - verify against the
# CSV column order.
score.cols <- c(1, 2, 4:6, 8, 12)

# Add the points, a loess trend line without confidence band, and a title
# to a ggplot object that already carries its data and x/y mapping.
smooth.scatter <- function(base.plot, title) {
  base.plot +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE) +
    ggtitle(title)
}

# Every value of the selected columns of df as one plain vector
all.scores <- function(df) {
  as.vector(t(df[score.cols]))
}

df.1.cor <- cor(df.1[score.cols])

smooth.scatter(ggplot(df.1, aes(x = energy, y = danceability)),
               "Correlation between danceability and energy (me)")

smooth.scatter(ggplot(df.1, aes(x = energy, y = acousticness)),
               "Correlation between acousticness and energy (me)")

df.2.cor <- cor(df.2[score.cols])

smooth.scatter(ggplot(df.2, aes(x = energy, y = valence)),
               "Correlation between valence and energy (her)")

smooth.scatter(ggplot(df.2, aes(x = energy, y = acousticness)),
               "Correlation between acousticness and energy (her)")

# mean and sd of all scores
mean(all.scores(df.1))

mean(all.scores(df.2))

sd(all.scores(df.1))

sd(all.scores(df.2))
|
||
|
||
|
||
|
||
# Boringness | ||
|
||
# NOTE: for loudness, the higher the value, the louder the song | ||
|
||
# NOTE: the lower the boringness score, the more boring the song is | ||
boringness <- function(df){
  # One score per row of df: loudness and tempo are added as they are,
  # energy and danceability are multiplied by 100 before being added.
  scaled.energy <- df$energy * 100
  scaled.danceability <- df$danceability * 100
  df$loudness + scaled.energy + scaled.danceability + df$tempo
}
|
||
# Rows of `scores` sorted by their boringness value, ascending by default.
# Base order() is used because arrange()/desc() live in plyr/dplyr, and this
# script attaches neither of them.
songs.by.boringness <- function(scores, decreasing = FALSE) {
  key <- if (decreasing) -scores$boringness else scores$boringness
  ranked <- scores[order(key), ]
  rownames(ranked) <- NULL  # number the sorted rows 1..n
  ranked
}

# Boringness score of each of my songs, with the 30 lowest and 30 highest
boring.1 <- data.frame(boringness = boringness(df.1), uri = df.1$uri, who = 'me')
head(songs.by.boringness(boring.1), 30)
head(songs.by.boringness(boring.1, decreasing = TRUE), 30)
summary(boring.1)

ggplot(boring.1, aes(boringness)) +
  geom_histogram() +
  ggtitle("Histogram of the boringness score (me)")

# Same for her songs
boring.2 <- data.frame(boringness = boringness(df.2), uri = df.2$uri, who = 'her')

ggplot(boring.2, aes(boringness)) +
  geom_histogram() +
  ggtitle("Histogram of the boringness score (her)")

# Both score distributions overlaid; alpha keeps the overlap visible
boring.total <- rbind(boring.2, boring.1)
ggplot(boring.total, aes(x = boringness, fill = who)) +
  geom_histogram(alpha = 0.6, position = 'identity')

head(songs.by.boringness(boring.2))
head(songs.by.boringness(boring.2, decreasing = TRUE))

summary(boring.2)

# Mean and standard deviation of the score, one row per person
boring.stats <- data.frame(mean = c(mean(boring.1$boringness), mean(boring.2$boringness)),
                           sd = c(sd(boring.1$boringness), sd(boring.2$boringness)),
                           who = c('me', 'her'))

ggplot(boring.stats, aes(x = who, y = mean, fill = who)) +
  geom_bar(stat = "identity") +
  ggtitle("Boringness Score Mean")

ggplot(boring.stats, aes(x = who, y = sd, fill = who)) +
  geom_bar(stat = "identity") +
  ggtitle("Boringness Score standard deviation")
|
||
# Write a csv with just the implicit audio features
implicit.cols <- c('energy', 'liveness', 'speechiness', 'acousticness',
                   'instrumentalness', 'danceability', 'valence')

# Same columns for both people, each row labelled with its owner
implicit.1 <- df.1[implicit.cols]
implicit.1$who <- 'me'

implicit.2 <- df.2[implicit.cols]
implicit.2$who <- 'her'

# My rows first, then hers, in one comma-separated file without row names
implicit <- rbind(implicit.1, implicit.2)
write.table(implicit, file = 'implicit_features.csv', sep = ',',
            row.names = FALSE)
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import argparse | ||
import pprint | ||
import sys | ||
import os | ||
import subprocess | ||
import json | ||
import spotipy | ||
import spotipy.util as util | ||
import pandas as pd | ||
|
||
from spotipy.oauth2 import SpotifyClientCredentials | ||
|
||
|
||
client_credentials_manager = SpotifyClientCredentials() | ||
|
||
|
||
def get_playlist_content(username, playlist_id, sp):
    """Download every item of a playlist and dump the list to a JSON file.

    The playlist is requested in pages of 100 items until the response has
    no ``next`` page. The collected items are written as JSON to a file
    named ``<username>-<playlist_id>`` in the current working directory.

    Args:
        username: Owner of the playlist.
        playlist_id: ID of the playlist to download.
        sp: Client object providing ``user_playlist_tracks``.
    """
    page_size = 100
    songs = []
    offset = 0
    has_more = True
    while has_more:
        content = sp.user_playlist_tracks(username, playlist_id, fields=None,
                                          limit=page_size, offset=offset,
                                          market=None)
        songs.extend(content['items'])
        # A missing "next" link marks the last page.
        has_more = content['next'] is not None
        offset += page_size

    with open('{}-{}'.format(username, playlist_id), 'w') as outfile:
        json.dump(songs, outfile)
|
||
|
||
def get_playlist_audio_features(username, playlist_id, sp):
    """Write the audio features of every track of a playlist to a CSV file.

    The playlist is requested in pages of 100 items, the track IDs are sent
    to ``sp.audio_features`` in batches of 50, and one row per track is
    written to ``<username>-<playlist_id>.csv`` in the current working
    directory.

    Playlist items without a track or without a track ID, and tracks for
    which no audio features come back (``None`` entries), are skipped
    instead of raising.

    Args:
        username: Owner of the playlist.
        playlist_id: ID of the playlist to download.
        sp: Client object providing ``user_playlist_tracks`` and
            ``audio_features``.
    """
    # Column order of the CSV; each name is also the key read from a
    # track's audio-features dict.
    columns = ['energy', 'liveness', 'tempo', 'speechiness', 'acousticness',
               'instrumentalness', 'time_signature', 'danceability', 'key',
               'duration_ms', 'loudness', 'valence', 'mode', 'type', 'uri']
    page_size = 100
    batch_size = 50

    songs = []
    offset = 0
    while True:
        content = sp.user_playlist_tracks(username, playlist_id, fields=None,
                                          limit=page_size, offset=offset,
                                          market=None)
        songs += content['items']
        if content['next'] is None:
            break
        offset += page_size

    # Items can come without a track object or without an ID; they cannot
    # be looked up, so leave them out.
    ids = []
    for item in songs:
        track = item.get('track')
        if track and track.get('id'):
            ids.append(track['id'])

    audio_features = []
    for start in range(0, len(ids), batch_size):
        batch = sp.audio_features(ids[start:start + batch_size])
        audio_features += batch or []

    rows = [[features[column] for column in columns]
            for features in audio_features if features is not None]

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('{}-{}.csv'.format(username, playlist_id), index=False)
|
||
|
||
def get_user_playlist(username, sp):
    """Print the name, track count and ID of every playlist of a user.

    The playlists are requested page by page until the response has no
    ``next`` page, so users with more playlists than fit in one page are
    listed completely.

    Args:
        username: User whose playlists are listed.
        sp: Client object providing ``user_playlists``.
    """
    page_size = 50
    offset = 0
    while True:
        playlists = sp.user_playlists(username, limit=page_size, offset=offset)
        for playlist in playlists['items']:
            print("Name: {}, Number of songs: {}, Playlist ID: {} ".
                  format(playlist['name'].encode('utf8'),
                         playlist['tracks']['total'],
                         playlist['id']))
        if playlists.get('next') is None:
            break
        offset += page_size
|
||
|
||
def main(username, playlist):
    """List the user's playlists, then download one playlist.

    Runs the three steps in order: print the user's playlists, dump the
    playlist content to JSON, and write the playlist's audio features to
    CSV.

    Args:
        username: Spotify user name.
        playlist: ID of the playlist to download.
    """
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    # print() with one argument behaves the same on Python 2 and 3 and
    # matches the call form already used in get_user_playlist.
    print("Getting user playlist")
    get_user_playlist(username, sp)
    print("Getting playlist content")
    get_playlist_content(username, playlist, sp)
    print("Getting playlist audio features")
    get_playlist_audio_features(username, playlist, sp)
|
||
|
||
if __name__ == '__main__':
    print('Starting...')
    parser = argparse.ArgumentParser(description='description')
    # Both values are needed by every step of main(); fail at parse time
    # rather than passing None on to the API calls.
    parser.add_argument('--username', required=True, help='username')
    parser.add_argument('--playlist', required=True, help='playlist ID')
    args = parser.parse_args()
    main(args.username, args.playlist)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import pandas as pd | ||
from sklearn.linear_model import SGDClassifier | ||
from sklearn.model_selection import GridSearchCV | ||
from sklearn import metrics | ||
|
||
|
||
def who_prediction():
    """Fit a classifier that predicts whose playlist a song belongs to.

    Reads ``implicit_features.csv``, uses its first seven columns as
    predictors and the ``who`` column as label, and runs a 5-fold
    cross-validated grid search (scored on accuracy) over an
    ``SGDClassifier``. Prints the best score, the best parameter setting
    and the predictions for two hard-coded songs.
    """
    df = pd.read_csv('implicit_features.csv')
    # data is a data frame consisting of the predictors (the first 7 columns)
    data = df[df.columns[:7]]
    labels = df.who

    # NOTE(review): 'n_iter' and loss='log' are the parameter names of the
    # scikit-learn release this script was written for; newer releases
    # appear to use 'max_iter' and 'log_loss' - confirm against the
    # installed version before upgrading.
    parameters = {
        'alpha': (0.001, 0.0001, 0.00001, 0.000001),
        'penalty': ('l2', 'elasticnet',),
        'n_iter': (10, 50, 100),
        'loss': ('log',)
    }

    # Perform a grid search with cross validation to search for the best parameters.
    grid_search = GridSearchCV(SGDClassifier(), parameters, n_jobs=-1,
                               verbose=1, cv=5, scoring='accuracy')
    grid_search.fit(data, labels)
    print("Best score: {}".format(grid_search.best_score_))
    print("Best parameters: {}".format(grid_search.best_params_))
    # For a closer look: pd.DataFrame(grid_search.cv_results_) holds every
    # tried setting, grid_search.best_estimator_.coef_ the fitted weights.

    # features order: energy, liveness, speechiness, acousticness, instrumentalness
    # danceability, and valence
    # A Better Beginning (Mass Effect Andromeda OST), Spotify track ID: 4dU7fHmu3y9CrOTotmjkgf
    print(grid_search.predict([[0.266, 0.0944, 0.0380, 0.579, 0.923, 0.248, 0.0483]]))
    # Love On The Brain (Rihanna)
    print(grid_search.predict([[0.637, 0.0789, 0.0471, 0.0717, 0.0000108, 0.509, 0.385]]))
|
||
|
||
def main():
    """Entry point: run the playlist-owner prediction experiment."""
    who_prediction()


if __name__ == "__main__":
    main()
2835bfd
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @juandes,
I read your medium post, cool stuff.
Is there any way I can contact you? I tried info@juandes.com but it failed.
You can reach me at ludo@playlistpush.com
2835bfd
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @lu3do, I'll send you an email.