Skip to content

Commit

Permalink
Initial commit, added scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
juandes committed May 13, 2017
0 parents commit 2835bfd
Show file tree
Hide file tree
Showing 3 changed files with 334 additions and 0 deletions.
193 changes: 193 additions & 0 deletions analysis.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# library() stops with an error when a package is missing; require() would
# only warn and let the script fail later at the first ggplot()/melt() call.
library(ggplot2)
library(reshape2)

# To avoid scientific notation
options(scipen=999)

# NOTE: everything named df.1 is my playlist, everything named df.2 is hers
df.1 <- read.csv("me.csv")
df.2 <- read.csv("her.csv")

# Keep only the rows (songs) without a missing value in any column
df.1 <- df.1[complete.cases(df.1),]
df.2 <- df.2[complete.cases(df.2),]

# Label of each of the first 13 columns, in column order. The rows labelled
# 'i' are the ones selected further down with `type == 'i'` (the "implicit
# features" section); 'e' labels the remaining columns.
feature.types <- c('i', 'i', 'e', 'i', 'i', 'i',
                   'e', 'i', 'e', 'e', 'e', 'i', 'e')

# Summarise the labelled columns of one playlist.
#
# df: playlist data frame whose first length(feature.types) columns are the
#     numeric audio features.
# Returns a data frame with one row per feature and the columns `feature`
# (column name), `mean`, `sd` and `type` (label from feature.types).
feature.stats <- function(df) {
  features <- df[seq_along(feature.types)]
  stats <- data.frame(feature = colnames(features),
                      # vapply() guarantees one number per column
                      mean = vapply(features, mean, numeric(1)),
                      sd = vapply(features, sd, numeric(1)),
                      type = feature.types)
  rownames(stats) <- NULL
  stats
}

df.1.stats <- feature.stats(df.1)
df.2.stats <- feature.stats(df.2)
print(df.1.stats)

# Implicit features
# Mean of features

# Bar chart of the mean of every feature in `stats`, ordered from the highest
# mean to the lowest.
#
# stats:       data frame with the columns `feature` and `mean`
# bar.fill:    fill colour of the bars
# chart.title: title of the chart
means.chart <- function(stats, bar.fill, chart.title) {
  ggplot(stats, aes(x = reorder(feature, -mean), y = mean)) +
    geom_bar(stat = 'identity', fill = bar.fill) +
    labs(title = chart.title, x = "Features", y = "Mean") +
    theme(axis.text = element_text(colour = 'black'),
          axis.text.x = element_text(angle = 90, hjust = 1))
}

# Keep only the rows labelled 'i'
df.1.i <- subset(df.1.stats, type == 'i')
df.2.i <- subset(df.2.stats, type == 'i')

means.chart(df.1.i, '#00BFC4', "Mean value of audio features of my playlist")
means.chart(df.2.i, '#F8766D', "Mean value of audio features of her playlist")

# Per-feature gap between both playlists (mine minus hers); `who` names the
# playlist with the higher mean ('her' also covers a gap of exactly zero)
mean.gap <- df.1.i$mean - df.2.i$mean
df.mean.difference <- data.frame(feature = df.1.i$feature,
                                 difference = mean.gap)
df.mean.difference$who <- ifelse(mean.gap > 0, 'me', 'her')

ggplot(df.mean.difference,
       aes(x = reorder(feature, -difference), y = difference, fill = who)) +
  geom_bar(stat = 'identity') +
  labs(title = "Difference between audio features mean of my songs and hers",
       x = "Feature",
       y = "Difference") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Sparsity, or variety of playlists

# Bar chart of the standard deviation of every feature in `stats`, ordered
# from the highest standard deviation to the lowest.
#
# stats:       data frame with the columns `feature` and `sd`
# bar.fill:    fill colour of the bars
# chart.title: title of the chart
sd.chart <- function(stats, bar.fill, chart.title) {
  ggplot(stats, aes(x = reorder(feature, -sd), y = sd)) +
    geom_bar(stat = 'identity', fill = bar.fill) +
    labs(title = chart.title, x = "Feature", y = "Standard Deviation") +
    theme(axis.text = element_text(colour = 'black'),
          axis.text.x = element_text(angle = 90, hjust = 1))
}

sd.chart(df.1.i, '#00BFC4',
         "Standard deviation of the audio features scores of my playlist")

# Sum of the standard deviations of my playlist
sum(df.1.i$sd)

sd.chart(df.2.i, '#F8766D',
         "Standard deviation of the audio features scores of her playlist")

# Sum of the standard deviations of her playlist
sum(df.2.i$sd)

# Columns shown in the boxplots
box.features <- c('energy', 'liveness', 'speechiness',
                  'acousticness', 'instrumentalness',
                  'danceability', 'valence')

# melt() stacks the selected columns into `variable`/`value` pairs, so every
# feature gets its own box. One boxplot for my playlist, one for hers.
df.1.long <- melt(df.1[, box.features])

ggplot(df.1.long, aes(x = factor(variable), y = value)) +
  geom_boxplot()

df.2.long <- melt(df.2[, box.features])

ggplot(df.2.long, aes(x = factor(variable), y = value)) +
  geom_boxplot()

## Correlations

# Positions of the columns used for the correlation matrices and for the
# overall mean/sd at the end of this section
score.cols <- c(1, 2, 4:6, 8, 12)

# Complete a base plot with its points, a loess trend line (without the
# confidence band) and a title.
scatter.with.loess <- function(base.plot, chart.title) {
  base.plot +
    geom_point() +
    geom_smooth(method = "loess", se = FALSE) +
    ggtitle(chart.title)
}

df.1.cor <- cor(df.1[score.cols])

scatter.with.loess(ggplot(df.1, aes(x = energy, y = danceability)),
                   "Correlation between danceability and energy (me)")

scatter.with.loess(ggplot(df.1, aes(x = energy, y = acousticness)),
                   "Correlation between acousticness and energy (me)")

df.2.cor <- cor(df.2[score.cols])

scatter.with.loess(ggplot(df.2, aes(x = energy, y = valence)),
                   "Correlation between valence and energy (her)")

scatter.with.loess(ggplot(df.2, aes(x = energy, y = acousticness)),
                   "Correlation between acousticness and energy (her)")

# mean and sd of all scores

# All values of the selected columns of `df` as one plain vector
flatten.scores <- function(df) {
  as.vector(t(df[score.cols]))
}

mean(flatten.scores(df.1))

mean(flatten.scores(df.2))

sd(flatten.scores(df.1))

sd(flatten.scores(df.2))




# Boringness

# Boringness score of every song in `df`.
#
# NOTE: for loudness, the higher the value, the louder the song.
# NOTE: the lower the boringness score, the more boring the song.
#
# df: object with the elements `loudness`, `energy`, `danceability` and
#     `tempo` (e.g. a playlist data frame).
# Returns loudness + energy * 100 + danceability * 100 + tempo, computed
# element-wise.
boringness <- function(df){
  score <- df$loudness
  score <- score + df$energy * 100
  score <- score + df$danceability * 100
  score + df$tempo
}

# Boringness score of each of my songs, together with its Spotify URI
boring.1 <- data.frame(boringness = boringness(df.1), uri = df.1$uri, who = 'me')

# The 30 lowest and the 30 highest scores. Sorted with base order() because
# arrange() and desc() come from dplyr (plyr has them too), which this script
# never attaches, so calling them stops with "could not find function".
head(boring.1[order(boring.1$boringness), ], 30)
head(boring.1[order(-boring.1$boringness), ], 30)
summary(boring.1)

ggplot(boring.1, aes(boringness)) +
  geom_histogram() +
  ggtitle("Histogram of the boringness score (me)")

# Same score for her songs
boring.2 <- data.frame(boringness = boringness(df.2), uri = df.2$uri, who = 'her')

ggplot(boring.2, aes(boringness)) +
  geom_histogram() +
  ggtitle("Histogram of the boringness score (her)")

# Both playlists in one chart; position = 'identity' overlays the two
# histograms instead of stacking them, alpha keeps both visible
boring.total <- rbind(boring.2, boring.1)
ggplot(boring.total, aes(x = boringness, fill = who)) +
  geom_histogram(alpha=0.6, position='identity')

# head() default: the 6 lowest and the 6 highest scores of her playlist
head(boring.2[order(boring.2$boringness), ])
head(boring.2[order(-boring.2$boringness), ])

summary(boring.2)

# Mean and standard deviation of the score per playlist
boring.stats <- data.frame(mean = c(mean(boring.1$boringness), mean(boring.2$boringness)),
                           sd = c(sd(boring.1$boringness), sd(boring.2$boringness)),
                           who = c('me', 'her'))


ggplot(boring.stats, aes(x = who,y = mean,fill = who)) +
  geom_bar(stat="identity") +
  ggtitle("Boringness Score Mean")

ggplot(boring.stats, aes(x = who,y = sd,fill = who)) +
  geom_bar(stat="identity") +
  ggtitle("Boringness Score standard deviation")

# Write a csv with just the implicit audio features

# Columns exported to the csv, in output order
implicit.cols <- c('energy', 'liveness', 'speechiness', 'acousticness',
                   'instrumentalness', 'danceability', 'valence')

# Select the exported columns of one playlist and tag every row with its owner
tag.implicit <- function(df, owner) {
  picked <- df[implicit.cols]
  picked$who <- owner
  picked
}

implicit.1 <- tag.implicit(df.1, 'me')
implicit.2 <- tag.implicit(df.2, 'her')

# My songs first, then hers; row names are left out of the file
implicit <- rbind(implicit.1, implicit.2)
write.table(implicit, file='implicit_features.csv', row.names = FALSE, sep=',')



100 changes: 100 additions & 0 deletions get_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import argparse
import pprint
import sys
import os
import subprocess
import json
import spotipy
import spotipy.util as util
import pandas as pd

from spotipy.oauth2 import SpotifyClientCredentials


# Credentials manager shared by main() when it builds the Spotify client.
# NOTE(review): it is constructed without arguments, so the client id/secret
# presumably come from the environment (SPOTIPY_CLIENT_ID /
# SPOTIPY_CLIENT_SECRET) -- confirm against the spotipy documentation.
client_credentials_manager = SpotifyClientCredentials()


def get_playlist_content(username, playlist_id, sp):
    """Download every item of a playlist and save the list as JSON.

    The playlist is requested from ``sp.user_playlist_tracks`` in pages of
    100 items until a page reports no ``next`` page. The collected items are
    written to a file named ``<username>-<playlist_id>`` (no extension),
    relative to the current working directory.

    :param username: owner of the playlist
    :param playlist_id: id of the playlist
    :param sp: client object providing ``user_playlist_tracks``
    """
    page_size = 100
    position = 0
    collected = []
    has_more = True
    while has_more:
        page = sp.user_playlist_tracks(username, playlist_id, fields=None,
                                       limit=page_size, offset=position,
                                       market=None)
        collected.extend(page['items'])
        has_more = page['next'] is not None
        position += page_size

    target = '{}-{}'.format(username, playlist_id)
    with open(target, 'w') as outfile:
        json.dump(collected, outfile)


def get_playlist_audio_features(username, playlist_id, sp):
    """Save the audio features of every track of a playlist as CSV.

    The playlist is read from ``sp.user_playlist_tracks`` in pages of 100
    items, the features are requested from ``sp.audio_features`` in batches
    of 50 track ids, and the result is written to
    ``<username>-<playlist_id>.csv`` relative to the current working
    directory, one row per track and without an index column.

    Playlist entries without a track or without a track id (such as local
    files) are skipped, and so are tracks for which ``sp.audio_features``
    returns ``None`` instead of a feature dict. Without these checks a
    single such entry aborts the whole export.
    NOTE(review): the Spotify API is documented to return null entries for
    tracks it has no features for -- confirm against the API reference.

    :param username: owner of the playlist
    :param playlist_id: id of the playlist
    :param sp: client object providing ``user_playlist_tracks`` and
        ``audio_features``
    """
    # Keys read from each feature dict; also the CSV header, in this order.
    columns = ['energy', 'liveness', 'tempo', 'speechiness',
               'acousticness', 'instrumentalness',
               'time_signature', 'danceability',
               'key', 'duration_ms', 'loudness',
               'valence', 'mode', 'type', 'uri']

    offset = 0
    songs = []
    while True:
        content = sp.user_playlist_tracks(username, playlist_id, fields=None,
                                          limit=100, offset=offset,
                                          market=None)
        songs += content['items']
        if content['next'] is not None:
            offset += 100
        else:
            break

    ids = [item['track']['id'] for item in songs
           if item.get('track') and item['track'].get('id')]

    audio_features = []
    for index in range(0, len(ids), 50):
        audio_features += sp.audio_features(ids[index:index + 50])

    features_list = [[features[column] for column in columns]
                     for features in audio_features
                     if features is not None]

    df = pd.DataFrame(features_list, columns=columns)
    df.to_csv('{}-{}.csv'.format(username, playlist_id), index=False)


def get_user_playlist(username, sp):
    """Print name, number of songs and id of each playlist of a user.

    The playlists are requested from ``sp.user_playlists`` page by page (50
    per request) until a page reports no ``next`` page, so users with more
    playlists than fit in one page are listed completely.

    :param username: user whose playlists are listed
    :param sp: client object providing ``user_playlists``
    """
    limit = 50
    offset = 0
    while True:
        playlists = sp.user_playlists(username, limit=limit, offset=offset)
        for playlist in playlists['items']:
            name = playlist['name']
            # Python 2 hands over ``unicode`` names, which are encoded to
            # UTF-8 for printing as before. On Python 3 the name already is
            # ``str``; encoding it would print a b'...' repr.
            if not isinstance(name, str):
                name = name.encode('utf8')
            print("Name: {}, Number of songs: {}, Playlist ID: {} ".
                  format(name,
                         playlist['tracks']['total'],
                         playlist['id']))
        if playlists.get('next') is None:
            break
        offset += limit


def main(username, playlist):
    """List the user's playlists, then export one playlist.

    Builds a Spotify client from the module-level credentials manager and
    runs the three steps in order: print the user's playlists, dump the
    content of ``playlist`` as JSON and write its audio features as CSV.

    :param username: owner of the playlist
    :param playlist: id of the playlist to export
    """
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statements are a syntax error on Python 3.
    print("Getting user playlist")
    get_user_playlist(username, sp)
    print("Getting playlist content")
    get_playlist_content(username, playlist, sp)
    print("Getting playlist audio features")
    get_playlist_audio_features(username, playlist, sp)


if __name__ == '__main__':
    print('Starting...')
    parser = argparse.ArgumentParser(description='description')
    # Both values go straight into the Spotify calls made by main(), so a
    # missing one now ends with a usage message instead of sending None to
    # the API.
    parser.add_argument('--username', required=True, help='username')
    parser.add_argument('--playlist', required=True, help='playlist id')
    args = parser.parse_args()
    main(args.username, args.playlist)
41 changes: 41 additions & 0 deletions who_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics


def who_prediction():
    """Fit a classifier that predicts whose playlist a song belongs to.

    Reads ``implicit_features.csv``, uses its first seven columns as
    predictors and the ``who`` column as label, and grid-searches an
    ``SGDClassifier`` with 5-fold cross validation scored by accuracy.
    Prints the best score and the best parameters, followed by the
    prediction for two hard-coded songs.
    """
    df = pd.read_csv('implicit_features.csv')
    # data is a data frame consisting of the predictors (columns 1 to 7)
    data = df[df.columns[:7]]
    labels = df.who

    # NOTE(review): 'n_iter' and loss='log' are kept exactly as written.
    # Newer scikit-learn releases are believed to have renamed them
    # ('max_iter', 'log_loss') -- confirm against the installed version
    # before changing them.
    parameters = {
        'alpha': (0.001, 0.0001, 0.00001, 0.000001),
        'penalty': ('l2', 'elasticnet',),
        'n_iter': (10, 50, 100),
        'loss': ('log',)
    }

    # Perform a grid search with cross validation to search for the best
    # parameters.
    grid_search = GridSearchCV(SGDClassifier(), parameters, n_jobs=-1,
                               verbose=1, cv=5, scoring='accuracy')
    grid_search.fit(data, labels)
    # print(...) with a single argument works on Python 2 and 3 alike.
    print("Best score: {}".format(grid_search.best_score_))
    # best_params_ is the entry of cv_results_['params'] at best_index_
    print("Best parameters: {}".format(grid_search.best_params_))
    # pd.DataFrame(grid_search.cv_results_)
    # grid_search.best_estimator_.coef_

    # features order: energy, liveness, speechiness, acousticness,
    # instrumentalness, danceability, and valence
    # A Better Beginning (Mass Effect Andromeda OST),
    # Spotify track ID: 4dU7fHmu3y9CrOTotmjkgf
    print(grid_search.predict([[0.266, 0.0944, 0.0380, 0.579, 0.923, 0.248, 0.0483]]))
    # Love On The Brain (Rihanna)
    print(grid_search.predict([[0.637, 0.0789, 0.0471, 0.0717, 0.0000108, 0.509, 0.385]]))


def main():
    """Entry point of the script: run the playlist-owner prediction."""
    who_prediction()


# Run the prediction only when the file is executed as a script
if __name__ == "__main__":
    main()

2 comments on commit 2835bfd

@lu3do
Copy link

@lu3do lu3do commented on 2835bfd Oct 1, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @juandes,

I read your medium post, cool stuff.
Is there any way I can contact you? I tried info@juandes.com, but it failed.

You can reach me at ludo@playlistpush.com

@juandes
Copy link
Owner Author

@juandes juandes commented on 2835bfd Oct 2, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @lu3do, I'll send you an email.

Please sign in to comment.