# Machine Learning Project

Spotify dataset

## Description des données

In [None]:
# Load the Spotify extract: a space-separated text file with a header row.
spotify_data <- read.table(
  file = "data/spotify-extr.txt",
  header = TRUE,
  sep = " "
)

In [None]:
# Preview the first rows of the data frame.
head(spotify_data)

In [None]:
# Number of rows and columns.
dim(spotify_data)

In [None]:
# Class of every column of the data frame.
sapply(spotify_data, class)

In [None]:
# Total number of missing values over the whole data frame.
sum(is.na(spotify_data))

In [None]:
# Encode the qualitative variables as factors.
# pop.class is declared ordered. No `levels` are supplied, so the ordering is
# the default sorted order of its values.
# NOTE(review): confirm that this order matches the intended popularity ranking.
spotify_data$pop.class <- factor(spotify_data$pop.class, ordered = TRUE)
spotify_data$key <- factor(spotify_data$key)
spotify_data$mode <- factor(spotify_data$mode)

In [None]:
# Show the levels of each of the three factor columns.
levels(spotify_data$pop.class)
levels(spotify_data$key)
levels(spotify_data$mode)

In [None]:
# Copy frequently used columns into standalone vectors.
pop.class <- spotify_data[["pop.class"]]
key <- spotify_data[["key"]]
popularity <- spotify_data[["popularity"]]

In [None]:
# Frequency table of the popularity classes: raw counts next to percentages.
percentage <- 100 * prop.table(table(pop.class))
freqs <- cbind(
  freq = table(pop.class),
  percentage = percentage
)
freqs

In [None]:
# Frequency table of the keys: raw counts next to percentages.
percentage <- 100 * prop.table(table(key))
freqs <- cbind(
  freq = table(key),
  percentage = percentage
)
freqs

In [None]:
# Per-column summary of the data frame.
summary(spotify_data)

## Analyses uni et multidimensionnelles

In [None]:
library(vcd)

# Keep the quantitative variables only by dropping columns 8, 11 and 15
# (presumably the qualitative key, mode and pop.class -- TODO confirm).
data_quant <- spotify_data[, -c(8, 11, 15)]

options(repr.plot.width = 15, repr.plot.height = 20)
par(mfrow = c(4, 3))
# One histogram per column of data_quant, drawn on the 4 x 3 grid.
for (i in seq_len(ncol(data_quant))) {
  var_name <- colnames(data_quant)[i]
  # The second column gets a finer binning than the others.
  n_breaks <- if (i == 2) 100 else 20
  hist(data_quant[, i],
    main = paste("Histogram of", var_name),
    breaks = n_breaks, xlab = var_name
  )
}

In [None]:
# Add the log of the duration as a new column and plot its distribution.
spotify_data$log_duration <- log(spotify_data$duration)
hist(
  spotify_data$log_duration,
  breaks = 50,
  xlab = "Duration",
  main = "Histogram of Log of Duration"
)

In [None]:
# Centre and scale the tempo. scale() returns a one-column matrix carrying
# centring/scaling attributes; as.numeric() stores a plain numeric vector so
# the data frame does not end up with a matrix column.
spotify_data$tempo_norm <- as.numeric(scale(spotify_data$tempo))
hist(spotify_data$tempo_norm,
  main = "Histogram of Tempo Normalized", breaks = 50,
  xlab = "Tempo"
)

In [None]:
# Centre and scale the danceability; as.numeric() keeps the new column a plain
# numeric vector instead of the one-column matrix returned by scale().
spotify_data$dance_norm <- as.numeric(scale(spotify_data$danceability))
# Plot the column just created (the histogram previously drew tempo_norm
# under a danceability title).
hist(spotify_data$dance_norm,
  main = "Histogram of Danceability Normalized", breaks = 50,
  xlab = "Danceability"
)

In [None]:
# Remove the columns at positions 4, 5 and 14 -- presumably the raw
# danceability, duration and tempo, now replaced by their transformed
# versions (TODO confirm). The drop is positional, so running this cell a
# second time would remove three other columns.
spotify_data <- spotify_data[-c(4, 5, 14)]
head(spotify_data)

In [None]:
# Number of songs in each popularity class, as a two-column data frame
# (pop.class, n).
pop_vec <- dplyr::count(data.frame(pop.class), pop.class)
pop_vec

In [None]:
library(ggplot2)

# Bar chart of the song count per popularity class, with the count printed
# above each bar, no legend and a centred title.
p <- ggplot(pop_vec, aes(x = pop.class, y = n, fill = pop.class, label = n)) +
  geom_col() +
  geom_text(nudge_y = 20) +
  labs(
    title = "Songs per Popularity Class",
    x = "Popularity Class",
    y = "Song Count"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
p

In [None]:
# Number of songs per key, as a two-column data frame (key, n).
key_vec <- dplyr::count(data.frame(key), key)

# Bar chart of the song count per key, with the count printed above each bar,
# no legend and a centred title.
p <- ggplot(key_vec, aes(x = key, y = n, fill = key, label = n)) +
  geom_col() +
  geom_text(nudge_y = 20) +
  labs(title = "Songs per Key", x = "Key", y = "Song Count") +
  theme_bw() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5)
  )
p

In [None]:
library(WVPlots)

# Scatter plot with marginal histograms: popularity on x, year on y.
ScatterHist(
  spotify_data,
  xvar = "popularity",
  yvar = "year",
  title = "Popularity vs. Year"
)

In [None]:
# Scatter plot with marginal histograms: acousticness on x, energy on y.
ScatterHist(
  spotify_data,
  xvar = "acousticness",
  yvar = "energy",
  title = "Acousticness vs. Energy"
)

In [None]:
library(RColorBrewer)
library(corrplot)

# Rebuild the quantitative table from the updated data frame by dropping
# columns 6, 9 and 12 (presumably key, mode and pop.class -- TODO confirm),
# then draw its correlation matrix.
data_quant <- spotify_data[-c(6, 9, 12)]
M <- cor(data_quant)
corrplot(
  M,
  method = "square",
  col = brewer.pal(n = 10, name = "RdYlBu")
)

In [None]:
# Names of the variables of interest (not used by the calls below).
attributes <- c("acousticness", "energy", "loudness", "popularity")

# Short aliases; the formula below picks them up from the workspace because
# they are not columns of spotify_data.
acoustic <- spotify_data$acousticness
energy <- spotify_data$energy
loud <- spotify_data$loudness

# Scatter-plot matrix of popularity against the three aliased variables.
pairs(~ popularity + acoustic + energy + loud, data = spotify_data)

In [None]:
# Scatter-plot matrix of every column of data_quant.
pairs(~., data = data_quant, col = "lightskyblue")

### ACP

In [None]:
library(FactoMineR)

# Drop the second column before the PCA (presumably `year` -- TODO confirm).
# The drop is positional, so re-running this cell removes a further column.
data_quant <- data_quant[, -c(2)]
# PCA on unit-scaled variables, keeping 6 components, without drawing the
# default graphs.
data.pca <- PCA(data_quant, scale.unit = TRUE, graph = FALSE, ncp = 6)

In [None]:
# Two panels side by side: the second column of the PCA eigenvalue table as a
# bar plot, and box plots of the individuals' coordinates on each component.
options(repr.plot.height = 9, repr.plot.width = 16)
par(mfrow = c(1, 2))
barplot(
  height = data.pca$eig[, 2],
  main = "Proportion of Inertia",
  ylab = "Percentage"
)
boxplot(
  x = data.pca$ind$coord,
  main = "Coordinates of Individuals"
)

In [None]:
library(factoextra)

# Extract the eigenvalue table of the PCA and display it.
eig.val <- get_eig(data.pca)
eig.val

In [None]:
# PCA graphs on the default axes: first with choix = "varcor" (presumably the
# variables' correlation circle), then with choix = "ind" (individuals).
plot(data.pca, choix = "varcor")
plot(data.pca, choix = "ind")

In [None]:
# Same "varcor" graph, this time on components 1 and 3.
plot(data.pca, choix = "varcor", axes = c(1, 3))

In [None]:
# Scree plot of the PCA eigenvalues, with value labels on the bars.
fviz_eig(data.pca, addlabels = TRUE)

In [None]:
# Bar plot of the third column of the eigenvalue table (presumably the
# cumulative percentage of variance -- TODO confirm), with a dashed red
# reference line at 80.
barplot(eig.val[, 3], col = rgb(70, 130, 180, maxColorValue = 255))
abline(h = 80, lty = 2, col = "red")

In [None]:
# Extract the variable-level results of the PCA and display them.
# NOTE(review): the name `var` is also the name of the stats::var function.
var <- get_pca_var(data.pca)
var

In [None]:
# Variables graph of the PCA, coloured by cos2 on a three-colour gradient,
# with repel = TRUE for the labels.
fviz_pca_var(data.pca, col.var = "cos2", gradient.cols = c("#00AFBB", "#E7B800", 
    "#FC4E07"), repel = TRUE)

In [None]:
# Display the variables' cos2 matrix (not a correlation matrix, hence
# is.corr = FALSE), then the variables' cos2 over components 1 and 2.
corrplot(var$cos2, is.corr = FALSE)
fviz_cos2(data.pca, choice = "var", axes = 1:2)

## Apprentissage

In [None]:
# Reproducible random split: 20% of the rows (rounded up) go to the test set,
# the remaining rows to the training set.
set.seed(42)
test.ratio <- 0.2
npop <- nrow(spotify_data)
nvar <- ncol(spotify_data)
ntest <- ceiling(test.ratio * npop)
testi <- sample(seq_len(npop), size = ntest)
appri <- setdiff(seq_len(npop), testi)

In [None]:
# Build the training sample: training rows, column 12 removed
# (presumably pop.class -- TODO confirm).
datappr <- spotify_data[appri, -12]
# Build the test sample: test rows, same column removed.
datestr <- spotify_data[testi, -12]
# Check the structure of both samples.
str(datappr)
str(datestr)
# summary(datappr)

In [None]:
# Residual plot: residuals against predicted values, with a horizontal
# reference line at zero.
#
# Args:
#   x:     predicted (fitted) values, drawn on the horizontal axis.
#   y:     residuals, drawn on the vertical axis.
#   titre: main title of the plot.
#   xlim:  horizontal axis limits; defaults to the original fixed c(0, 250).
#   ylim:  vertical axis limits; defaults to the original fixed c(-100, 100).
#
# The defaults reproduce the previous hard-coded window, so existing calls are
# unaffected; pass xlim/ylim to fit the window to the data being plotted.
plot.res <- function(x, y, titre = "titre",
                     xlim = c(0, 250), ylim = c(-100, 100)) {
  plot(x, y,
    col = "blue", xlim = xlim, ylim = ylim, ylab = "Résidus",
    xlab = "Valeurs prédites", main = titre, pch = 20
  )
  abline(h = 0, col = "green")
}

In [None]:
# Fit popularity on every other column of the training sample.
reg.lm <- aov(formula = popularity ~ ., data = datappr)

# Fitted values and residuals of that model.
fit.lm <- reg.lm$fitted.values
res.lm <- reg.lm$residuals

# Residual plot.
plot.res(
  x = fit.lm,
  y = res.lm,
  titre = "Régression linéaire sans sélection de variables"
)

In [None]:
# Summary of the model fitted with aov().
summary(reg.lm)

In [None]:
# Structure of the training sample (column order and types).
str(datappr)

In [None]:
library(glmnet)

# Lasso on the quantitative predictors only. Columns 6 and 9 are dropped
# (presumably the qualitative key and mode -- TODO confirm), and so is
# column 10: it is the response passed as `y`, and leaving it in `x` would let
# the model predict the response from itself.
reg.lasso.quanti <- glmnet(
  y = datappr[, 10],
  x = as.matrix(datappr[, -c(6, 9, 10)])
)
# With all variables: build the design matrix first with model.matrix
# (the `- 1` in the formula removes the intercept column).
x.mat <- model.matrix(popularity ~ . - 1, data = datappr)
reg.lasso <- glmnet(y = datappr$popularity, x = x.mat)
options(repr.plot.width = 12, repr.plot.height = 10)
# Coefficient paths against lambda; the legend maps each path number to the
# corresponding column of the design matrix.
plot(reg.lasso, xvar = "lambda", label = TRUE)
legend("topright",
  legend = paste(seq_len(ncol(x.mat)), " - ", colnames(x.mat))
)

In [None]:
# Cross-validated lasso on the full design matrix, then its
# cross-validation curve.
reg.lasso.cv <- cv.glmnet(x = x.mat, y = datappr[, 10])
plot(reg.lasso.cv)

In [None]:
# Selected penalty: the lambda.1se value of the cross-validated fit,
# rounded to 3 decimals.
paste("CV estimate of lambda :", round(reg.lasso.cv$lambda.1se, 3))
# Coefficients of the corresponding model (at lambda.1se).
coef(reg.lasso.cv, s = "lambda.1se")

In [None]:
# Fitted values and residuals of the cross-validated lasso on the training
# sample.
# NOTE(review): predictions are taken at lambda.min, while the lambda reported
# for this model elsewhere in the notebook is lambda.1se -- confirm which one
# is intended.
fit.lasso <- predict(reg.lasso.cv, newx = x.mat, s = "lambda.min")
res.lasso <- datappr$popularity - fit.lasso
# Residual plots of the two models, side by side.
options(repr.plot.height = 9, repr.plot.width = 16)
par(mfrow = c(1, 2))
plot.res(x = fit.lm, y = res.lm, titre = "linéaire")
plot.res(x = fit.lasso, y = res.lasso, titre = "linéaire, pénalité L1")