# Projet de Machine Learning

Notebook R avec les codes utilisés pour le rapport final.

## Importation des librairies

In [0]:
library(vcd)
library(ggplot2)
# library(WVPlots)
library(RColorBrewer)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(glmnet)

# Obtenir les données

In [0]:
# Load the space-separated Spotify extract; the first line holds column names.
# `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
spotify_data <- read.table(file = "spotify-extr.txt", header = TRUE, sep = " ")

# Description de l'ensemble du jeu de données

In [0]:
# Preview the first six rows of the raw table.
head(spotify_data, n = 6L)

In [0]:
# Number of rows and columns of the data set.
dim(x = spotify_data)

In [0]:
# Report the storage class of every column.
sapply(X = spotify_data, FUN = class)

In [0]:
# Total number of missing cells across the whole table.
length(which(is.na(spotify_data)))

In [0]:
# Convert the categorical columns to factors.
# pop.class is made an ordered factor; the argument is spelled `ordered`
# in full (no partial matching) and `TRUE` replaces the reassignable `T`.
spotify_data$pop.class <- factor(spotify_data$pop.class, ordered = TRUE)
spotify_data$key <- factor(spotify_data$key)
spotify_data$mode <- factor(spotify_data$mode)

In [0]:
# Display the levels of each categorical column.
levels(spotify_data[["pop.class"]])
levels(spotify_data[["key"]])
levels(spotify_data[["mode"]])

In [0]:
# Stand-alone copies of the columns reused by later cells.
popularity <- spotify_data[["popularity"]]
key <- spotify_data[["key"]]
pop.class <- spotify_data[["pop.class"]]

In [0]:
# Counts and percentage share of each popularity class, side by side.
percentage <- 100 * prop.table(table(pop.class))
freqs <- cbind(freq = table(pop.class), percentage)
freqs

In [0]:
# Counts and percentage share of each musical key, side by side.
percentage <- 100 * prop.table(table(key))
freqs <- cbind(freq = table(key), percentage)
freqs

In [0]:
# Per-column summary of the data set.
summary(object = spotify_data)

# Analyses unidimensionnelle et multidimensionnelle

In [0]:
# Drop columns 8, 11 and 15 before plotting
# (presumably the categorical ones: key, mode, pop.class -- TODO confirm).
data_quant <- spotify_data[, -c(8, 11, 15)]

# One histogram per remaining column, laid out on a 4 x 3 grid.
options(repr.plot.width = 15, repr.plot.height = 20)
par(mfrow = c(4, 3))
# Iterate over the actual number of columns instead of a hard-coded 1:12.
for (i in seq_len(ncol(data_quant))) {
    # The second column gets a finer binning (100 breaks instead of 20);
    # a plain if/else replaces ifelse() since the condition is scalar.
    hist(data_quant[, i], main = paste("Histogram of", colnames(data_quant)[i]),
        breaks = if (i == 2) 100 else 20, xlab = colnames(data_quant)[i])
}

In [0]:
# Add the natural log of the duration as a new column and plot its histogram.
spotify_data$log_duration <- log(spotify_data[["duration"]])
hist(
    spotify_data[["log_duration"]],
    breaks = 50,
    main = "Histogram of Log of Duration",
    xlab = "Duration"
)

In [0]:
# Add a centred and scaled copy of tempo, then plot its histogram.
spotify_data$tempo_norm <- scale(spotify_data[["tempo"]])
hist(
    spotify_data[["tempo_norm"]],
    breaks = 50,
    main = "Histogram of Tempo Normalized",
    xlab = "Tempo"
)

In [0]:
# Add a centred and scaled copy of danceability, then plot its histogram.
spotify_data$dance_norm <- scale(spotify_data$danceability)
# Plot the freshly created dance_norm column; the previous version plotted
# tempo_norm under a "Danceability" title.
hist(spotify_data$dance_norm, main = "Histogram of Danceability Normalized", breaks = 50, 
    xlab = "Danceability")

In [0]:
# Remove columns 4, 5 and 14 (presumably the raw danceability, duration and
# tempo that were just transformed -- TODO confirm against the column order).
# NOTE: positional, so re-running this cell removes three further columns.
spotify_data <- spotify_data[, setdiff(seq_along(spotify_data), c(4, 5, 14))]
head(spotify_data, n = 6L)

In [0]:
# Number of songs in each popularity class (column `n`).
pop_vec <- dplyr::count(data.frame(pop.class), pop.class)
pop_vec

In [0]:
library(ggplot2)
# Bar chart of song counts per popularity class, with the count printed
# above each bar and the legend hidden.
p <- ggplot(pop_vec, aes(x = pop.class, y = n, fill = pop.class, label = n)) +
    geom_col() +
    geom_text(nudge_y = 20) +
    labs(title = "Songs per Popularity Class", x = "Popularity Class", y = "Song Count") +
    theme_bw() +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
p

In [0]:
# Number of songs in each musical key (column `n`).
key_vec <- dplyr::count(data.frame(key), key)

# Bar chart of song counts per key, with the count printed above each bar.
p <- ggplot(key_vec, aes(x = key, y = n, fill = key, label = n)) +
    geom_col() +
    geom_text(nudge_y = 20) +
    labs(title = "Songs per Key", x = "Key", y = "Song Count") +
    theme_bw() +
    theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
p

In [0]:
# Scatter plot with marginal histograms of popularity against year.
# ScatterHist() lives in WVPlots, which is not attached by this notebook
# (its library() call is commented out), so the call is namespace-qualified.
WVPlots::ScatterHist(spotify_data, "popularity", "year", title = "Popularity vs. Year")

In [0]:
# Scatter plot with marginal histograms of acousticness against energy.
# ScatterHist() lives in WVPlots, which is not attached by this notebook
# (its library() call is commented out), so the call is namespace-qualified.
WVPlots::ScatterHist(spotify_data, "acousticness", "energy", title = "Acousticness vs. Energy")

In [0]:
# Rebuild the quantitative table without columns 6, 9 and 12
# (presumably key, mode and pop.class after the earlier column drop -- TODO confirm).
data_quant <- spotify_data[, setdiff(seq_along(spotify_data), c(6, 9, 12))]
# Pairwise correlation matrix, drawn as coloured squares.
M <- cor(x = data_quant)
corrplot(corr = M, method = "square", col = brewer.pal(10, "RdYlBu"))

In [0]:
# NOTE(review): `attributes` is not used in this cell and shares its name with
# base::attributes(); kept because later code outside this cell may read it.
attributes <- c("acousticness", "energy", "loudness", "popularity")

# Short aliases referenced by the formula below.
loud <- spotify_data[["loudness"]]
energy <- spotify_data[["energy"]]
acoustic <- spotify_data[["acousticness"]]

# Scatter-plot matrix of popularity against the three audio features.
pairs(~ popularity + acoustic + energy + loud, data = spotify_data)

In [0]:
# Scatter-plot matrix over every column of data_quant.
pairs(~ ., data = data_quant, col = "lightskyblue")

# ACP

In [0]:
# Drop the second column of data_quant before the PCA.
# NOTE: positional, so re-running this cell removes another column.
data_quant <- data_quant[, -c(2)]
# PCA on standardised variables, keeping 6 components and without drawing
# FactoMineR's default graphs. `TRUE`/`FALSE` replace the reassignable `T`/`F`.
data.pca <- PCA(data_quant, scale.unit = TRUE, graph = FALSE, ncp = 6)

In [0]:
# Two panels side by side: the second column of the eigenvalue table as a
# bar plot, and box plots of the individuals' coordinates.
options(repr.plot.width = 16, repr.plot.height = 9)
par(mfrow = c(1, 2))
barplot(
    data.pca[["eig"]][, 2],
    main = "Proportion of Inertia",
    ylab = "Percentage"
)
boxplot(data.pca[["ind"]][["coord"]], main = "Coordinates of Individuals")

In [0]:
# Eigenvalue table of the PCA, as extracted by factoextra.
eig.val <- factoextra::get_eig(data.pca)
eig.val

In [0]:
# Variable plot followed by the individuals plot, default axes.
for_vars <- "varcor"
plot(data.pca, choix = for_vars)
plot(data.pca, choix = "ind")

In [0]:
# Variable plot on components 1 and 3.
plot(data.pca, axes = c(1, 3), choix = "varcor")

In [0]:
# Scree plot of the PCA eigenvalues, with labels on the bars.
factoextra::fviz_eig(data.pca, addlabels = TRUE)

In [0]:
# Bar plot of the third column of the eigenvalue table, with a dashed red
# reference line at 80.
barplot(eig.val[, 3], col = rgb(70, 130, 180, maxColorValue = 255))
abline(h = 80, lty = 2, col = "red")

In [0]:
# Variable-level PCA results from factoextra.
# NOTE(review): the name `var` shares its name with stats::var(); kept
# because later cells read it.
var <- factoextra::get_pca_var(data.pca)
var

In [0]:
# Variable plot coloured by cos2 on a three-colour gradient; repel = TRUE
# is passed so that labels avoid overlapping.
fviz_pca_var(
    data.pca,
    col.var = "cos2",
    gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
    repel = TRUE
)

In [0]:
# cos2 of the variables: first as a matrix plot (not a correlation matrix),
# then as a bar chart over components 1 and 2.
corrplot(var[["cos2"]], is.corr = FALSE)
fviz_cos2(data.pca, choice = "var", axes = 1:2)

# Préparation des données

In [0]:
# Pull out the two categorical columns that get integer-encoded below.
spotify_pop_class <- spotify_data[["pop.class"]]
spotify_key <- spotify_data[["key"]]

In [0]:
# Zero-based integer codes for the popularity classes.
spotify_pop_class_encoded <- as.numeric(factor(spotify_pop_class)) - 1
# Show the first ten codes (R indexing starts at 1).
spotify_pop_class_encoded[seq_len(10)]

In [0]:
# Zero-based integer codes for the musical keys.
spotify_key_encoded <- as.numeric(factor(spotify_key)) - 1
# Show the first ten codes (R indexing starts at 1).
spotify_key_encoded[seq_len(10)]

In [0]:
# Overwrite the factor columns with their numeric encodings.
spotify_data$pop.class <- spotify_pop_class_encoded
spotify_data$key <- spotify_key_encoded

In [0]:
library(purrr)
library(raster)
# Targets: class label for classification, raw popularity for regression.
y_reg <- spotify_data$popularity
y_class <- spotify_data$pop.class
# Feature table: everything except the two target columns.
X <- subset(spotify_data, select = -c(popularity, pop.class))
# y_class = flatten(y_class)
# y_reg = flatten(y_reg)

In [0]:
# Preview the first six rows of the feature table.
head(X, n = 6L)

In [0]:
# Print the first 5 regression targets and the first 15 class targets
# (R indexing starts at 1).
print(y_reg[seq_len(5)])
print(y_class[seq_len(15)])

# Apprentissage

In [0]:
# Reproducible random 80/20 split of the row indices.
set.seed(42)
test.ratio <- 0.2
nvar <- ncol(spotify_data)
npop <- nrow(spotify_data)
ntest <- ceiling(test.ratio * npop)
# Test rows are drawn at random; the training rows are all the others.
testi <- sample(seq_len(npop), ntest)
appri <- setdiff(seq_len(npop), testi)

In [0]:
# Training sample: column 12 is dropped (the original note says this removes
# the pop.class variable).
datappr <- spotify_data[appri, -12]
# Sanity check of the training sample structure.
str(object = datappr)
# Test sample, built with the same column removal.
datestr <- spotify_data[testi, -12]
# Sanity check of the test sample structure.
str(object = datestr)
# summary(datappr)
# summary(datappr)

In [0]:
# Residual plot: draws residuals `y` against predicted values `x` as blue
# dots, with a green horizontal reference line at zero.
#
# Arguments:
#   x      predicted (fitted) values, plotted on the horizontal axis.
#   y      residuals, plotted on the vertical axis.
#   titre  main title of the plot.
#   xlim   horizontal axis limits; defaults to the previously hard-coded
#          c(0, 250).
#   ylim   vertical axis limits; defaults to the previously hard-coded
#          c(-100, 100).
#
# Returns the value of abline(), i.e. NULL invisibly; called for its plot.
plot.res <- function(x, y, titre = "titre", xlim = c(0, 250), ylim = c(-100, 100)) {
    plot(x, y, col = "blue", xlim = xlim, ylim = ylim, ylab = "Résidus", 
        xlab = "Valeurs prédites", main = titre, pch = 20)
    # points(x2, y, col='red')
    abline(h = 0, col = "green")
}

In [0]:
# Fit popularity on all other training columns (via aov).
reg.lm <- aov(formula = popularity ~ ., data = datappr)

# Fitted values and residuals stored in the fitted model object.
fit.lm <- reg.lm[["fitted.values"]]
res.lm <- reg.lm[["residuals"]]

# Residual plot for this model.
plot.res(fit.lm, res.lm, "Régression linéaire sans sélection de variables")

In [0]:
# Summary table of the fitted model.
summary(object = reg.lm)

In [0]:
# Structure of the training sample (column order and types).
str(object = datappr)

In [0]:
# Lasso on a subset of columns only.
# Column 10 is the response passed as `y`, so it is excluded from the
# predictor matrix together with columns 6 and 9; leaving it in would let
# the model predict the response from itself.
reg.lasso.quanti <- glmnet(y = datappr[, 10],
                           x = as.matrix(datappr[, -c(6, 9, 10)]))
# Lasso on all variables: build the design matrix first with model.matrix
# (the `- 1` removes the intercept column).
x.mat <- model.matrix(popularity ~ . - 1, data = datappr)
reg.lasso <- glmnet(y = datappr$popularity, x = x.mat)
# Coefficient paths against lambda, with a legend mapping the curve
# numbers to the design-matrix column names.
options(repr.plot.width = 12, repr.plot.height = 10)
plot(reg.lasso, xvar = "lambda", label = TRUE)
legend("topright", 
       legend = paste(seq_len(ncol(x.mat)), " - ", colnames(x.mat)))

In [0]:
# Cross-validated lasso on the full design matrix; column 10 of the
# training sample is the response.
reg.lasso.cv <- cv.glmnet(x = x.mat, y = datappr[[10]])
# Cross-validation curve.
plot(reg.lasso.cv)

In [0]:
# Selected penalty (lambda.1se), rounded to three decimals.
paste("CV estimate of lambda :", round(reg.lasso.cv[["lambda.1se"]], digits = 3))
# Coefficients of the model at that penalty.
coef(reg.lasso.cv, s = "lambda.1se")

In [0]:
# Selected penalty (lambda.1se), rounded to three decimals.
paste("CV estimate of lambda :", round(reg.lasso.cv[["lambda.1se"]], digits = 3))
# Coefficients of the model at that penalty.
coef(reg.lasso.cv, s = "lambda.1se")
# Fitted values and residuals on the training design matrix.
# NOTE(review): predictions use lambda.min while the coefficients shown
# above are for lambda.1se -- confirm which penalty is intended.
fit.lasso <- predict(reg.lasso.cv, newx = x.mat, s = "lambda.min")
res.lasso <- datappr$popularity - fit.lasso
# Residual plots of the unpenalised and L1-penalised fits, side by side.
options(repr.plot.width = 16, repr.plot.height = 9)
par(mfrow = c(1, 2))
plot.res(fit.lm, res.lm, "linéaire")
plot.res(fit.lasso, res.lasso, "linéaire, pénalité L1")

## Régression

### Random forest

In [0]:
# Install randomForest only when it is missing, so that re-running the
# notebook does not reinstall the package every time.
if (!requireNamespace("randomForest", quietly = TRUE)) {
    install.packages("randomForest")
}

In [0]:
library(randomForest)
# Random forest regression of popularity on the training sample. The test
# sample is supplied as xtest/ytest (column 10 removed from the predictors),
# with 500 trees, a trace line every 50 trees and importance = TRUE.
rf.reg <- randomForest(
    popularity ~ .,
    data = datappr,
    xtest = datestr[, -10],
    ytest = datestr[, "popularity"],
    ntree = 500,
    do.trace = 50,
    importance = TRUE
)

In [0]:
# Random-forest predictions on the test sample.
fit.rfr <- rf.reg$test$predicted
# Residuals as observed minus predicted, the same sign convention as the
# linear and lasso residuals computed earlier in this notebook.
res.rfr <- datestr[, "popularity"] - fit.rfr
plot.res(fit.rfr, res.rfr)

In [0]:
set.seed(1)
# Search for the mtry value with the lowest out-of-bag error, starting at 2.
# - stepFactor must differ from 1: mtry is multiplied/divided by it at each
#   step, so a factor of 1 never moves away from mtryStart and nothing is
#   tuned. The package default of 2 is used.
# - ntreeTry is spelled out; the former `ntree = 500` only reached this
#   argument through partial matching.
bestMtry <- tuneRF(datappr[, -10], datappr[, "popularity"], mtryStart = 2,
    ntreeTry = 500, stepFactor = 2, improve = 1e-5, plot = FALSE)

### Decision Tree

In [0]:
library(rpart)
# Regression tree for popularity, grown with complexity parameter 0.001.
tree.reg <- rpart(
    popularity ~ .,
    data = datappr,
    control = rpart.control(cp = 0.001)
)

In [0]:
# Draw the tree skeleton, then add the node labels on top of it.
plot(x = tree.reg)
text(x = tree.reg)

In [0]:
# Cross-validated predictions from xpred.rpart.
xmat <- xpred.rpart(tree.reg)
# Squared prediction error of every observation, per column of xmat.
xerr <- (xmat - datappr[, "popularity"])^2
# Total squared error per column; the names of the result are the
# complexity values and the entries the corresponding errors.
CVerr <- apply(X = xerr, MARGIN = 2, FUN = sum)
CVerr

# xmat row i holds the predictions for observation i made by the model that
# did not use the fold containing observation i.
# xerr = (Y_i - Yhat_i)^2 for i = 1, ..., n.

# The error decreases with complexity (not exactly here, because xpred.rpart
# cross-validates on the training sample).

# The printed names are the complexity values, with the cross-validated
# error estimate next to each one. The value with the smallest error is the
# one to choose.

On cherche la valeur de *cp* correspondant à la plus petite erreur.

In [0]:
# Complexity value (stored as the element name) with the smallest error.
as.numeric(names(which.min(CVerr)))

In [0]:
# Refit the tree with the complexity value that minimises the
# cross-validated error.
tree.reg <- rpart(
    popularity ~ .,
    data = datappr,
    control = rpart.control(cp = as.numeric(names(which.min(CVerr))))
)

In [0]:
# install.packages("partykit")
library(partykit)
# Convert the rpart tree to a party object and draw it in "simple" style.
plot(partykit::as.party(tree.reg), type = "simple")

Graphe des résidus

In [0]:
# Tree predictions on the test sample.
fit.tree <- predict(tree.reg, newdata = datestr)
# Residuals as observed minus predicted, the same sign convention as the
# linear and lasso residuals computed earlier in this notebook.
res.tree <- datestr[, "popularity"] - fit.tree
plot.res(fit.tree, res.tree)

### SVR

In [0]:
library(e1071)
# Baseline SVM fit of popularity with e1071's default settings.
svm.reg0 <- svm(formula = popularity ~ ., data = datappr)
summary(object = svm.reg0)

In [0]:
# set.seed(2021)
# Grid search over cost (1 to 3.5 in steps of 0.5) and gamma
# (0.02 to 0.1 in steps of 0.02).
svm.reg.tune <- tune.svm(
    popularity ~ .,
    data = datappr,
    cost = seq(1, 3.5, by = 0.5),
    gamma = seq(0.02, 0.1, by = 0.02)
)
# Plot the tuning results over the grid.
plot(svm.reg.tune)

In [0]:
# Refit the SVM with the best cost/gamma pair found by the grid search.
svm.reg <- svm(
    popularity ~ .,
    data = datappr,
    cost = svm.reg.tune[["best.parameters"]][["cost"]],
    gamma = svm.reg.tune[["best.parameters"]][["gamma"]]
)

In [0]:
# Residuals of the tuned SVM on the training sample, and their plot.
fit.svmr <- svm.reg$fitted
# The column name is "popularity"; the former misspelling "populratiy"
# selected an undefined column and stopped with an error.
# Residuals are observed minus fitted, the same sign convention as the
# linear and lasso residuals computed earlier in this notebook.
res.svmr <- datappr[, "popularity"] - fit.svmr
plot.res(fit.svmr, res.svmr)