In [None]:
# Start from an empty workspace so earlier objects cannot leak into the analysis
rm(list = ls())

# Attach funFEM without its start-up messages, then load the velib data set
suppressPackageStartupMessages(library(funFEM))
data(velib)

In [None]:
# Data preparation: loading matrix with one row per station, labelled by name
x <- as.matrix(velib$data)
# colnames(x) <- 1:ncol(x)
rownames(x) <- velib$names

# Station selection: every row is kept (no actual subsampling is done here)
set.seed(0)
n <- nrow(x)
stations <- seq_len(n)
coord <- velib$position[stations, ]

# Keep columns 14 to 181 (168 time points = 7 days x 24 points per day);
# the first 13 columns are dropped
dates <- 14:181
x <- x[stations, dates]
colnames(x) <- seq_along(dates)

In [None]:
# Index of the first time point of each of the 7 days (24 points per day)
timeTick <- seq(from = 1, by = 24, length.out = 7)
par(mfrow = c(1, 1))
options(repr.plot.width = 15, repr.plot.height = 6)

# Loading profile of the first station, with dotted lines at day boundaries
plot(x[1, ], type = "l", col = "blue", ylim = c(0, 1),
    xlab = "Time", ylab = "Loading", main = rownames(x)[1])
abline(v = timeTick, lty = "dotted")

In [None]:
# 4 x 4 grid showing the loading profiles of the first 16 stations
par(mfrow = c(4, 4))
options(repr.plot.width = 15, repr.plot.height = 15)
for (i in seq_len(16)) {
    plot(x[i, ], type = "l", col = "blue", ylim = c(0, 1),
        xlab = "Time", ylab = "Loading", main = rownames(x)[i])
    abline(v = timeTick, lty = "dotted")
}

In [None]:
# One box per time point (column of x), summarising the loading over stations
options(repr.plot.width = 16, repr.plot.height = 9)
boxplot(x)
abline(v = timeTick, col = "blue", lty = "dotted", lwd = 2)

In [None]:
# Loading at time t + h against loading at time t, over all stations
h <- 1  # lag between the two time points
t <- 1  # reference time index

# Correlation between the two columns, rounded to 2 digits for the title
mycor <- round(cor(x[, t], x[, t + h]), digits = 2)
plot(x[, t], x[, t + h],
    main = paste("Loading at time", t + h, "vs loading at time", t, ", correlation:", mycor))

In [None]:
# Same lag-h scatter plot for time indices 5 to 12, laid out on a 2 x 4 grid
par(mfrow = c(2, 4))
options(repr.plot.width = 15, repr.plot.height = 8)
for (t in 5:12) {
    plot(x[, t], x[, t + h], ylab = "t + h", xlab = "t")
}

In [None]:
library(corrplot)

# Correlation matrix between time points (columns of x)
cormat <- cor(x)

# Left: full matrix without text labels; right: zoom on the first 24 time points
par(mfrow = c(1, 2))
corrplot(cormat, method = "color", tl.pos = "n")
corrplot(cormat[1:24, 1:24], method = "color")

In [None]:
# This cell should display an interactive map of the stations.
# Markers are red when velib$bonus == 1 (legend entry "hill") and blue
# otherwise (legend entry "no hill"); hovering a marker shows the station name.

library(sp)
library(leaflet)
library(leaflet.extras)

longitude <- velib$position$longitude
latitude <- velib$position$latitude

# Turn the coordinate columns into a spatial object that is handed to leaflet()
df <- data.frame(longitude, latitude)
coordinates(df) <- ~longitude + latitude

# `color` is spelled out in full rather than relying on partial matching of `col`
leaflet(df) %>%
    addTiles() %>%
    addCircleMarkers(radius = 1.1, color = ifelse(velib$bonus == 1, "red", "blue"),
        label = velib$names) %>%
    addFullscreenControl() %>%
    addLegend("topright", labels = c("hill", "no hill"), colors = c("red", "blue"))

In [None]:
# Static map of the stations: colour and plotting symbol both encode velib$bonus
# (bonus == 1 -> red, symbol 2; bonus == 0 -> blue, symbol 1)
plot(longitude, latitude, xlab = "Longitude", ylab = "Latitude",
    col = ifelse(velib$bonus == 1, "red", "blue"), pch = velib$bonus + 1)
legend("topright", legend = c("Not On Hill", "On Hill"), pch = c(1, 2),
    col = c("blue", "red"))

In [None]:
# Loading distribution per time point, restricted to stations with bonus == 1
options(repr.plot.width = 16, repr.plot.height = 9)
colline <- which(velib$bonus == 1)
boxplot(x[colline, ], medcol = "#cb4154")
abline(v = timeTick, col = "blue", lty = "dotted", lwd = 3)

In [None]:
# Loading distribution per time point, restricted to stations with bonus == 0
options(repr.plot.width = 16, repr.plot.height = 9)
no.colline <- which(velib$bonus == 0)
boxplot(x[no.colline, ], medcol = "#cb4154")
abline(v = timeTick, lty = "dotted", col = "blue", lwd = 3)

### PCA

In [None]:
library(FactoMineR)
library(factoextra)

ncp <- 15  # number of principal components kept in the PCA result
# Replace the station names by row indices before running the PCA
rownames(x) <- seq_len(nrow(x))
# PCA on the scaled time-point variables, without drawing the default graphs
PCA.out <- PCA(x, scale.unit = TRUE, ncp = ncp, graph = FALSE)

In [None]:
eig.val <- get_eig(PCA.out)

options(repr.plot.width = 16, repr.plot.height = 9)

# First 15 entries of column 2 (plotted as per-component percentage) and
# column 3 (plotted as cumulated percentage) of the eigenvalue table
var_pct <- eig.val[1:15, 2]
cum_pct <- eig.val[1:15, 3]

# Bar chart per component, with the value printed above each bar and a line
# joining the bar centres
bp1 <- barplot(var_pct, col = "#4682B4", ylim = c(0, 45),
    ylab = "Explained variance ratio")
text(bp1, var_pct + 2, labels = paste(round(var_pct, 2), "%"))
lines(bp1, var_pct)

# Same layout for the cumulated values
bp2 <- barplot(cum_pct, col = "#4682B4", ylim = c(0, 105), ylab = "Shared variance")
text(bp2, cum_pct + 2, labels = paste(round(cum_pct, 1), "%"))
lines(bp2, cum_pct)

In [None]:
# Spread of the station coordinates on each of the first 15 principal components
boxplot(x = PCA.out$ind$coord[, 1:15])

In [None]:
# Variable (time point) plot of the PCA on the default pair of axes
fviz_pca_var(X = PCA.out)

In [None]:
# Variable plot on components 1 and 3, drawn semi-transparent
fviz_pca_var(PCA.out, alpha = 0.4, axes = c(1, 3))

In [None]:
# Plain biplot: stations drawn as points together with the variables
fviz_pca_biplot(PCA.out, geom = "point")

# Styled biplot: black station points, variables coloured and faded by "contrib"
fviz_pca_biplot(
    PCA.out,
    geom = "point",
    repel = TRUE,
    col.ind = "black",
    pointshape = 21,
    pointsize = 2,
    col.var = "contrib",
    alpha.var = "contrib",
    gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
    legend.title = list(color = "Contrib", alpha = "Contrib")
)

In [None]:
# Stations in the PCA plane, coloured by the bonus flag ("Hill" when bonus == 1)
fviz_pca_ind(
    PCA.out,
    geom = "point",
    col.ind = ifelse(velib$bonus == 1, "Hill", "No Hill")
)

### Hierarchical agglomerative clustering (CAH) on raw data

In [None]:
options(repr.plot.width = 16, repr.plot.height = 12)

# Hierarchical clustering of the stations: Euclidean distance, "ward.D2" linkage
CAHclust <- hclust(dist(x, method = "euclidean"), method = "ward.D2")

# Dendrogram without leaf labels, with a red reference line at height 40
plot(CAHclust, labels = FALSE)
abline(h = 40, col = "red")

# Partition of the stations into 5 clusters
CAHcut <- cutree(CAHclust, k = 5)

In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)
# The 16 largest merge heights of the dendrogram, in decreasing order
heights <- sort(CAHclust$height, decreasing = TRUE)
plot(heights[1:16])

In [None]:
# fviz_cluster(list(data = velib$position, cluster = CAHclust)) # Use for
# the PCA data
library(dplyr)
library(ggplot2)

options(repr.plot.width = 16, repr.plot.height = 12)

# Station positions with their cluster label, drawn as a coloured scatter plot
Stations_CAHclust <- velib$position %>%
    mutate(cluster = CAHcut)
ggplot(Stations_CAHclust, aes(x = longitude, y = latitude, color = factor(cluster))) +
    geom_point()

In [None]:
timeTick <- 1 + 24 * (0:6)  # vector corresponding to the beginning of days
par(mfrow = c(3, 2))
colours <- c("red", "blue", "green", "orange", "pink")

options(repr.plot.width = 20, repr.plot.height = 10)

# One panel per cluster: loading distribution at each time point for the
# stations of that cluster. Rows are selected with the label vector itself, so
# the labels are not drawn as an extra box and no column index is hard-coded;
# drop = FALSE keeps one box per time point even for a single-station cluster.
for (i in seq_len(5)) {
    boxplot(x[CAHcut == i, , drop = FALSE], xlab = "Time", ylab = "Loading",
        col = colours[i], ylim = c(0, 1))
    abline(v = timeTick, lty = "dotted")
}

### Hierarchical agglomerative clustering (CAH) on the PCA results

In [None]:
# Hierarchical clustering on the first 5 principal-component coordinates
# (Euclidean distance, "ward.D2" linkage)
CAHclustACP <- hclust(
    dist(PCA.out$ind$coord[, 1:5], method = "euclidean"),
    method = "ward.D2"
)

# Dendrogram without leaf labels, with a red reference line at height 100
plot(CAHclustACP, labels = FALSE)
abline(h = 100, col = "red")

# Partition of the stations into 5 clusters
CAHcutACP <- cutree(CAHclustACP, k = 5)

In [None]:
options(repr.plot.width = 12, repr.plot.height = 8)
# The 16 largest merge heights of the PCA-based dendrogram, in decreasing order
heights <- sort(CAHclustACP$height, decreasing = TRUE)
plot(heights[1:16])

In [None]:
timeTick <- 1 + 24 * (0:6)  # vector corresponding to the beginning of days
par(mfrow = c(3, 2))
colours <- c("red", "blue", "green", "orange", "pink")

options(repr.plot.width = 20, repr.plot.height = 10)

# One panel per PCA-based cluster: loading distribution at each time point for
# the stations of that cluster. Rows are selected with the label vector itself,
# so the labels are not drawn as an extra box and no column index is
# hard-coded; drop = FALSE keeps one box per time point for a one-row subset.
for (i in seq_len(5)) {
    boxplot(x[CAHcutACP == i, , drop = FALSE], xlab = "Time", ylab = "Loading",
        col = colours[i], ylim = c(0, 1))
    abline(v = timeTick, lty = "dotted")
}

In [None]:
# Cross-tabulate the raw-data partition (rows) against the PCA-based one
# (columns); they seem to correspond up to a permutation of the labels
tabCompCluster <- table(CAHcut, CAHcutACP)
tabCompCluster

# For each PCA-based cluster, the raw-data cluster it overlaps most
# (permutation maximizing the diagonal terms)
clusterPerm <- apply(tabCompCluster, MARGIN = 2, FUN = which.max)
clusterPerm

# Same table after relabelling the PCA-based clusters
table(CAHcut, clusterPerm[CAHcutACP])

### K-Means

#### Raw Data

In [None]:
# k-means on the raw loading profiles with K clusters
K <- 6
reskm <- kmeans(x = x, centers = K)

In [None]:
# Elbow curve: within-cluster sum of squares relative to the total sum of
# squares, for 1 to 9 clusters.
k_values <- 1:9

# Fit k-means once per candidate number of clusters
km_fits <- lapply(k_values, function(k) kmeans(x, centers = k))
new_k <- vapply(km_fits, function(fit) fit$tot.withinss, numeric(1))

# The total sum of squares depends only on the data, so it is read once
# instead of being accumulated over the nine fits
somme <- km_fits[[1]]$totss

plot(x = k_values, y = new_k / somme, type = "b", xlab = "Number of Clusters",
    ylab = "Within-cluster SS / total SS")

In [None]:
# First time index of each of the 7 days (24 points per day)
timeTick <- seq(from = 1, by = 24, length.out = 7)
par(mfrow = c(3, 2))
colours <- c("red", "blue", "green", "orange", "pink", "yellow")

options(repr.plot.width = 20, repr.plot.height = 10)

# One panel per k-means cluster showing its centre as a thick line
for (i in seq_len(6)) {
    plot(reskm$centers[i, ], type = "l", lwd = 3, col = colours[i],
        ylim = c(0, 1), xlab = "Time", ylab = "Loading")
    abline(v = timeTick, lty = "dotted")
}

In [None]:
timeTick <- 1 + 24 * (0:6)  # vector corresponding to the beginning of days
par(mfrow = c(3, 2))

options(repr.plot.width = 20, repr.plot.height = 10)

# One panel per k-means cluster: loading distribution at each time point for
# the stations of that cluster. Rows are selected with reskm$cluster itself, so
# the labels are not drawn as an extra box and no column index is hard-coded;
# drop = FALSE keeps one box per time point for a one-row subset.
# `colours` is expected to hold at least 6 entries (set in an earlier cell).
for (i in seq_len(6)) {
    boxplot(x[reskm$cluster == i, , drop = FALSE], xlab = "Time", ylab = "loading",
        col = colours[i], ylim = c(0, 1))
    abline(v = timeTick, lty = "dotted")
}

In [None]:
# Cluster plot of the k-means partition, using the raw loading matrix as data
fviz_cluster(reskm, data = x)

In [None]:
# Station positions with their k-means label, drawn as a coloured scatter plot
KMclust <- velib$position %>%
    mutate(cluster = reskm$cluster)
ggplot(KMclust, aes(x = longitude, y = latitude, color = factor(cluster))) +
    geom_point()

#### On PCA data

In [None]:
# k-means with K clusters on the first 5 principal-component coordinates
reskmPCA <- kmeans(x = PCA.out$ind$coord[, 1:5], centers = K)

In [None]:
# Elbow curve on the first 5 principal-component coordinates: within-cluster
# sum of squares relative to the total sum of squares, for 1 to 9 clusters.
k_values <- 1:9
pca_scores <- PCA.out$ind$coord[, 1:5]

# Fit k-means once per candidate number of clusters
km_fits <- lapply(k_values, function(k) kmeans(pca_scores, centers = k))
new_k <- vapply(km_fits, function(fit) fit$tot.withinss, numeric(1))

# The total sum of squares depends only on the data, so it is read once
# instead of being accumulated over the nine fits
somme <- km_fits[[1]]$totss

plot(x = k_values, y = new_k / somme, type = "b", xlab = "Number of Clusters",
    ylab = "Within-cluster SS / total SS")

In [None]:
# Refit k-means on the first 5 principal components, then draw the partition
# using the first 2 components as plotting data
reskmPCA <- kmeans(x = PCA.out$ind$coord[, 1:5], centers = K)
fviz_cluster(reskmPCA, data = PCA.out$ind$coord[, 1:2])

In [None]:
timeTick <- 1 + 24 * (0:6)  # vector corresponding to the beginning of days
par(mfrow = c(3, 2))

options(repr.plot.width = 20, repr.plot.height = 10)

# One panel per PCA-based k-means cluster: loading distribution at each time
# point for the stations of that cluster. Rows are selected with
# reskmPCA$cluster itself, so the labels are not drawn as an extra box and no
# column index is hard-coded; drop = FALSE keeps one box per time point for a
# one-row subset.
# `colours` is expected to hold at least 6 entries (set in an earlier cell).
for (i in seq_len(6)) {
    boxplot(x[reskmPCA$cluster == i, , drop = FALSE], xlab = "Time", ylab = "loading",
        col = colours[i], ylim = c(0, 1))
    abline(v = timeTick, lty = "dotted")
}

In [None]:
# Cross-tabulate the raw-data k-means partition (rows) against the PCA-based
# one (columns); they seem to correspond up to a permutation of the labels
tabCompCluster <- table(reskm$cluster, reskmPCA$cluster)
tabCompCluster

# For each PCA-based cluster, the raw-data cluster it overlaps most
# (permutation maximizing the diagonal terms)
clusterPerm <- apply(tabCompCluster, MARGIN = 2, FUN = which.max)
clusterPerm

# Same table after relabelling the PCA-based clusters
table(reskm$cluster, clusterPerm[reskmPCA$cluster])

### Gaussian mixture

In [None]:
library(mclust)

# Gaussian mixture with 6 components and the "VVV" model, fitted on the first
# two principal-component coordinates
nbr_clusters <- 6
gm <- Mclust(PCA.out$ind$coord[, 1:2], modelNames = "VVV", G = nbr_clusters)
summary(gm)

In [None]:
# Classification plot of the fitted Gaussian mixture
plot(x = gm, what = "classification")