In [None]:
import pandas as pd
import sklearn
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
import plotnine
from plotnine import *
import seaborn as sns

In [None]:
# Raw GitHub link to the olympic (decathlon) dataset.
url = 'https://raw.githubusercontent.com/jverwaer/ugain_machine_learning/main/Data/olympic.csv'
df = pd.read_csv(url)
# Show the first rows as a quick sanity check of what was loaded.
df.head()

In [None]:
# Pairwise scatter plots (and per-variable distributions) for every numeric column of df.
sns.pairplot(data=df)

In [None]:
# Scatter-plot matrix built by hand with an explicit double loop.
olympic_array = np.array(df.loc[:, 'm100':'m1500'])
num_var = olympic_array.shape[1]

# squeeze=False keeps `axs` two-dimensional even when there is only one variable,
# so the axs[i, j] indexing below always works.
fig, axs = plt.subplots(num_var, num_var, squeeze=False)

# Side length of one frame in figure coordinates: the figure is divided into a
# num_var x num_var grid (this equals the previously hard-coded 0.1 for 10 variables).
cell_size = 1.0 / num_var

for i in range(num_var):
    for j in range(num_var):
        axs[i, j].scatter(olympic_array[:, i], olympic_array[:, j])
        axs[i, j].axis('off')
        # Black frame drawn in figure coordinates, on top of everything else.
        rect = plt.Rectangle((cell_size*i, cell_size*j), cell_size, cell_size, fill=False, color="k", lw=2, zorder=1000, transform=fig.transFigure, figure=fig)
        fig.patches.append(rect)

plt.tight_layout()
plt.show()

In [None]:
# PCA on the olympic data: the event columns are the features, 'score' is kept aside.
X = df.loc[:, 'm100':'m1500']
y = df.loc[:, 'score']

# Standardise every feature, then fit a PCA with all components
# and project the standardised data onto them.
scaler = StandardScaler()
X = scaler.fit_transform(X)
pca_model = PCA()
x_new = pca_model.fit_transform(X)

In [None]:
def scree_plot(pca_model):
    """Draw a scree plot on the current matplotlib axes.

    Parameters
    ----------
    pca_model : object
        Fitted model exposing ``n_components_`` and ``explained_variance_ratio_``.

    Returns
    -------
    tuple
        Always the empty tuple ``()``.
    """
    component_numbers = np.arange(1, pca_model.n_components_ + 1)  # 1-based component index
    variance_ratios = pca_model.explained_variance_ratio_
    plt.plot(component_numbers, variance_ratios, 'ro-', linewidth=2)
    plt.xlabel('Principal Component')
    plt.ylabel('Proportion of Variance Explained')
    plt.title('Scree Plot')
    return ()

In [None]:
def biplot_func(score, pca_model, labels=None, y=None, show_index=False):
    """Draw a biplot of the first two principal components.

    Parameters
    ----------
    score : array
        Projected observations; columns 0 and 1 are plotted as PC1 and PC2.
    pca_model : object
        Fitted model whose ``components_`` rows 0 and 1 supply the loadings.
    labels : sequence, optional
        One label per original variable; "Var1", "Var2", ... are used when omitted.
    y : array-like, optional
        Values used to colour the points; a colorbar is added when given.
    show_index : bool, default False
        If True, annotate every point with its 1-based position.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    xs, ys = score[:, 0], score[:, 1]

    # One row per original variable, one column per component. Each column is
    # scaled by the std of the matching scores times sqrt(number of points).
    loadings = pca_model.components_[0:2, :].T.copy()
    for col, axis_scores in enumerate((xs, ys)):
        loadings[:, col] = loadings[:, col] * axis_scores.std() * np.sqrt(len(axis_scores))

    fig, ax = plt.subplots()
    points = ax.scatter(xs, ys, c = y)  # y drives the point colour
    if show_index:
        for number, (x_pos, y_pos) in enumerate(zip(xs, ys), start=1):
            ax.text(x_pos, y_pos, number)
    if y is not None:
        fig.colorbar(points, ax=ax)

    # One arrow per original variable, labelled slightly beyond the arrow tip.
    for i, (dx, dy) in enumerate(loadings):
        ax.arrow(0, 0, dx, dy, color = 'r', alpha = 0.5)
        caption = "Var"+str(i+1) if labels is None else labels[i]
        ax.text(1.1*dx, 1.1*dy, caption, color = 'g', ha = 'center', va = 'center')

    # Axis limits cover both the points and the arrows, with a 20 % margin.
    x_low = min(np.min(loadings[:, 0]), np.min(xs))
    x_high = max(np.max(loadings[:, 0]), np.max(xs))
    y_low = min(np.min(loadings[:, 1]), np.min(ys))
    y_high = max(np.max(loadings[:, 1]), np.max(ys))
    ax.set_xlim(1.2*x_low, 1.2*x_high)
    ax.set_ylim(1.2*y_low, 1.2*y_high)
    ax.set_xlabel("PC{}".format(1))
    ax.set_ylabel("PC{}".format(2))
    ax.grid()
    return (fig, ax)

In [None]:
# Biplot of the first two principal components of the olympic data,
# coloured by total score and annotated with the row number of each athlete.
fig, ax = biplot_func(
    x_new[:, :2],
    pca_model,
    labels=df.loc[:, 'm100':'m1500'].columns,
    y=df['score'],
    show_index=True,
)

In [None]:
# Total score per athlete; the bare name on the last line displays the series.
y = df.loc[:, 'score']
y

In [None]:
#scores figure
def scores_plot(x_new, y):
    """Scatter the first two columns of ``x_new``, coloured and labelled by ``y``.

    Parameters
    ----------
    x_new : array
        Projected observations; columns 0 and 1 give the point positions.
    y : array-like
        One value per observation. It is written next to each point and,
        min-max scaled to [0, 1], used as the colour with the 'hot' colormap.
        A pandas Series is read by position, so its index labels are irrelevant.

    Returns
    -------
    tuple
        ``(fig, ax)`` of the created figure.
    """
    fig, ax = plt.subplots()
    pos1 = x_new[:, 0]
    pos2 = x_new[:, 1]

    # Positional view of y: y[i] on a Series looks up the *label* i and fails
    # (or picks the wrong row) whenever the index is not 0..n-1.
    y_values = np.asarray(y)

    # Min-max scale once instead of recomputing min/max for every point.
    # Stays all-zero when y is empty or constant (avoids a division by zero).
    color_score = np.zeros(len(pos1))
    if y_values.size:
        y_min = y_values.min()
        y_range = y_values.max() - y_min
        if y_range != 0:
            color_score = (y_values - y_min) / y_range

    for x_pos, y_pos, label in zip(pos1, pos2, y_values):
        ax.text(x_pos, y_pos, s=label)
    ax.scatter(pos1, pos2, c=color_score, cmap='hot')
    return (fig, ax)

In [None]:
# Score plot of the olympic PCA projection, coloured and labelled by total score.
fig, ax = scores_plot(x_new, df['score'])

In [None]:
# Scree plot for the olympic PCA. The trailing semicolon keeps the notebook from
# echoing the empty tuple returned by scree_plot underneath the figure.
scree_plot(pca_model);

In [None]:
# Same PCA workflow on the iris data shipped with scikit-learn.
iris = datasets.load_iris()
y = iris.target

# Standardise the features (re-fitting the existing scaler on the iris data),
# then fit a fresh PCA and project the data.
X = scaler.fit_transform(iris.data)
pca_model = PCA()
x_new = pca_model.fit_transform(X)

# Biplot using only the first two principal components, coloured by species code.
fig, ax = biplot_func(
    x_new[:, :2],
    pca_model,
    labels=iris.feature_names,
    y=y,
    show_index=False,
)

In [None]:
# Scree plot for the iris PCA. The trailing semicolon keeps the notebook from
# echoing the empty tuple returned by scree_plot underneath the figure.
scree_plot(pca_model);

## t-SNE

In [None]:
# Read the iris data; this CSV file is semicolon-separated.
url = 'https://raw.githubusercontent.com/jverwaer/ugain_machine_learning/main/Data/iris.csv'
df = pd.read_csv(url, sep=";")

# Show the first rows as a quick check of the loaded columns.
df.head()

In [None]:
# Select the numeric measurement columns.
X = np.array(df.loc[:, 'sepal length':'petal width'])

# The iteration count is left at its default of 1000: the `n_iter` keyword was
# renamed to `max_iter` in scikit-learn 1.5 and later removed, so passing it
# explicitly breaks on one side of that change or the other.
# random_state makes the embedding reproducible between runs (t-SNE is stochastic);
# the seed value matches the one used for UMAP further down.
tsne = TSNE(n_components=2, random_state=9000)
points = tsne.fit_transform(X)

In [None]:
# Collect the 2-D t-SNE coordinates in a dataframe and attach the species column
# (aligned on the row index of df).
df_tsne = pd.DataFrame(
    {
        'Dim1': points[:, 0],  # first embedding dimension
        'Dim2': points[:, 1],  # second embedding dimension
    }
).assign(Soort=df['soort'])

In [None]:
# Plot the t-SNE embedding with plotnine, one colour per species.
(
    ggplot(data=df_tsne, mapping=aes(x='Dim1', y='Dim2'))
    + geom_point(mapping=aes(color='Soort'))
)

## UMAP

In de onderstaande sectie wordt het gebruik van UMAP (Uniform Manifold Approximation and Projection) geïllustreerd. UMAP is een 'manifold learning technique', net zoals MDS. UMAP is een niet-lineaire techniek die over het algemeen goed is in het leren van een lokale *en* een globale structuur tegelijk. De computatietijd van UMAP schaalt beter voor grote datasets in vergelijking met MDS. In tegenstelling tot MDS wordt UMAP altijd rechtstreeks op de dataset uitgevoerd (er kan dus geen gebruik gemaakt worden van een afstandsmatrix).

We maken gebruik van de Iris-dataset uit de voorgaande oefeningen. Met behulp van de UMAP-techniek zullen we proberen om de verschillende soorten van elkaar te onderscheiden op basis van de lengte en breedte van de sepalen en petalen.

**Stap 1: inladen UMAP en dataset**

In [None]:
!python -m pip install umap-learn
#er is een andere module met dezelfde naam, dus je moet op deze manier importeren:
import umap.umap_ as umap 

# inlezen data
url='https://raw.githubusercontent.com/jverwaer/ugain_machine_learning/main/Data/iris.csv'

df = pd.read_csv(url, sep = ";")
df.head()

**Stap 2: fit een reducer op de data (cfr. PCA)**

In [None]:
# Fixing random_state ensures that re-running the code
# always gives the same result.
reducer = umap.UMAP(random_state=9000)

# Select the numeric measurement columns as a plain array.
iris_array = np.array(df.loc[:, 'sepal length':'petal width'])

reducer.fit(iris_array)

**Stap 3: gebruik de reducer om de data te transformeren naar 2D-coördinaten**

In [None]:
# Project the iris measurements with the fitted reducer.
embedding = reducer.transform(iris_array)

# Collect the 2-D coordinates in a dataframe and attach the species column
# (aligned on the row index of df).
df_umap = pd.DataFrame(
    {
        'Dim1': embedding[:, 0],  # first embedding dimension
        'Dim2': embedding[:, 1],  # second embedding dimension
    }
).assign(Soort=df['soort'])

**Stap 4: plot de nieuwe coördinaten**

In [None]:
# Plot the UMAP embedding with plotnine, one colour per species.
(
    ggplot(data=df_umap, mapping=aes(x='Dim1', y='Dim2'))
    + geom_point(mapping=aes(color='Soort'))
)

## Extra: UMAP op digits dataset

**Inladen**

In [None]:
# Load the digits dataset that ships with scikit-learn.
from sklearn.datasets import load_digits
digits = load_digits()

In [None]:
# Illustration of the data: the first 400 digit images on a 20 x 20 grid.
fig, ax_array = plt.subplots(20, 20)
axes = ax_array.flatten()
for ax, image in zip(axes, digits.images):
    ax.imshow(image, cmap='gray_r')

# Strip ticks and frames from every panel and pack the grid tightly.
plt.setp(axes, xticks=[], yticks=[], frame_on=False)
plt.tight_layout(h_pad=0.5, w_pad=0.01)

**reducer**

In [None]:
# Fit a seeded UMAP reducer on the flattened digit images (fit returns the reducer itself).
reducer = umap.UMAP(random_state=9000).fit(digits.data)
reducer

**embedding**

In [None]:
# Project the digit data with the fitted reducer.
embedding = reducer.transform(digits.data)
# What does the data look like? Display the shape of the embedding.
embedding.shape

In [None]:
# Collect the 2-D UMAP coordinates and the digit labels in a dataframe.
df_umap = pd.DataFrame(
    {
        'Dim1': embedding[:, 0],  # first embedding dimension
        'Dim2': embedding[:, 1],  # second embedding dimension
        'Getal': digits.target,   # the digit shown in each image
    }
)

**plot**

In [None]:
# Plot the digits embedding with plotnine; the digit value maps to a blue-to-red gradient.
(
    ggplot(data=df_umap, mapping=aes(x='Dim1', y='Dim2'))
    + geom_point(mapping=aes(color='Getal'))
    + scale_color_gradient(low='blue', high='red')
)

In [None]:
# The same embedding drawn with matplotlib, coloured by digit.
plt.scatter(embedding[:, 0], embedding[:, 1], c=digits.target, cmap='Spectral', s=5)
plt.gca().set_aspect('equal', 'datalim')

# Boundaries at -0.5, 0.5, ..., 9.5 give each digit its own colour band,
# with the tick for 0..9 in the middle of its band.
colorbar = plt.colorbar(boundaries=np.arange(11)-0.5)
colorbar.set_ticks(np.arange(10))