# Dimensionality Reduction

In [1]:
import numpy as np
import pandas as pd
import pathlib as pl
from umap import UMAP
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification

In [2]:
# Directory that receives the exported HTML figures: the current working directory.
current_path = pl.Path.cwd()

# Synthetic classification dataset; the fixed random_state keeps the draw repeatable.
X, y = make_classification(
    n_samples=5000,          # number of rows
    n_features=6,            # total number of feature columns
    n_informative=2,         # how many of those features are informative
    n_classes=3,             # number of target classes
    n_clusters_per_class=1,  # a single cluster per class
    random_state=5,          # random seed
)

# 3D scatter of the first three raw feature columns, coloured by class label.
fig = px.scatter_3d(
    x=X[:, 0],
    y=X[:, 1],
    z=X[:, 2],
    color=y,
    opacity=0.8,
)
fig.show()

In [3]:
# Wrap the feature matrix in a DataFrame whose columns are named Feature_1 .. Feature_N.
df = pd.DataFrame(
    X,
    columns=[f'Feature_{col + 1}' for col in range(X.shape[1])],
)
df.head()

Unnamed: 0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6
0,0.764443,-0.605229,-0.824071,0.064053,-0.628055,-0.100303
1,-1.266179,-1.464584,-1.410566,0.662873,2.209847,-2.09973
2,-0.353143,-0.332338,-1.172816,2.12179,0.944769,-1.221912
3,0.114953,-0.009188,-0.83602,-1.686162,0.205629,-0.596427
4,-0.764535,-1.232059,-1.135743,0.175162,1.454019,-1.499716


# PCA (Principal Component Analysis)

In [4]:
# Project the data onto its first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Percentage of the total variance retained by the two components.
# Rule of thumb noted by the author: the explained variance (inertia) should be above 0.80.
total_var = 100 * pca.explained_variance_ratio_.sum()

fig = px.scatter(
    components,
    x=0,
    y=1,
    color=y,
    labels={'0': 'PC 1', '1': 'PC 2'},
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.write_html(current_path / 'Biplot_scatter_with_explained_variance.html')
fig.show()

In [5]:
# Same projection as the 2D cell, but keeping three principal components.
pca = PCA(n_components=3)
components = pca.fit_transform(X)

# Percentage of the total variance retained by the three components.
total_var = 100 * pca.explained_variance_ratio_.sum()

fig = px.scatter_3d(
    components,
    x=0,
    y=1,
    z=2,
    color=y,
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
    title=f'Total Explained Variance: {total_var:.2f}%',
)
fig.write_html(current_path / '3D_scatter_plot_with_explained_variance_3d.html')
fig.show()

In [6]:
# Fit PCA without limiting the number of components so every ratio is available.
pca = PCA()
components = pca.fit_transform(X)

# Running total of the explained-variance ratios, component by component.
exp_var_cumul = pca.explained_variance_ratio_.cumsum()

# Area chart: x is the 1-based component count, y the cumulative explained variance.
fig = px.area(
    x=range(1, len(exp_var_cumul) + 1),
    y=exp_var_cumul,
    labels={"x": "# Components", "y": "Explained Variance"},
)
fig.write_html(current_path / 'PCA_explained_variance_cumulative.html')
fig.show()

In [8]:
# Fit PCA keeping every component so the per-component variance can be compared.
pca = PCA()
components = pca.fit_transform(X)

# One row per component: its 1-based number and its explained-variance ratio.
pca_data = pd.DataFrame({
    'Component': range(1, pca.n_components_ + 1),
    'Explained Variance Ratio': pca.explained_variance_ratio_,
})

# Bar chart of the explained-variance ratio of each component.
fig = px.bar(pca_data, x='Component', y='Explained Variance Ratio', title='PCA Explained Variance')

# Customize the axes.
fig.update_layout(
    xaxis_title='PCA Features',
    yaxis_title='Variance %',
    # tickvals/ticktext only take effect in 'array' mode; the previous 'linear'
    # mode contradicted the explicit tick lists supplied below.
    xaxis_tickmode='array',
    xaxis_tickvals=pca_data['Component'].tolist(),               # one tick per component
    xaxis_ticktext=pca_data['Component'].astype(str).tolist(),   # labelled with its number
    # The plotted values are ratios in [0, 1]; render them as percentages so the
    # ticks agree with the 'Variance %' axis title.
    yaxis_tickformat='.0%',
)

# Save the plot as an HTML file
fig.write_html(current_path.joinpath('PCA_explained_variance_bar.html'))

# Display the plot
fig.show()

In [9]:
# Names of the six feature columns, in column order (Feature_1 .. Feature_6).
features = [f'Feature_{number}' for number in range(1, 7)]

In [12]:
# Two-component PCA whose score plot is overlaid with the feature loadings.
n_components = 2
pca = PCA(n_components)
components = pca.fit_transform(X)
total_var = 100 * pca.explained_variance_ratio_.sum()

# Loadings: transposed component weights scaled by the square root of each
# component's explained variance, then multiplied by 3 (presumably a visual
# stretch factor so the vectors stand out on the score plot).
loadings = pca.components_.T * np.sqrt(pca.explained_variance_) * 3

# Axis titles that include each component's share of the variance.
labels = {
    str(idx): f"PC {idx+1} ({pct:.1f}%)"
    for idx, pct in enumerate(100 * pca.explained_variance_ratio_)
}

fig = px.scatter(
    components,
    x=0,
    y=1,
    color=y,
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}% with Loadings',
)

# One line from the origin to each feature's loading, labelled with the feature name.
for i, feature in enumerate(features):
    tip_x, tip_y = loadings[i, 0], loadings[i, 1]
    fig.add_shape(type='line', x0=0, y0=0, x1=tip_x, y1=tip_y)
    fig.add_annotation(
        text=feature,
        x=tip_x,
        y=tip_y,
        ax=0,
        ay=0,
        xanchor="center",
        yanchor="bottom",
    )

fig.write_html(current_path / 'PCA_loadings.html')
fig.show()

# NOTE: this plot may be problematic because the dataset is normalized; check whether that is the case.

# t-SNE (t-Distributed Stochastic Neighbor Embedding)

https://www.datacamp.com/tutorial/introduction-t-sne

https://datarefiner.com/feed/why-tda

In [None]:
# Embed the data in two dimensions with t-SNE, using a fixed seed.
tsne = TSNE(
    random_state=42,
    n_components=2,
)
X_tsne = tsne.fit_transform(X)

# Kullback-Leibler (KL) divergence reported by the fitted t-SNE model.
tsne.kl_divergence_

In [None]:
# Scatter of the two t-SNE coordinates, coloured by class label.
# update_layout returns the figure, so the title and axis names are set in the same expression.
fig = px.scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    color=y,
).update_layout(
    title="t-SNE visualization of Custom Classification dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

In [None]:
# Sweep the t-SNE perplexity from 5 to 50 in steps of 5 and record the KL
# divergence of each fitted model.
perplexity = np.arange(5, 55, 5)
divergence = []

for i in perplexity:
    # A fixed random_state makes the divergence curve reproducible between runs
    # and keeps this cell consistent with the other (seeded) t-SNE cells.
    model = TSNE(n_components=2, init="pca", perplexity=i, random_state=42)
    # Only kl_divergence_ is needed here, so the embedding itself is not kept.
    model.fit(X)
    divergence.append(model.kl_divergence_)

# Divergence as a function of perplexity.
fig = px.line(x=perplexity, y=divergence, markers=True)
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
fig.update_traces(line_color="red", line_width=1)
fig.show()

In [None]:
# Refit the two-dimensional t-SNE embedding with perplexity 40 and a fixed seed.
tsne = TSNE(
    perplexity=40,
    random_state=42,
    n_components=2,
)
X_tsne = tsne.fit_transform(X)

In [None]:
# Scatter of the current t-SNE embedding (X_tsne), coloured by class label.
fig = px.scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    color=y,
).update_layout(
    title="t-SNE visualization of Custom Classification dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

# UMAP (Uniform Manifold Approximation and Projection)

In [None]:
# Embed the data in two dimensions with UMAP, using a fixed seed.
umap = UMAP(
    random_state=42,
    n_components=2,
)
X_umap = umap.fit_transform(X)

In [None]:
# Scatter of the two UMAP coordinates, coloured by class label.
fig = px.scatter(
    x=X_umap[:, 0],
    y=X_umap[:, 1],
    color=y,
).update_layout(
    title="UMAP visualization of Custom Classification dataset",
    xaxis_title="First UMAP",
    yaxis_title="Second UMAP",
)
fig.show()

In [None]:
# Candidate n_neighbors values: 5, 10, ..., 50.
n_neighbors = np.arange(5, 55, 5)

# Fit one seeded UMAP embedding per n_neighbors value and show each as its own scatter.
for i in n_neighbors:
    umap = UMAP(
        n_neighbors=i,
        n_components=2,
        random_state=42,
    )
    X_umap = umap.fit_transform(X)

    fig = px.scatter(
        x=X_umap[:, 0],
        y=X_umap[:, 1],
        color=y,
    ).update_layout(
        title=f"UMAP visualization with n_neighbors = {i}",
        xaxis_title="First UMAP component",
        yaxis_title="Second UMAP component",
    )
    fig.show()

In [None]:
# Candidate min_dist values: 0.0 up to (but excluding) 1.0 in steps of 0.1.
min_dist_values = np.arange(0.0, 1.0, 0.1)

# Fit one seeded UMAP embedding per min_dist value and show each as its own scatter.
for min_dist in min_dist_values:
    umap = UMAP(
        min_dist=min_dist,
        n_components=2,
        random_state=42,
    )
    X_umap = umap.fit_transform(X)

    fig = px.scatter(
        x=X_umap[:, 0],
        y=X_umap[:, 1],
        color=y,
    ).update_layout(
        title=f"UMAP visualization with min_dist = {min_dist:.1f}",
        xaxis_title="First UMAP component",
        yaxis_title="Second UMAP component",
    )
    fig.show()

In [None]:
# Grouped bar chart helper: planned vs. achieved percentages for one objective.
def create_bar_chart_with_colors(data, objective_number, color_planeado, color_alcanzado):
    """Show a grouped bar chart of planned vs. achieved percentages for one objective.

    Parameters
    ----------
    data
        Table that is filtered with a boolean mask on its 'Objetivo' column and
        read through the '% Planeado', '% Alcanzado' and 'Indicadores planeados'
        columns (presumably a pandas DataFrame; confirm against the caller).
    objective_number
        Value of 'Objetivo' selecting the rows to plot.
    color_planeado
        Color passed to the 'Planeado' bars.
    color_alcanzado
        Color passed to the 'Alcanzado' bars.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Keep only the rows that belong to the requested objective.
    rows = data[data['Objetivo'] == objective_number]

    # x positions: planned bars at 0..n-1, achieved bars shifted right by one
    # bar width, tick labels centred between the two bars of each pair.
    bar_width = 0.35
    planned_x = list(range(len(rows)))
    achieved_x = [x + bar_width for x in planned_x]
    tick_x = [x + bar_width / 2 for x in planned_x]

    fig, ax = plt.subplots()
    ax.bar(planned_x, rows['% Planeado'], bar_width,
           label='Planeado', color=color_planeado)
    ax.bar(achieved_x, rows['% Alcanzado'], bar_width,
           label='Alcanzado', color=color_alcanzado)

    # Axis labels, title, tick labels and legend.
    ax.set_xlabel('Indicadores')
    ax.set_ylabel('Porcentajes')
    ax.set_title(f'Comparación de Indicadores Planeados vs Alcanzados para Objetivo {objective_number}')
    ax.set_xticks(tick_x)
    ax.set_xticklabels(rows['Indicadores planeados'])
    ax.legend()

    # Render the chart.
    plt.show()

# Colors for the 'Planeado' and 'Alcanzado' bars
color_planeado = 'skyblue'
color_alcanzado = 'orange'

# `unique_objectives` was never assigned anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. Derive it from the 'Objetivo' column when
# `df` has one; otherwise skip the charts instead of failing (the `df` built
# earlier in this notebook only holds Feature_* columns).
if 'Objetivo' in df.columns:
    unique_objectives = df['Objetivo'].unique()
else:
    unique_objectives = []
    print("No 'Objetivo' column in df; skipping the planned-vs-achieved bar charts.")

# One bar chart per unique objective in the dataframe
for objective in unique_objectives:
    create_bar_chart_with_colors(df, objective, color_planeado, color_alcanzado)
