#Task 1: PCA (Principal Component Analysis)

In [None]:
from sklearn.datasets import load_wine
import pandas as pd

wine = load_wine()
df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
df['target'] = wine.target
df.head()


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [None]:
from sklearn.preprocessing import StandardScaler

# Separate features from the target
X = df.drop('target', axis=1)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
#Standardization is necessary to ensure that each feature contributes equally to the analysis.

In [None]:
from sklearn.decomposition import PCA

# Apply PCA and choose the number of components
pca = PCA(n_components=13)
pca.fit(X_scaled)

# Transform the data to the new PCA space
X_pca = pca.transform(X_scaled)

# Display the explained variance ratio for each component
pca.explained_variance_ratio_


array([0.36198848, 0.1920749 , 0.11123631, 0.0706903 , 0.06563294,
       0.04935823, 0.04238679, 0.02680749, 0.02222153, 0.01930019,
       0.01736836, 0.01298233, 0.00795215])

In [None]:
#PCA loadings indicate how much each feature contributes to each principal component.

import numpy as np

# Compute loadings
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
loadings_df = pd.DataFrame(loadings, index=wine.feature_names, columns=[f'PC{i+1}' for i in range(13)])
loadings_df


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13
alcohol,0.313977,0.766413,-0.250087,-0.017166,-0.246087,0.171534,-0.041982,0.234515,-0.274141,0.106292,-0.107652,-0.109703,0.004827
malic_acid,-0.533385,0.356434,0.107342,0.516132,0.032619,0.43122,0.313041,0.03897,0.040577,-0.155255,0.036446,0.050136,0.008372
ash,-0.004462,0.500855,0.755176,-0.205895,-0.132486,0.124089,-0.111044,-0.100794,0.165845,-0.013625,-0.237633,-0.020443,-0.045533
alcalinity_of_ash,-0.520622,-0.016782,0.73812,0.058506,0.061232,-0.080992,-0.213622,0.253359,-0.108041,0.026522,0.228399,-0.022965,0.029561
magnesium,0.308892,0.474812,0.157682,-0.338195,0.673472,0.030641,0.240357,-0.092566,-0.146284,0.034092,0.03397,0.025633,0.018306
total_phenols,0.858552,0.103064,0.17628,0.19041,-0.138315,-0.067575,-0.020788,-0.240314,-0.15417,-0.160807,0.145022,-0.125192,-0.149578
flavanoids,0.920058,-0.005324,0.18171,0.146406,-0.100992,-0.015198,-0.045174,-0.11085,-0.026722,-0.081953,-0.012244,-0.017673,0.268346
nonflavanoid_phenols,-0.649434,0.045605,0.20545,-0.19544,-0.463806,-0.207727,0.443255,-0.138105,-0.105374,0.108266,0.055702,0.017448,0.03677
proanthocyanins,0.68184,0.062279,0.18023,0.383627,0.126775,-0.428795,0.277023,0.217991,0.112727,0.067403,-0.113106,-0.039365,-0.037698
color_intensity,-0.192778,0.839852,-0.16558,0.063377,-0.070804,-0.336295,-0.16951,-0.020008,-0.030301,-0.146061,0.015172,0.248924,-0.003867


In [None]:
# Display the important features for each component
loadings_df.abs().sort_values(by='PC1', ascending=False).head()


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13
flavanoids,0.920058,0.005324,0.18171,0.146406,0.100992,0.015198,0.045174,0.11085,0.026722,0.081953,0.012244,0.017673,0.268346
total_phenols,0.858552,0.103064,0.17628,0.19041,0.138315,0.067575,0.020788,0.240314,0.15417,0.160807,0.145022,0.125192,0.149578
od280/od315_of_diluted_wines,0.818321,0.260667,0.200188,0.177002,0.093706,0.213557,0.033322,0.04624,0.073964,0.263065,0.022121,0.24758,0.050531
proanthocyanins,0.68184,0.062279,0.18023,0.383627,0.126775,0.428795,0.277023,0.217991,0.112727,0.067403,0.113106,0.039365,0.037698
nonflavanoid_phenols,0.649434,0.045605,0.20545,0.19544,0.463806,0.207727,0.443255,0.138105,0.105374,0.108266,0.055702,0.017448,0.03677


In [None]:
import plotly.express as px
import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(
    z=loadings_df.values,
    x=loadings_df.columns,
    y=loadings_df.index,
    colorscale='Viridis',
    showscale=True))

fig.update_layout(
    title='PCA Loadings Heatmap',
    xaxis_nticks=36,
    height=600,
    width=800)

fig.show()


A scree plot visualizes the explained variance of each principal component.

In [None]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1, len(explained_variance) + 1)),
    y=explained_variance,
    mode='lines+markers',
    name='Explained Variance'
))

fig.update_layout(
    title='Scree Plot',
    xaxis_title='Principal Component',
    yaxis_title='Explained Variance Ratio',
    height=600,
    width=800
)

fig.show()


#Plot Cumulative Explained Variance
This shows how much variance is explained cumulatively by the components.

In [None]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1, len(cumulative_explained_variance) + 1)),
    y=cumulative_explained_variance,
    mode='lines+markers',
    name='Cumulative Explained Variance'
))

fig.update_layout(
    title='Cumulative Explained Variance',
    xaxis_title='Number of Principal Components',
    yaxis_title='Cumulative Explained Variance',
    height=600,
    width=800
)

fig.show()


#Results




PCA Loadings Heatmap: The heatmap above visualizes the loadings, showing how much each feature contributes to each principal component. Features with higher absolute values are more influential for that component. For example, flavanoids and alcohol have significant contributions to the first few components.

Scree Plot: This plot shows the explained variance ratio for each principal component. The first two components capture most of the variance, as indicated by the sharp drop after PC2.

Cumulative Explained Variance: The cumulative plot demonstrates that around 80% of the variance is captured by the first 3 components. This suggests that a reduction to three dimensions retains most of the original information.



The first principal component (PC1) is heavily influenced by flavanoids, proline, and alcohol, which are important for distinguishing between wine types.

Based on the scree plot, we can reduce the dataset to 2 or 3 dimensions without losing too much information.

The cumulative variance plot confirms that 3 components are sufficient to explain around 80% of the data's variance, allowing effective dimensionality reduction while retaining key information. ​

#Task 2: t-SNE (t-Distributed Stochastic Neighbor Embedding)

**Find Kullback-Leibler (KL) Divergence and Perplexity**

In t-SNE, perplexity is a parameter that controls the balance between local and global aspects of the data when projecting to a lower-dimensional space. The KL divergence is a measure of how well the projected low-dimensional space approximates the original data distribution.

Perplexity is generally chosen between 5 and 50. We'll define it during t-SNE execution and observe the KL divergence after performing the t-SNE transformation.

In [36]:
import numpy as np
from sklearn.manifold import TSNE
import plotly.graph_objects as go

X = data.data
y = data.target

# Analyze KL Divergence and Perplexity
perplexities = np.arange(3, 51, 3)  # Perplexity values to test
kl_divergences = []

for perplexity in perplexities:
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, n_iter=1000)
    tsne_result = tsne.fit_transform(X)

    # Compute KL Divergence
    kl_divergence = tsne.kl_divergence_
    kl_divergences.append(kl_divergence)

fig_kl = go.Figure()
fig_kl.add_trace(go.Scatter(x=perplexities, y=kl_divergences, mode='lines+markers', name='KL Divergence'))
fig_kl.update_layout(title='KL Divergence vs Perplexity',
                     xaxis_title='Perplexity',
                     yaxis_title='KL Divergence',
                     showlegend=True)
fig_kl.show()


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.


'n_iter' was renamed to 'max_iter' in v

# from this plot we can see that the curve stabilizes around value of 21 , so that would be optimal perplexity value

In [37]:
selected_perplexity = 21

# Perform t-SNE with the selected perplexity
tsne_final = TSNE(n_components=2, perplexity=selected_perplexity, random_state=42)
tsne_result_final = tsne_final.fit_transform(X)

fig_tsne = go.Figure()
for i in np.unique(y):
    fig_tsne.add_trace(go.Scatter(
        x=tsne_result_final[y == i, 0],
        y=tsne_result_final[y == i, 1],
        mode='markers',
        name=data.target_names[i],
        marker=dict(size=10)
    ))

fig_tsne.update_layout(title='t-SNE Visualization with Perplexity = {}'.format(selected_perplexity),
                        xaxis_title='t-SNE Component 1',
                        yaxis_title='t-SNE Component 2',
                        showlegend=True)
fig_tsne.show()

In [None]:
import plotly.express as px

fig = px.scatter(df_tsne, x='TSNE1', y='TSNE2', color=df['target'].astype(str),
                 title='t-SNE Visualization of Wine Dataset',
                 labels={'color': 'Wine Class'},
                 height=600, width=800)

fig.show()


#Explanation of Results:
KL Divergence: This value measures how much information is lost during the dimensionality reduction process. Lower KL divergence indicates that the t-SNE projection is more faithful to the original data distribution.

Perplexity: We set the perplexity to 21, which is a good balance between considering local and global structure.

Clustering: The t-SNE plot will show how well the data points are clustered. If the wine classes form distinct clusters in the plot, it suggests that t-SNE has effectively preserved the structure of the data in lower dimensions.

#**Task 3: Compare PCA and t-SNE**

PCA will show the first two principal components, which explain the largest portion of the variance in the dataset, while t-SNE shows the low-dimensional representation using its non-linear transformation.

In [38]:
# Perform PCA to reduce the data to 2 dimensions for comparison
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Create a DataFrame for PCA results
df_pca_2d = pd.DataFrame(X_pca_2d, columns=['PCA1', 'PCA2'])
df_pca_2d['target'] = df['target']

# Plot both PCA and t-SNE side by side using Plotly
import plotly.subplots as sp

# Create subplots: 1 row, 2 columns
fig = sp.make_subplots(rows=1, cols=2, subplot_titles=("PCA", "t-SNE"))

# Add PCA plot
fig.add_trace(px.scatter(df_pca_2d, x='PCA1', y='PCA2', color=df_pca_2d['target'].astype(str)).data[0], row=1, col=1)

# Add t-SNE plot
fig.add_trace(px.scatter(df_tsne, x='TSNE1', y='TSNE2', color=df_tsne['target'].astype(str)).data[0], row=1, col=2)

# Update layout
fig.update_layout(height=600, width=1000, title_text="Comparison of PCA and t-SNE Visualizations")
fig.show()


 # Compare Explained Variance (PCA vs t-SNE)

Visualizations:

PCA: The plot from PCA shows that the first two components explain the majority of the variance in the data. However, PCA tends to work better with linear relationships, so the clusters may not be well-separated if the data is not linearly separable.

t-SNE: The t-SNE plot, on the other hand, is likely to show better-separated clusters, even for non-linear relationships, as it tries to preserve the local neighborhood of data points. This is why t-SNE is often preferred for visualizing high-dimensional data.

Variance Explained:

PCA: PCA's strength lies in its ability to explain variance. If the first two components explain a high percentage of the variance (say, 70-80%), you can be confident that the dimensionality reduction is retaining most of the data's information.

t-SNE: While t-SNE doesn't have a concept of explained variance, it excels at separating data into well-defined clusters, especially when the data structure is non-linear. However, t-SNE can be sensitive to parameters like perplexity, and its results can be harder to interpret quantitatively.

In [None]:
# Calculate and display cumulative explained variance for the first two PCA components
explained_variance_pca = np.sum(pca_2d.explained_variance_ratio_)
explained_variance_pca


0.5540633835693526

#Conclusion:
PCA: Great for linear dimensionality reduction and interpreting variance explained. It can be used to understand how much information is retained.

t-SNE: Excellent for visualizing complex, non-linear relationships in the data, making it useful for clustering. t-SNE is often better at showing groupings, but it doesn't explain variance in the same way as PCA.

#**Task 4: Building a Simple Neural Network**

In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical

wine = load_wine()
X = wine.data
y = wine.target

# One-hot encode the target labels
y = to_categorical(y, num_classes=3)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


Build a Neural Network

In [None]:
from tensorflow.keras.layers import Input

# Build the neural network
model = Sequential()

# Input layer using Input() followed by a Dense hidden layer
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))

# Add a second hidden layer
model.add(Dense(32, activation='relu'))

# Add an output layer with 3 neurons (one for each class) and softmax activation
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Training the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=2)

# Evaluating the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')


Epoch 1/50
4/4 - 2s - 586ms/step - accuracy: 0.4336 - loss: 1.0343 - val_accuracy: 0.5517 - val_loss: 0.9474
Epoch 2/50
4/4 - 0s - 29ms/step - accuracy: 0.5044 - loss: 0.9060 - val_accuracy: 0.7586 - val_loss: 0.8209
Epoch 3/50
4/4 - 0s - 27ms/step - accuracy: 0.6814 - loss: 0.7954 - val_accuracy: 0.8276 - val_loss: 0.7057
Epoch 4/50
4/4 - 0s - 27ms/step - accuracy: 0.8142 - loss: 0.6868 - val_accuracy: 0.8966 - val_loss: 0.6049
Epoch 5/50
4/4 - 0s - 33ms/step - accuracy: 0.9381 - loss: 0.5916 - val_accuracy: 0.8966 - val_loss: 0.5136
Epoch 6/50
4/4 - 0s - 37ms/step - accuracy: 0.9646 - loss: 0.5059 - val_accuracy: 0.9655 - val_loss: 0.4340
Epoch 7/50
4/4 - 0s - 25ms/step - accuracy: 0.9646 - loss: 0.4272 - val_accuracy: 1.0000 - val_loss: 0.3649
Epoch 8/50
4/4 - 0s - 23ms/step - accuracy: 0.9646 - loss: 0.3582 - val_accuracy: 1.0000 - val_loss: 0.3053
Epoch 9/50
4/4 - 0s - 39ms/step - accuracy: 0.9735 - loss: 0.3003 - val_accuracy: 1.0000 - val_loss: 0.2549
Epoch 10/50
4/4 - 0s - 36ms

In [None]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Convert predictions to class labels
predicted_classes = y_pred.argmax(axis=1)

# Convert one-hot encoded y_test back to class labels
true_classes = y_test.argmax(axis=1)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


#Confusion Matrix:
The confusion matrix shows the true vs predicted classifications. It allows us to see where the model is making mistakes and how well it differentiates between classes.

In [39]:
cm = confusion_matrix(true_classes, predicted_classes)

# Convert to DataFrame for better handling
cm_df = pd.DataFrame(cm,
                     index=[f'True {i}' for i in range(cm.shape[0])],  # True labels
                     columns=[f'Pred {i}' for i in range(cm.shape[1])])  # Predicted labels

# Create an interactive confusion matrix plot
fig = px.imshow(cm_df,
                text_auto=True,
                color_continuous_scale='Blues',
                labels=dict(x="Predicted", y="True", color="Count"),
                title="Confusion Matrix")

# Update layout for better readability
fig.update_xaxes(title_text="Predicted")
fig.update_yaxes(title_text="True")
fig.show()

#**Task 5: Gradient Descent and Backpropagation**

# **Gradient Descent** is an optimization algorithm used to minimize the loss function (also known as the cost function or error function) in machine learning models, including neural networks. Here's how it works:
Goal: The primary objective of gradient descent is to find the set of weights (or parameters) in a neural network that minimizes the loss function. The loss function quantifies how far the network's predictions are from the actual target values.




> example from my deep learning professor in LTU ❤:

Gradient Descent is an optimization technique used in machine learning to help models minimize their error or loss. The goal is to find the "best" values for a model's parameters (or weights) that result in the lowest possible error when making predictions.

Imagine you’re hiking down a hill and trying to reach the lowest point, which represents the minimum error. Gradient descent is like taking small steps in the direction that will lead you downhill. Each step is based on the steepness (gradient) of the slope at your current position. If the slope is steep, it means you're far from the minimum, and you adjust your step accordingly. If it's flat, you're closer to the bottom.

The size of each step is controlled by a parameter called the learning rate. If the steps are too big (high learning rate), you might overshoot the minimum and bounce around. If they're too small (low learning rate), it can take forever to get there.

In neural networks, gradient descent updates the weights based on how much they contribute to the error. It keeps adjusting them in small steps until the error is minimized.


# **Backpropagation** is the method used to compute how much each weight in the network should be changed, ensuring that gradient descent knows which direction to move to reduce the error.

>Backpropagation is the process that powers gradient descent in neural networks. It’s like the engine under the hood that calculates how each weight in the network should be adjusted.


When you make a prediction with a neural network, the data flows from the input layer through the hidden layers to the output. This is called the **forward pass**. After the network makes a prediction, we measure how far off the prediction is by calculating the error or loss.

Backpropagation comes into play next, during the **backward pass**. It’s responsible for figuring out which weights caused the error and how much they need to be changed. It does this by moving backward through the network, layer by layer, calculating the gradient of the error with respect to each weight using a mathematical technique called the chain rule.

Once backpropagation has calculated these gradients, they are used by gradient descent to update the weights, reducing the overall error. Think of backpropagation as a way for the neural network to learn from its mistakes, gradually improving its predictions over time.

#**Task 6: Activation Functions**

The Sigmoid activation function squashes values to the range (0, 1), making it useful for binary classification problems but not ideal for deeper networks due to vanishing gradients.

In [None]:
# Build the neural network with Sigmoid activation in the hidden layers
model_sigmoid = Sequential()

# Input layer followed by Sigmoid hidden layers
model_sigmoid.add(Input(shape=(X_train.shape[1],)))
model_sigmoid.add(Dense(64, activation='sigmoid'))
model_sigmoid.add(Dense(32, activation='sigmoid'))

# Output layer with softmax activation (3 classes for classification)
model_sigmoid.add(Dense(3, activation='softmax'))

# Compile the model
model_sigmoid.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history_sigmoid = model_sigmoid.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=2)

# Evaluate the model on the test set
test_loss_sigmoid, test_accuracy_sigmoid = model_sigmoid.evaluate(X_test, y_test)
print(f'Sigmoid Test Accuracy: {test_accuracy_sigmoid:.4f}')


Epoch 1/50
4/4 - 1s - 362ms/step - accuracy: 0.2832 - loss: 1.2520 - val_accuracy: 0.2759 - val_loss: 1.1933
Epoch 2/50
4/4 - 0s - 16ms/step - accuracy: 0.3097 - loss: 1.1713 - val_accuracy: 0.3103 - val_loss: 1.1240
Epoch 3/50
4/4 - 0s - 16ms/step - accuracy: 0.4336 - loss: 1.1060 - val_accuracy: 0.4828 - val_loss: 1.0760
Epoch 4/50
4/4 - 0s - 15ms/step - accuracy: 0.4336 - loss: 1.0622 - val_accuracy: 0.3793 - val_loss: 1.0428
Epoch 5/50
4/4 - 0s - 16ms/step - accuracy: 0.4071 - loss: 1.0328 - val_accuracy: 0.4483 - val_loss: 1.0192
Epoch 6/50
4/4 - 0s - 16ms/step - accuracy: 0.4602 - loss: 1.0136 - val_accuracy: 0.5862 - val_loss: 1.0005
Epoch 7/50
4/4 - 0s - 18ms/step - accuracy: 0.5575 - loss: 0.9963 - val_accuracy: 0.6207 - val_loss: 0.9831
Epoch 8/50
4/4 - 0s - 13ms/step - accuracy: 0.6195 - loss: 0.9794 - val_accuracy: 0.6552 - val_loss: 0.9651
Epoch 9/50
4/4 - 0s - 15ms/step - accuracy: 0.6637 - loss: 0.9613 - val_accuracy: 0.6897 - val_loss: 0.9456
Epoch 10/50
4/4 - 0s - 41ms

#**Tanh Activation Function in the Hidden Layer**

The Tanh activation function outputs values in the range (-1, 1). It tends to work better than Sigmoid for deeper networks because it has a broader output range and helps reduce the vanishing gradient problem.

In [None]:
model_tanh = Sequential()

# Input layer followed by Tanh hidden layers
model_tanh.add(Input(shape=(X_train.shape[1],)))
model_tanh.add(Dense(64, activation='tanh'))
model_tanh.add(Dense(32, activation='tanh'))

# Output layer with softmax activation (3 classes for classification)
model_tanh.add(Dense(3, activation='softmax'))

# Compile the model
model_tanh.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history_tanh = model_tanh.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=2)

# Evaluate the model on the test set
test_loss_tanh, test_accuracy_tanh = model_tanh.evaluate(X_test, y_test)
print(f'Tanh Test Accuracy: {test_accuracy_tanh:.4f}')


Epoch 1/50
4/4 - 3s - 788ms/step - accuracy: 0.6903 - loss: 0.8304 - val_accuracy: 0.7586 - val_loss: 0.7421
Epoch 2/50
4/4 - 0s - 36ms/step - accuracy: 0.8053 - loss: 0.6435 - val_accuracy: 0.7931 - val_loss: 0.5753
Epoch 3/50
4/4 - 0s - 69ms/step - accuracy: 0.8673 - loss: 0.5098 - val_accuracy: 0.8276 - val_loss: 0.4549
Epoch 4/50
4/4 - 0s - 31ms/step - accuracy: 0.8938 - loss: 0.4115 - val_accuracy: 0.8966 - val_loss: 0.3673
Epoch 5/50
4/4 - 0s - 94ms/step - accuracy: 0.9646 - loss: 0.3396 - val_accuracy: 0.9310 - val_loss: 0.3019
Epoch 6/50
4/4 - 0s - 119ms/step - accuracy: 0.9646 - loss: 0.2891 - val_accuracy: 0.9655 - val_loss: 0.2542
Epoch 7/50
4/4 - 0s - 78ms/step - accuracy: 0.9646 - loss: 0.2490 - val_accuracy: 0.9655 - val_loss: 0.2173
Epoch 8/50
4/4 - 0s - 64ms/step - accuracy: 0.9646 - loss: 0.2177 - val_accuracy: 0.9655 - val_loss: 0.1902
Epoch 9/50
4/4 - 0s - 67ms/step - accuracy: 0.9646 - loss: 0.1928 - val_accuracy: 1.0000 - val_loss: 0.1682
Epoch 10/50
4/4 - 0s - 58m

#**ReLU Activation Function in the Hidden Layer**
We already used the ReLU (Rectified Linear Unit) activation function in Task 4. ReLU outputs 0 for negative values and the input value itself for positive values. It’s the most widely used activation function in deep learning because it helps reduce the vanishing gradient problem and speeds up convergence.

In [None]:
# This is from Task 4 using ReLU
model_relu = Sequential()

model_relu.add(Input(shape=(X_train.shape[1],)))
model_relu.add(Dense(64, activation='relu'))
model_relu.add(Dense(32, activation='relu'))

model_relu.add(Dense(3, activation='softmax'))

model_relu.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

history_relu = model_relu.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=2)

test_loss_relu, test_accuracy_relu = model_relu.evaluate(X_test, y_test)
print(f'ReLU Test Accuracy: {test_accuracy_relu:.4f}')


Epoch 1/50
4/4 - 3s - 727ms/step - accuracy: 0.1504 - loss: 1.2575 - val_accuracy: 0.2414 - val_loss: 1.1527
Epoch 2/50
4/4 - 0s - 71ms/step - accuracy: 0.3097 - loss: 1.1201 - val_accuracy: 0.5862 - val_loss: 1.0270
Epoch 3/50
4/4 - 0s - 57ms/step - accuracy: 0.5929 - loss: 0.9949 - val_accuracy: 0.7241 - val_loss: 0.9164
Epoch 4/50
4/4 - 0s - 48ms/step - accuracy: 0.7522 - loss: 0.8837 - val_accuracy: 0.7586 - val_loss: 0.8199
Epoch 5/50
4/4 - 0s - 27ms/step - accuracy: 0.8761 - loss: 0.7860 - val_accuracy: 0.8276 - val_loss: 0.7359
Epoch 6/50
4/4 - 0s - 30ms/step - accuracy: 0.9027 - loss: 0.6960 - val_accuracy: 0.8621 - val_loss: 0.6615
Epoch 7/50
4/4 - 0s - 56ms/step - accuracy: 0.9292 - loss: 0.6168 - val_accuracy: 0.8966 - val_loss: 0.5959
Epoch 8/50
4/4 - 0s - 63ms/step - accuracy: 0.9381 - loss: 0.5473 - val_accuracy: 0.8966 - val_loss: 0.5376
Epoch 9/50
4/4 - 0s - 62ms/step - accuracy: 0.9558 - loss: 0.4845 - val_accuracy: 0.8966 - val_loss: 0.4862
Epoch 10/50
4/4 - 0s - 37ms

#Compare the Performance of the Models

In [None]:
print(f'ReLU Test Accuracy: {test_accuracy_relu:.4f}')
print(f'Sigmoid Test Accuracy: {test_accuracy_sigmoid:.4f}')
print(f'Tanh Test Accuracy: {test_accuracy_tanh:.4f}')


ReLU Test Accuracy: 1.0000
Sigmoid Test Accuracy: 1.0000
Tanh Test Accuracy: 1.0000


In [40]:
import plotly.graph_objects as go

# Create a figure
fig = go.Figure()

# ReLU
fig.add_trace(go.Scatter(
    x=list(range(1, len(history_relu.history['accuracy']) + 1)),
    y=history_relu.history['accuracy'],
    mode='lines+markers',
    name='ReLU - Training Accuracy',
    line=dict(color='blue'),
    marker=dict(size=6)
))
fig.add_trace(go.Scatter(
    x=list(range(1, len(history_relu.history['val_accuracy']) + 1)),
    y=history_relu.history['val_accuracy'],
    mode='lines+markers',
    name='ReLU - Validation Accuracy',
    line=dict(color='lightblue', dash='dash'),
    marker=dict(size=6)
))

# Sigmoid
fig.add_trace(go.Scatter(
    x=list(range(1, len(history_sigmoid.history['accuracy']) + 1)),
    y=history_sigmoid.history['accuracy'],
    mode='lines+markers',
    name='Sigmoid - Training Accuracy',
    line=dict(color='green'),
    marker=dict(size=6)
))
fig.add_trace(go.Scatter(
    x=list(range(1, len(history_sigmoid.history['val_accuracy']) + 1)),
    y=history_sigmoid.history['val_accuracy'],
    mode='lines+markers',
    name='Sigmoid - Validation Accuracy',
    line=dict(color='lightgreen', dash='dash'),
    marker=dict(size=6)
))

# Tanh
fig.add_trace(go.Scatter(
    x=list(range(1, len(history_tanh.history['accuracy']) + 1)),
    y=history_tanh.history['accuracy'],
    mode='lines+markers',
    name='Tanh - Training Accuracy',
    line=dict(color='orange'),
    marker=dict(size=6)
))
fig.add_trace(go.Scatter(
    x=list(range(1, len(history_tanh.history['val_accuracy']) + 1)),
    y=history_tanh.history['val_accuracy'],
    mode='lines+markers',
    name='Tanh - Validation Accuracy',
    line=dict(color='gold', dash='dash'),
    marker=dict(size=6)
))

# Update layout
fig.update_layout(
    title='Training and Validation Accuracy with Different Activation Functions',
    xaxis_title='Epochs',
    yaxis_title='Accuracy',
    legend_title='Activation Functions',
    hovermode='x unified',  # Hover mode for better clarity
    template='plotly_white'
)

# Show the plot
fig.show()



#**Conclusion**
**ReLU** often performs better in practice due to its simplicity and effectiveness in dealing with deeper networks.

**Sigmoid and Tanh** are less commonly used in modern deep networks, especially in hidden layers, because they suffer from gradient saturation, which slows learning.

Visualizing the training process showed how quickly models converge with different activation functions and how well they generalize on the test data.