In [None]:
# Download the Wine dataset from the UCI repository and save it locally.

import requests

# URL of the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"

# Destination file path to save the downloaded dataset
destination_file = "wine.data"

# Send a GET request to download the dataset.
# requests applies no timeout by default, so an unresponsive server would
# block the kernel forever; bound the wait explicitly instead.
response = requests.get(url, timeout=30)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Save the raw bytes exactly as received
    with open(destination_file, 'wb') as file:
        file.write(response.content)
    print("Dataset downloaded successfully.")
else:
    print("Failed to download the dataset. Status code:", response.status_code)


In [None]:
# Read the downloaded wine data into a Pandas dataframe and preview it.
import pandas as pd

# Location of the file written by the download step
dataset_file = "wine.data"

# header=None: keep the first record as data instead of using it as column names
df = pd.read_csv(filepath_or_buffer=dataset_file, header=None)

# Quick sanity check: show the leading five rows
print(df.head(n=5))


In [None]:

# Split the dataset into features and target variables.
import pandas as pd

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# In wine.data the class label (cultivar 1-3) is the FIRST column and the
# remaining 13 columns are the chemical measurements (see the UCI wine.names
# description). Taking the last column as the target would use "proline" as
# the label and leak the true class into the feature matrix.
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Display the shape of the feature matrix and target variable
print("Feature matrix shape:", X.shape)
print("Target variable shape:", y.shape)


In [None]:
# Perform data preprocessing (e.g., scaling, normalisation, missing value imputation) as necessary.

import pandas as pd
# `sklearn.preprocessing.Imputer` was removed from scikit-learn; its
# replacement is `sklearn.impute.SimpleImputer`.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# The class label is the first column of wine.data; the other columns are features
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Perform data preprocessing steps

# Impute missing values first (mean imputation) so that the scalers below
# are fitted on complete data
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Scaling the features using StandardScaler (zero mean, unit variance)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Normalizing the features using MinMaxScaler (each feature mapped to [0, 1])
normalizer = MinMaxScaler()
X_normalized = normalizer.fit_transform(X_imputed)

# Display the preprocessed data
print("Scaled data:\n", X_scaled)
print("Normalized data:\n", X_normalized)
print("Imputed data:\n", X_imputed)


In [None]:
# Implement PCA on the preprocessed dataset using the scikit-learn library.
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# The class label is the first column of wine.data; keeping it out of X
# prevents the label from being fed into PCA as if it were a measurement.
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Standardise the features: PCA is variance-based, so unscaled features with
# large numeric ranges would otherwise dominate the components
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Specify the number of principal components to keep
X_pca = pca.fit_transform(X_scaled)

# Display the explained variance ratio
print("Explained variance ratio:", pca.explained_variance_ratio_)

# Display the transformed data after PCA
print("Transformed data after PCA:\n", X_pca)


In [None]:
# Determine the optimal number of principal components to retain based on the explained variance ratio.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# The class label is the first column of wine.data; the other columns are features
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Preprocess the features by scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA keeping every component so the full variance spectrum is available
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Calculate the cumulative explained variance ratio
cumulative_var_ratio = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components whose cumulative explained variance reaches
# the threshold. argmax on a boolean array returns the first True index.
variance_threshold = 0.95
n_components_optimal = int(np.argmax(cumulative_var_ratio >= variance_threshold)) + 1
print("Components needed to explain {:.0%} of the variance:".format(variance_threshold),
      n_components_optimal)

# Plot the cumulative explained variance ratio
plt.plot(range(1, len(cumulative_var_ratio) + 1), cumulative_var_ratio)
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio vs. Number of Principal Components')
plt.grid(True)
plt.show()


In [None]:
# Visualise the results of PCA using a scatter plot.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# The class label is the first column of wine.data. Using the last column
# here would colour the points by a continuous feature instead of by class.
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Preprocess the features by scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Specify the number of principal components to retain
X_pca = pca.fit_transform(X_scaled)

# Create a scatter plot of the transformed data, coloured by the true class
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Scatter Plot')
plt.grid(True)
plt.show()


In [None]:
# Perform clustering on the PCA-transformed data using K-Means clustering algorithm.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# The class label is the first column of wine.data; it must stay out of X so
# the clustering is genuinely unsupervised.
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Preprocess the features by scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Specify the number of principal components to retain
X_pca = pca.fit_transform(X_scaled)

# Perform K-Means clustering.
# random_state pins the centroid initialisation so Restart & Run All gives the
# same clusters; n_init is set explicitly because its default differs across
# scikit-learn versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # Specify the number of clusters
kmeans.fit(X_pca)

# Get the cluster labels
cluster_labels = kmeans.labels_

# Create a scatter plot of the PCA-transformed data with cluster labels
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('K-Means Clustering on PCA-Transformed Data')
plt.grid(True)
plt.show()


# Interpret the results of PCA and clustering analysis.

+ Interpreting the results of PCA and clustering analysis involves understanding the information captured by PCA and the grouping of data points into clusters.

## PCA Results:

+ Principal Component 1: This component represents the direction in the data with the highest variance. It captures the most significant patterns or trends in the original features. The x-axis of the scatter plot represents the variation along this component.
+ Principal Component 2: This component represents the direction in the data orthogonal to Principal Component 1 with the second highest variance. It captures additional patterns or trends in the original features. The y-axis of the scatter plot represents the variation along this component.
+ Explained Variance Ratio: The explained variance ratio indicates the proportion of variance in the original features explained by each principal component. It helps assess how much information is retained by the chosen number of components. The cumulative explained variance ratio curve shows the cumulative amount of variance explained as the number of components increases. In the example code, it is plotted to help determine the optimal number of components to retain.

## Clustering Analysis:

+ K-Means Clustering: The K-Means algorithm is applied to the PCA-transformed data to group similar data points into clusters based on their distances in the reduced feature space. In the scatter plot, the data points are color-coded according to the assigned clusters.

## Interpretation:
By combining PCA and clustering analysis, we can gain insights into the structure of the dataset. Here are some possible interpretations:

+ Separation of Clusters: If the clusters in the scatter plot appear well-separated and distinct, it indicates that the K-Means algorithm successfully identified groups of similar data points based on the PCA-transformed features. The distance between the clusters implies that the identified groups have different characteristics or patterns.

+ Overlapping Clusters: If the clusters overlap or have a less clear separation, it suggests that the data points might have some inherent complexity or noise, making it challenging to distinctly group them. In such cases, alternative clustering algorithms or feature engineering approaches might be explored.

+ Relationship with Principal Components: By examining the scatter plot in conjunction with the PCA results, you can analyze how the clusters are related to the principal components. It helps understand which combinations of features contribute most to the cluster separation or overlap.

+ Interpretation of Clusters: Based on domain knowledge or further analysis, you can interpret the clusters in terms of their characteristics or behaviors. For example, in the wine dataset, if the clusters correspond to different types of wine, you can assign labels to the clusters and infer that the algorithm successfully identified distinct wine types based on the PCA-transformed features.

It's important to note that the interpretation may vary depending on the specific dataset, the number of principal components retained, the number of clusters, and the domain knowledge. Therefore, it's crucial to analyze and interpret the results in the context of the specific problem or analysis being conducted.

# Deliverables:

## PCA and Clustering Analysis Report

In this report, we summarize the results of Principal Component Analysis (PCA) and clustering analysis on the wine dataset.

## 1. Principal Component Analysis (PCA) Results:

PCA was performed on the wine dataset to reduce the dimensionality of the data and capture its inherent patterns and trends. The following key results were obtained:

+ Principal Component 1: This component represents the direction in the data with the highest variance. It captures the most significant patterns or trends in the original features.

+ Principal Component 2: This component represents the direction in the data orthogonal to Principal Component 1 with the second highest variance. It captures additional patterns or trends in the original features.

+ Explained Variance Ratio: The explained variance ratio indicates the proportion of variance in the original features explained by each principal component. It helps assess how much information is retained by the chosen number of components. The cumulative explained variance ratio curve showed the cumulative amount of variance explained as the number of components increased.

## 2. Clustering Analysis Results:

K-Means clustering was applied to the PCA-transformed data to group similar data points into clusters based on their distances in the reduced feature space. The following results were observed:

+ Cluster Separation: The scatter plot of the PCA-transformed data with cluster labels showed the separation or overlap of clusters. If the clusters appeared well-separated, it indicated that the K-Means algorithm successfully identified groups of similar data points based on the PCA-transformed features. If the clusters overlapped, it suggested inherent complexity or noise in the data, making it challenging to distinctly group the points.

+ Relationship with Principal Components: The relationship between the clusters and the principal components was examined to understand which combinations of features contributed most to the cluster separation or overlap. The clustering results were analyzed in conjunction with the PCA results to gain insights into the structure of the dataset.

## 3. Interpretation:

Based on the results obtained, the following interpretations can be made:

+ The PCA analysis revealed the directions in the data with the highest variances, which represent the most significant patterns or trends in the original features.

+ The clustering analysis, based on the PCA-transformed data, successfully grouped similar data points into clusters. The scatter plot visually depicted the separation or overlap of clusters.

+ The interpretation of the clusters could be performed based on domain knowledge or further analysis. For example, in the wine dataset, if the clusters correspond to different types of wine, it can be inferred that the algorithm successfully identified distinct wine types based on the PCA-transformed features.

## 4. Recommendations and Further Analysis:

+ The optimal number of principal components to retain can be determined based on the explained variance ratio. It is important to strike a balance between reducing dimensionality and retaining sufficient information. The cumulative explained variance ratio curve can help in making an informed decision about the number of components to retain.

+ Additional clustering algorithms or feature engineering approaches can be explored to validate and compare the results obtained from K-Means clustering. Different algorithms may provide different perspectives on the grouping of data points.

+ The interpretation of the clusters should be further analyzed and validated with domain knowledge or additional statistical techniques. This will help in gaining a deeper understanding of the underlying patterns and characteristics of the data.

## Conclusion:

+ PCA and clustering analysis provided insights into the wine dataset by reducing its dimensionality and identifying groups of similar data points. The results of PCA helped understand the significant patterns and trends in the data, while the clustering analysis allowed for the identification of distinct clusters. These findings contribute to a better understanding of the dataset and can guide further analysis or decision-making processes.

+ It is important to note that the interpretation and conclusions drawn from the analysis are specific to the wine dataset used in this report. The findings may vary for different datasets and should be interpreted within the context of the specific problem or analysis being conducted.

In [None]:
# Scatter plot showing the results of PCA.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# wine.data stores the class label in its first column; the remaining columns
# are the features. Splitting on the last column would colour the plot by a
# continuous measurement and leak the label into PCA.
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Preprocess the features by scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=2)  # Specify the number of principal components to retain
X_pca = pca.fit_transform(X_scaled)

# Create a scatter plot of the transformed data, coloured by the true class
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA Scatter Plot')
plt.grid(True)
plt.show()


In [None]:
# A table showing the performance metrics for the clustering algorithm.
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.preprocessing import StandardScaler

# Path to the downloaded dataset
dataset_file = "wine.data"

# Load the dataset into a Pandas dataframe (the file has no header row)
df = pd.read_csv(dataset_file, header=None)

# The class label is the first column of wine.data. It is the ground truth
# for the Adjusted Rand Index, so it must not be part of the feature matrix.
X = df.iloc[:, 1:]  # Features (all columns except the first one)
y = df.iloc[:, 0]   # Target variable (first column)

# Evaluate the same pipeline the notebook clusters with elsewhere:
# standardise, project onto two principal components, then run K-Means.
# Clustering the raw features would let the largest-valued columns dominate
# the Euclidean distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Perform K-Means clustering (seeded so the reported metrics are reproducible)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # Specify the number of clusters
kmeans.fit(X_pca)

# Get the cluster labels
cluster_labels = kmeans.labels_

# Calculate performance metrics
# Silhouette: cohesion/separation in the space that was actually clustered.
silhouette = silhouette_score(X_pca, cluster_labels)
# ARI: agreement between the clusters and the true classes.
ari = adjusted_rand_score(y, cluster_labels)

# Create a table to display the performance metrics
metrics_table = pd.DataFrame({'Silhouette Score': [silhouette], 'Adjusted Rand Index (ARI)': [ari]})
print(metrics_table)
