In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [6]:
data = pd.read_csv("objects2.csv", index_col = 0)
data.head()

Unnamed: 0_level_0,Size,Weight,Intensity,Value
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,25,249,43,80
2,32,320,82,81
3,10,102,61,79
4,64,650,69,80
5,88,873,73,82


In [37]:
results = {}

scalers = {
    "Not Scaled": None,
    "MinMax": MinMaxScaler(),
    "Standard": StandardScaler()
}

for name, scaler in scalers.items():
    if scaler is not None:
        data_scaled =   scaler.fit_transform(data)
    else:
        data_scaled = data.values

    pca = PCA()
    pca.fit(data_scaled)

    var_ratios = pca.explained_variance_ratio_
    pc1_loadings = pca.components_[0]
    cumulative_variance = np.cumsum(var_ratios)
    num_components_90 = np.argmax(cumulative_variance >= 0.9) + 1
    top_feature_index = np.argmax(np.abs(pc1_loadings))
    top_feature = data.columns[top_feature_index]

    results[name] = {
        "Variance Ratios": var_ratios,
        "PC1 Loadings": dict(zip(data.columns, pc1_loadings)),
        "Components >= 90%": num_components_90,
        "Top Feature in PC1": top_feature
    }

for version, info in results.items():
    print("\nVariance Ratios:", [float(f"{v:.6f}") for v in info["Variance Ratios"]])
    print("PC1 Loadings:", {k: float(round(v, 4)) for k, v in info["PC1 Loadings"].items()})
    print("Components to reach >= 90% variance:", info["Components >= 90%"])
    print("Top contributing feature in PC1:", info["Top Feature in PC1"])



Variance Ratios: [0.99487, 0.005114, 1.1e-05, 5e-06]
PC1 Loadings: {'Size': 0.1003, 'Weight': 0.9948, 'Intensity': 0.0181, 'Value': 0.002}
Components to reach >= 90% variance: 1
Top contributing feature in PC1: Weight

Variance Ratios: [0.6631, 0.254122, 0.0827, 7.8e-05]
PC1 Loadings: {'Size': 0.6227, 'Weight': 0.6261, 'Intensity': 0.2779, 'Value': 0.3781}
Components to reach >= 90% variance: 2
Top contributing feature in PC1: Weight

Variance Ratios: [0.631511, 0.272298, 0.096123, 6.8e-05]
PC1 Loadings: {'Size': 0.5682, 'Weight': 0.5701, 'Intensity': 0.3589, 'Value': 0.4726}
Components to reach >= 90% variance: 2
Top contributing feature in PC1: Weight


## How does PCA respond to similar or almost-identical features (e.g., Size and Weight)?
We are considering that size and weight are both features that are nearly identical or highly correlated. In this case, PCA becomes more sensitive to the structure of the data rather than the absolute magnitudes of the features. It will treat this redundancy by capturing the shared information within a single principal component. This is because PCA seeks directions in feature space that maximize variance while remaining uncorrelated to one another. If Size and Weight both vary together across samples (i.e. as Size increases, Weight does too, and vice versa), PCA doesn’t "double count" this information. Instead, it combines them into the same direction in the transformed space (usually the first principal component), with each contributing a significant, and possibly opposite-signed, loading. As a result, this would imply a strong PC1, capturing the joint variance of Size and Weight, and a weaker PC2.

## What is the impact of normalization on the Intensity and Value features of this dataset with respect to PCA?
Without normalization, features with larger numeric scales (such as Intensity, if its values are much higher than others) would dominate the principal components, regardless of their actual relevance or variance structure. PCA on unnormalized data tends to overweight high-magnitude features and underrepresent others. However, in the standard normalized version, all features are transformed to have mean 0 and standard deviation 1, effectively placing them on the same scale. This avoids PCA assigning biased weight to large-valued features, and instead compares the relative variance between features equitably.

In [24]:
wine = pd.read_csv("wine.csv")
# wine.describe()
wine.head()

Unnamed: 0,Wine Variety,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [35]:
# 1) Output the variance ratios of the principal components

std = StandardScaler()
wine_scaled = std.fit_transform(wine)
pca = PCA()
pca.fit(wine_scaled)

var_ratios = pca.explained_variance_ratio_

# 2) Answer this question:  how much variance is represented by the first 5 principal components?
var5_sum = np.sum(var_ratios[:5])

# Output
print("Explained Variance Ratios:")
for i, var in enumerate(var_ratios):
    print(f"PC{i+1}: {var:.4f}")

print(f"\nTotal variance explained by first 5 components: {var5_sum:.4f}")

Explained Variance Ratios:
PC1: 0.3954
PC2: 0.1784
PC3: 0.1033
PC4: 0.0663
PC5: 0.0627
PC6: 0.0481
PC7: 0.0396
PC8: 0.0250
PC9: 0.0210
PC10: 0.0187
PC11: 0.0161
PC12: 0.0121
PC13: 0.0093
PC14: 0.0041

Total variance explained by first 5 components: 0.8060
