### Jeremy Tan

### Lab 8

### Task 1

In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [2]:
df = pd.read_csv('objects2.csv', index_col='Sample')
df.head(10)

Unnamed: 0_level_0,Size,Weight,Intensity,Value
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,25,249,43,80
2,32,320,82,81
3,10,102,61,79
4,64,650,69,80
5,88,873,73,82
6,12,121,48,78
7,66,651,42,79
8,37,380,27,80
9,54,549,92,81
10,77,764,55,79


In [3]:
# Standard Scaling
standard_scaler = StandardScaler()
df_standard_scaled = pd.DataFrame(
    standard_scaler.fit_transform(df),
    columns=df.columns,
    index=df.index
)

# For Min-Max Scaling
minmax_scaler = MinMaxScaler()
df_minmax_scaled = pd.DataFrame(
    minmax_scaler.fit_transform(df),
    columns=df.columns,
    index=df.index
)

In [4]:
df_standard_scaled

Unnamed: 0_level_0,Size,Weight,Intensity,Value
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.831829,-0.846372,-0.85103,0.088045
2,-0.561001,-0.569321,1.197746,0.968496
3,-1.412175,-1.419985,0.094559,-0.792406
4,0.67707,0.718382,0.514821,0.088045
5,1.605623,1.588557,0.724952,1.848947
6,-1.334795,-1.345844,-0.588366,-1.672857
7,0.754449,0.722284,-0.903563,-0.792406
8,-0.367552,-0.335193,-1.691554,0.088045
9,0.290173,0.324267,1.723073,0.968496
10,1.180036,1.163225,-0.220637,-0.792406


In [5]:
df_minmax_scaled

Unnamed: 0_level_0,Size,Weight,Intensity,Value
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.192308,0.190661,0.246154,0.5
2,0.282051,0.28275,0.846154,0.75
3,0.0,0.0,0.523077,0.25
4,0.692308,0.710765,0.646154,0.5
5,1.0,1.0,0.707692,1.0
6,0.025641,0.024643,0.323077,0.0
7,0.717949,0.712062,0.230769,0.25
8,0.346154,0.360571,0.0,0.5
9,0.564103,0.579767,1.0,0.75
10,0.858974,0.858625,0.430769,0.25


In [6]:
def perform_pca(data, name):
    pca = PCA()
    principal_components = pca.fit_transform(data)
    
    variance_ratios = pca.explained_variance_ratio_
    
    loadings = pca.components_[0]
    
    return {
        'name': name,
        'pca': pca,
        'principal_components': principal_components,
        'variance_ratios': variance_ratios,
        'pc1_loadings': loadings,
        'cumulative_variance': np.cumsum(variance_ratios)
    }

pca_results = {
    'original': perform_pca(df, 'Original'),
    'standard': perform_pca(df_standard_scaled, 'Standard Normalized'),
    'minmax': perform_pca(df_minmax_scaled, 'Min-Max Normalized')
}

print("PCA Results Comparison")
print("======================\n")

print("Variance Ratios for Each Principal Component")
print("-------------------------------------------")
for dataset_name, results in pca_results.items():
    print(f"\n{results['name']}:")
    for i, ratio in enumerate(results['variance_ratios']):
        print(f"PC{i+1}: {ratio:.6f} ({results['cumulative_variance'][i]:.6f} cumulative)")

print("\n\nPC1 Loadings (Feature Importance/Direction)")
print("-------------------------------------------")
for dataset_name, results in pca_results.items():
    print(f"\n{results['name']}:")
    for i, (feature, loading) in enumerate(zip(df.columns, results['pc1_loadings'])):
        print(f"{feature}: {loading:.6f}")


PCA Results Comparison

Variance Ratios for Each Principal Component
-------------------------------------------

Original:
PC1: 0.994870 (0.994870 cumulative)
PC2: 0.005114 (0.999984 cumulative)
PC3: 0.000011 (0.999995 cumulative)
PC4: 0.000005 (1.000000 cumulative)

Standard Normalized:
PC1: 0.631511 (0.631511 cumulative)
PC2: 0.272298 (0.903809 cumulative)
PC3: 0.096123 (0.999932 cumulative)
PC4: 0.000068 (1.000000 cumulative)

Min-Max Normalized:
PC1: 0.663100 (0.663100 cumulative)
PC2: 0.254122 (0.917222 cumulative)
PC3: 0.082700 (0.999922 cumulative)
PC4: 0.000078 (1.000000 cumulative)


PC1 Loadings (Feature Importance/Direction)
-------------------------------------------

Original:
Size: 0.100301
Weight: 0.994791
Intensity: 0.018074
Value: 0.002005

Standard Normalized:
Size: 0.568171
Weight: 0.570098
Intensity: 0.358882
Value: 0.472625

Min-Max Normalized:
Size: 0.622737
Weight: 0.626086
Intensity: 0.277921
Value: 0.378119


Questions:
1. How many principal components are required to achieve at least 90% variance coverage?
2. Which original feature contributes most to the variance (using PC1 as basis)?

Answers:
1. For the original dataset, the first principal component already achieved more than 90% variance coverage. For the standard normalized and min-max normalized datasets, the first two principal components were required to achieve 90% variance coverage.
2. Using the first principal component as basis, the weight feature contributed the most to the variance of the first principal component in the original dataset. For the standard and min-max normalized datasets, the weight feature contributed the most to the variance of the first principal component.

Consider the standard normalized version.

When the dataset was standard normalized, the size and weight features had similar values. The amount of variance explained by the size and weight features for the first principal component was roughly the same, with the weight feature contributing slightly more to the principal component. 

Comparing the loadings of the standard normalized and original datasets, the contribution of the intensity and value features on the first principal component greatly increased. In the original dataset, the PCA analysis became skewed since the scale of the weight feature was much greater than the other features. This scale difference caused the weight feature to dominate the first principal component, overshadowing the contributions of the other three features.

In [7]:
pca_results

{'original': {'name': 'Original',
  'pca': PCA(),
  'principal_components': array([[-2.18219224e+02, -1.22389596e+01,  8.00159495e-01,
           4.97986532e-01],
         [-1.46180073e+02,  2.54740754e+01,  5.09387337e-01,
           5.55097556e-01],
         [-3.65634683e+02,  8.39942427e+00, -4.68546300e-01,
           1.37499899e-01],
         [ 1.85073605e+02,  6.45302144e+00, -1.86335973e-01,
          -1.09302132e+00],
         [ 4.09395515e+02,  6.43329919e+00,  9.21671566e-01,
           7.65012026e-01],
         [-3.46770018e+02, -4.96896308e+00, -1.08252163e+00,
          -4.60228777e-02],
         [ 1.85778999e+02, -2.05956648e+01, -8.12732951e-01,
           4.81342172e-01],
         [-8.69871807e+01, -3.06038123e+01,  1.42151039e+00,
          -7.25521225e-01],
         [ 8.40144133e+01,  3.13063125e+01,  1.65060698e-01,
          -6.13112373e-01],
         [ 2.99528647e+02, -9.65873312e+00, -1.26765263e+00,
           4.07396083e-02]]),
  'variance_ratios': array([9.9486

### Task 2

In [8]:
wine_df = pd.read_csv("wine.csv")

In [9]:
wine_df

Unnamed: 0,Wine Variety,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total Phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [10]:
# Standard Scaling
standard_scaler = StandardScaler()
wind_df_standard = pd.DataFrame(
    standard_scaler.fit_transform(wine_df),
    columns=wine_df.columns,
    index=wine_df.index
)

In [11]:
pca = PCA().fit(wind_df_standard)
X_pca = pca.transform(wind_df_standard)

In [12]:
X_pca

array([[-3.52293390e+00,  1.45309844e+00, -1.64795488e-01, ...,
         5.52927766e-01, -3.02978176e-01,  2.00857450e-01],
       [-2.52885806e+00, -3.30019252e-01, -2.02670665e+00, ...,
         3.94971160e-01, -1.46645308e-01,  1.26402355e-01],
       [-2.78502898e+00,  1.03693595e+00,  9.83237703e-01, ...,
         1.89799314e-03,  2.12780166e-02, -5.57515155e-02],
       ...,
       [ 3.02727243e+00,  2.75604024e+00, -9.40803036e-01, ...,
         6.93336340e-01,  1.67035660e-01, -3.16957430e-02],
       [ 2.75522166e+00,  2.29378408e+00, -5.50473677e-01, ...,
         3.44119826e-01, -1.09514873e-01,  1.02823104e-01],
       [ 3.49633565e+00,  2.76060799e+00,  1.01315115e+00, ...,
        -1.89866131e-01, -1.64090011e-01, -2.64916504e-01]])

In [13]:
variance_ratios = pca.explained_variance_ratio_
variance_ratios

array([0.39542486, 0.17836259, 0.10329102, 0.06627984, 0.06267875,
       0.0480556 , 0.03955707, 0.02500244, 0.02103871, 0.01873615,
       0.01613203, 0.01205691, 0.00925458, 0.00412945])

In [14]:
for i, ratio in enumerate(variance_ratios[:5]):
    print(f'Principal Component {i+1}: {ratio}')
print(f'The total variance explained by the first five principal components is {sum(variance_ratios[:5])}')

Principal Component 1: 0.3954248599255516
Principal Component 2: 0.1783625890830064
Principal Component 3: 0.10329101590003709
Principal Component 4: 0.0662798448677364
Principal Component 5: 0.06267875127845521
The total variance explained by the first five principal components is 0.8060370610547867
