# PCA (Feature Extraction)

## Dataset: Graduate Admissions

In [1]:
import pandas as pd

In [2]:
# Load the graduate-admissions table from the CSV in the working directory.
df = pd.read_csv("grad_admit_.csv")

# Preview the first five rows (5 is also head()'s default).
df.head(n=5)

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,GPA,Research,Admitted
0,337,118,4,4.5,4.5,3.89,1,1
1,324,107,4,4.0,4.5,3.58,1,1
2,316,104,3,3.0,3.5,3.23,1,0
3,322,110,3,3.5,2.5,3.5,1,1
4,314,103,2,2.0,3.0,3.31,0,0


### Train/Test Split and Standardize

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set information leaks in.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

---

## Display the proportional importance of each feature

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the standardized training data to rank the features.
# random_state is pinned (same seed as the train/test split) because the forest
# is randomized: without it the importances change on every run and the
# notebook's recorded output cannot be reproduced.
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train_std, y_train)

# feature_importances_ are normalized, so they sum to 1; entries follow the
# column order of X (i.e. df's columns minus the final target column).
rf.feature_importances_

array([0.22856882, 0.14750039, 0.07919014, 0.09460759, 0.07213119,
       0.33659486, 0.041407  ])

---

---

# Principal Component Analysis (PCA)

In [5]:
from sklearn.decomposition import PCA

# Fit PCA on the standardized training data with the default n_components,
# which keeps every component, then project the test split with the same
# fitted transform (the test data is never used for fitting).
pca = PCA()
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

### Explained variance ratios of all 7 of the extracted principal components

In [6]:
# Share of the total variance explained by each fitted principal component.
pca.explained_variance_ratio_

array([0.69792769, 0.10935086, 0.0776954 , 0.04027621, 0.03213353,
       0.02288612, 0.01973018])

---

In [7]:
from sklearn.decomposition import PCA

# Refit PCA keeping only 3 components.
# NOTE: this rebinds `pca`, `X_train_pca` and `X_test_pca`, replacing the
# all-component versions created by the earlier PCA cell; cells below that use
# these names get the 3-component projection.
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

### Explained variance ratios of the first 3 (highest-variance) principal components

In [8]:
# Share of the total variance explained by each of the retained components.
pca.explained_variance_ratio_

array([0.69792769, 0.10935086, 0.0776954 ])

---

# Machine Learning

## Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression trained on all standardized features.
# fit() returns the estimator itself, so log_reg is the fitted model.
log_reg = LogisticRegression().fit(X_train_std, y_train)

# Mean accuracy on the held-out standardized test split.
log_reg.score(X_test_std, y_test)

0.9

#### Using PCA

In [10]:
# Refit the same log_reg estimator on the PCA-projected features (from this
# point on log_reg holds the PCA-based model), then report its mean accuracy
# on the projected test split.
log_reg.fit(X_train_pca, y_train).score(X_test_pca, y_test)

0.9083333333333333

---

## K-Nearest Neighbors

In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 5-nearest-neighbours classifier on all standardized features.
# fit() returns the estimator itself, so knn is the fitted model.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)

# Mean accuracy on the held-out standardized test split.
knn.score(X_test_std, y_test)

0.8666666666666667

#### Using PCA

In [12]:
# Refit the same knn estimator on the PCA-projected features (from this point
# on knn holds the PCA-based model), then report its mean accuracy on the
# projected test split.
knn.fit(X_train_pca, y_train).score(X_test_pca, y_test)

0.8916666666666667

---