# Drop Highly Correlated Features
### Preliminaries

In [2]:
# Load libraries
import numpy as np
import pandas as pd

### Load Data

In [3]:
# Toy feature matrix: columns 0 and 1 move almost in lockstep (they only
# diverge in the last two rows), while column 2 just alternates 1/0.
feature_rows = [
    [1, 1, 1],
    [2, 2, 0],
    [3, 3, 1],
    [4, 4, 0],
    [5, 5, 1],
    [6, 6, 0],
    [7, 7, 1],
    [8, 7, 0],
    [9, 7, 1],
]
X = np.array(feature_rows)

# Wrap the array in a DataFrame (default integer row/column labels)
df = pd.DataFrame(data=X)

# Display the resulting frame
df

Unnamed: 0,0,1,2
0,1,1,1
1,2,2,0
2,3,3,1
3,4,4,0
4,5,5,1
5,6,6,0
6,7,7,1
7,8,7,0
8,9,7,1


### Identify Highly Correlated Features

In [5]:
# Pairwise Pearson correlations between columns; take the absolute value so
# strong negative correlations count the same as strong positive ones.
corr_matrix = df.corr(method="pearson").abs()
corr_matrix

Unnamed: 0,0,1,2
0,1.0,0.976103,0.0
1,0.976103,1.0,0.034503
2,0.0,0.034503,1.0


In [7]:
# Keep only the strict upper triangle of the correlation matrix.
# k=1 excludes the main diagonal, so each feature's correlation with itself
# is dropped and every pair of features appears exactly once; all other
# cells are masked to NaN by DataFrame.where.
# The mask is built with the builtin `bool` dtype: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)
upper

Unnamed: 0,0,1,2
0,,0.976103,0.0
1,,,0.034503
2,,,


In [9]:
# Collect the label of every column that has at least one entry above 0.95
# in the upper-triangle matrix (NaN cells compare as False and are ignored).
to_drop = [
    col
    for col in upper.columns
    if (upper[col] > 0.95).any()
]

### Drop Marked Features

In [None]:
# Drop features