### Importing packages

In [187]:
# common libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Variance Threshold

In [188]:
from sklearn.feature_selection import VarianceThreshold

In [189]:
# toy matrix: the first and last columns hold the same value in every row
X = [
    [0, 2, 0, 3],
    [0, 1, 4, 3],
    [0, 1, 1, 3],
]

In [190]:
# Set the threshold for variance
threshold_value = 0.1

In [191]:
# Create the VarianceThreshold object
selector = VarianceThreshold(threshold=threshold_value)

In [192]:
# Fit the selector and keep only the columns whose variance passes the
# threshold. For this toy matrix the two constant columns (index 0 and 3)
# are removed, leaving the two middle columns.
X_filtered = selector.fit_transform(X)

In [193]:
X_filtered

array([[2, 0],
       [1, 4],
       [1, 1]])

### select k best

In [194]:
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import load_digits

#### f classif

In [195]:
from sklearn.feature_selection import f_classif

In [196]:
# data
X, y = load_digits(return_X_y=True)

X.shape, y.shape

((1797, 64), (1797,))

In [197]:
# Keep the 20 features with the highest ANOVA F-score against the digit label.
# NOTE(review): fitting emits a RuntimeWarning (`f = msb / msw`) and the score
# of feature 0 comes back as nan -- presumably because some pixel columns are
# constant, which makes the F statistic 0/0. Confirm, and consider dropping
# constant columns before scoring.
selector = SelectKBest(f_classif, k=20)
X_new = selector.fit_transform(X, y)

  f = msb / msw


In [198]:
X_new.shape

(1797, 20)

In [199]:
selector.scores_[:5]

array([         nan,  39.77670877, 161.66962131,  68.01284271,
        37.085693  ])

In [200]:
selector.pvalues_[:5]

array([            nan, 4.99834376e-065, 8.12770776e-224, 7.82141288e-108,
       1.03883191e-060])

#### chi square

In [201]:
from sklearn.feature_selection import chi2

In [202]:
# Keep the 20 features with the highest chi-squared statistic.
# chi2 requires non-negative feature values; presumably satisfied here because
# X holds the digit pixel intensities -- verify if the data source changes.
selector = SelectKBest(chi2, k=20)
X_new = selector.fit_transform(X, y)

In [203]:
X_new.shape

(1797, 20)

#### mutual info

In [204]:
from sklearn.feature_selection import mutual_info_classif

In [205]:
# mutual_info_classif is stochastic (it perturbs continuous features with small
# random noise), so without a fixed seed the scores -- and potentially the 20
# selected columns -- can change from run to run. Pin the seed for
# reproducibility.
def seeded_mutual_info(X, y):
    """Mutual information between each feature and y, with a fixed seed."""
    return mutual_info_classif(X, y, random_state=0)

selector = SelectKBest(seeded_mutual_info, k=20)
X_new = selector.fit_transform(X, y)

In [206]:
X_new.shape

(1797, 20)

#### r regression

In [207]:
from sklearn.feature_selection import r_regression
from sklearn.datasets import make_regression

In [208]:
# data
# random_state pins the synthetic dataset so every run of the notebook scores
# (and therefore selects) the same features.
X, y = make_regression(n_features=50, n_samples=100, bias=5, noise=5,
                       random_state=0)

X.shape, y.shape

((100, 50), (100,))

In [209]:
# r_regression returns the *signed* Pearson correlation, and SelectKBest keeps
# the k largest scores. Used directly, a feature with a strong negative
# correlation would be ranked last and discarded. Scoring on |r| ranks features
# by correlation strength regardless of direction.
def abs_r_regression(X, y):
    """Absolute Pearson correlation between each feature and y."""
    return np.abs(r_regression(X, y))

selector = SelectKBest(abs_r_regression, k=20)
X_new = selector.fit_transform(X, y)

In [210]:
X_new.shape

(100, 20)

### RFE

In [211]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [212]:
# data
from sklearn.datasets import make_friedman1

X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
X.shape, y.shape

((50, 10), (50,))

In [213]:
# Recursive feature elimination driven by a linear SVR: one feature is removed
# per round (step=1) until 5 remain.
estimator = SVR(kernel='linear')
selector = RFE(estimator=estimator, step=1, n_features_to_select=5)
filtered = selector.fit(X, y).transform(X)
filtered.shape

(50, 5)

In [214]:
selector.support_

array([ True,  True,  True,  True,  True, False, False, False, False,
       False])

In [215]:
selector.ranking_

array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])

### Select from model

In [216]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

In [217]:
# four samples, three features, binary labels
X = [
    [0.87, -1.34, 0.31],
    [-2.79, -0.02, -0.85],
    [-1.34, -0.48, -2.55],
    [1.92, 1.48, 0.65],
]
y = [0, 1, 0, 1]

In [218]:
# No threshold is passed, so SelectFromModel applies its default cut-off
# (per the scikit-learn docs, the mean absolute coefficient for a model
# without an L1 penalty); features at or above it are kept.
selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
filtered = selector.transform(X)

filtered.shape

(4, 1)

In [219]:
selector.estimator_.coef_

array([[-0.3252302 ,  0.83462377,  0.49750423]])

In [220]:
selector.get_support()

array([False,  True, False])

### count vectorizer

In [221]:
from sklearn.feature_extraction.text import CountVectorizer

In [222]:
# four short documents used as the toy text corpus
corpus = [
    'This is the first document.', 'This document is the second document.',
    'And this is the third one.', 'Is this the first document?',
]

In [223]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

In [224]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [225]:
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

### tf-idf

In [226]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [227]:
# text data (the same four documents as in the count vectorizer section, so
# the raw counts and the tf-idf weights can be compared directly)
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [228]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

In [229]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [230]:
X.shape

(4, 9)

In [231]:
# Round to 2 decimals: a bare round() rounds to whole numbers, which collapses
# every tf-idf weight to 0. or 1. and hides the actual values.
X.toarray().round(2)

array([[0.  , 0.47, 0.58, 0.38, 0.  , 0.  , 0.38, 0.  , 0.38],
       [0.  , 0.69, 0.  , 0.28, 0.  , 0.54, 0.28, 0.  , 0.28],
       [0.51, 0.  , 0.  , 0.27, 0.51, 0.  , 0.27, 0.51, 0.27],
       [0.  , 0.47, 0.58, 0.38, 0.  , 0.  , 0.38, 0.  , 0.38]])

## PCA

#### with sklearn

In [232]:
from sklearn.decomposition import PCA

In [233]:
# data
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

In [234]:
# Fit a 2-component sklearn PCA on the toy data.
# NOTE: the name `pca` is re-bound to a function in the "from scratch" section
# further down. After that cell has run, re-execute this cell before using the
# fitted-model attributes (`explained_variance_ratio_`, `transform`, ...).
pca = PCA(n_components=2)
pca.fit(X)

In [235]:
pca.explained_variance_ratio_

array([0.99244289, 0.00755711])

In [236]:
pca.singular_values_

array([6.30061232, 0.54980396])

In [237]:
pca.transform(X)

array([[ 1.38340578,  0.2935787 ],
       [ 2.22189802, -0.25133484],
       [ 3.6053038 ,  0.04224385],
       [-1.38340578, -0.2935787 ],
       [-2.22189802,  0.25133484],
       [-3.6053038 , -0.04224385]])

#### from scratch

In [238]:
def pca(X, num_components):
  """Project X onto its leading principal components (from-scratch PCA).

  Every column is standardized (zero mean, unit variance) before the
  covariance matrix is formed, so the components describe the correlation
  structure of X rather than its raw covariance.

  Parameters
  ----------
  X : array-like of shape (n_samples, n_features)
      Input data; converted to a float array.
  num_components : int
      Number of leading components to keep.

  Returns
  -------
  numpy.ndarray of shape (n_samples, num_components)
      Standardized data projected onto the eigenvectors with the largest
      eigenvalues, in decreasing eigenvalue order. The sign of each
      component is arbitrary.

  Note: earlier in this notebook the name `pca` is bound to a fitted
  sklearn PCA object; defining this function rebinds that name.
  """
  X = np.asarray(X, dtype=float)

  feature_std = np.std(X, axis=0)
  # A constant column has zero spread; dividing by 0 would turn it into NaN
  # and poison the whole covariance matrix. Dividing by 1 instead leaves the
  # centered column at exactly 0, so it simply contributes nothing.
  feature_std = np.where(feature_std == 0, 1.0, feature_std)
  X_standardized = (X - np.mean(X, axis=0)) / feature_std

  # np.cov returns a 0-d array for a single feature; keep it a matrix.
  covariance_matrix = np.atleast_2d(np.cov(X_standardized, rowvar=False))

  # The covariance matrix is symmetric, so use eigh: it guarantees real
  # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
  # eig can return complex values for (near-)degenerate matrices.
  eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

  # eigh sorts ascending; reorder so the largest-variance direction is first.
  sorted_indices = np.argsort(eigenvalues)[::-1]
  top_eigenvectors = eigenvectors[:, sorted_indices][:, :num_components]

  return np.dot(X_standardized, top_eigenvectors)

In [239]:
# testing
dummy_data = np.array([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]])

# Perform PCA and reduce to 2 components
num_components = 2
result = pca(dummy_data, num_components)

print("Original data:\n", dummy_data)
print("\nPCA result:\n", result)

Original data:
 [[1 2 3]
 [4 5 6]
 [7 8 9]]

PCA result:
 [[-2.12132034e+00  1.68306266e-16]
 [ 0.00000000e+00  0.00000000e+00]
 [ 2.12132034e+00 -1.68306266e-16]]


## ICA

In [240]:
from sklearn.datasets import load_digits
from sklearn.decomposition import FastICA

In [241]:
#data
X, _ = load_digits(return_X_y=True)

X.shape

(1797, 64)

In [242]:
# model: 7 components, fixed seed, explicit whitening mode
transformer = FastICA(whiten='unit-variance', n_components=7, random_state=0)

In [243]:
X_transformed = transformer.fit_transform(X)



In [244]:
X_transformed.shape

(1797, 7)

## t-SNE

In [245]:
from sklearn.manifold import TSNE

In [246]:
# data
X = np.array([[0, 0, 0],
              [0, 1, 1],
              [1, 0, 1],
              [1, 1, 1]])

In [247]:
# init='random' makes the embedding depend on the random number generator;
# fixing random_state keeps the coordinates identical across reruns.
model = TSNE(n_components=2, learning_rate='auto',
             init='random', perplexity=3, random_state=0)

In [248]:
X_transformed = model.fit_transform(X)

In [249]:
X_transformed.shape

(4, 2)

## LDA

In [250]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [251]:
# data: two classes of three points each, mirrored through the origin
X = np.array([
    [-1, -1],
    [-2, -1],
    [-3, -2],
    [1, 1],
    [2, 1],
    [3, 2],
])
y = np.repeat([1, 2], 3)

In [252]:
clf = LinearDiscriminantAnalysis()
clf.fit(X, y)

In [253]:
clf.predict([[-0.8, -1]])

array([1])

## NMF

In [254]:
from sklearn.decomposition import NMF

In [255]:
# data
X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])

In [256]:
model = NMF(n_components=2, init='random', random_state=0)

In [257]:
W = model.fit_transform(X)
W

array([[0.        , 0.46880684],
       [0.55699523, 0.3894146 ],
       [1.00331638, 0.41925352],
       [1.6733999 , 0.22926926],
       [2.34349311, 0.03927954],
       [2.78981512, 0.06911798]])

In [258]:
model.components_

array([[2.09783018, 0.30560234],
       [2.13443044, 2.13171694]])