# Dimensionality Reduction

## PCA

In [91]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_percentage_error,
)

In [92]:
df = pd.read_csv("./datasets/housePrice.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [93]:
df = df.drop(columns=["date"])

In [94]:
X = df.drop(columns=["price"])
y = df["price"]

In [95]:
numerical_features = X.select_dtypes(exclude=["object"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns
print(numerical_features)
print(categorical_features)

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated'],
      dtype='object')
Index(['street', 'city', 'statezip', 'country'], dtype='object')


In [96]:
numerical_pipe = Pipeline(
    [("impute", SimpleImputer(strategy="mean")), ("minMax", MinMaxScaler())]
)
categorical_pipe = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessing_pipe = ColumnTransformer(
    [
        ("numerical", numerical_pipe, numerical_features),
        # ('categorical',categorical_pipe , categorical_features)
    ]
)
X = preprocessing_pipe.fit_transform(X)

In [97]:
lr = LabelEncoder()
X = df.select_dtypes(exclude=["object"])

In [98]:
from sklearn.decomposition import PCA

In [99]:
type(X)

pandas.core.frame.DataFrame

In [100]:
pca = PCA(n_components=4)
X = pca.fit_transform(X)

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [102]:
model = LinearRegression()
model.fit(X_train, y_train)

In [103]:
y_pred = model.predict(X_test)
mean_absolute_percentage_error(y_pred, y_test)

0.010869670163211262

## LDA

In [104]:
df = pd.read_csv("./datasets/cancer.csv")
print(df)
X = df.select_dtypes(exclude=["object"])
y = df["diagnosis"]
numerical_features = X.select_dtypes(exclude=["object"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns
numerical_pipe = Pipeline(
    [("impute", SimpleImputer(strategy="mean")), ("minMax", MinMaxScaler())]
)
categorical_pipe = Pipeline(
    [
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oe", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessing_pipe = ColumnTransformer(
    [
        ("numerical", numerical_pipe, numerical_features),
        # ('categorical',categorical_pipe , categorical_features)
    ]
)
X = preprocessing_pipe.fit_transform(X)
y = LabelEncoder().fit_transform(y)

           id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean   
0      842302         M        17.99         10.38          122.80     1001.0  \
1      842517         M        20.57         17.77          132.90     1326.0   
2    84300903         M        19.69         21.25          130.00     1203.0   
3    84348301         M        11.42         20.38           77.58      386.1   
4    84358402         M        20.29         14.34          135.10     1297.0   
..        ...       ...          ...           ...             ...        ...   
564    926424         M        21.56         22.39          142.00     1479.0   
565    926682         M        20.13         28.25          131.20     1261.0   
566    926954         M        16.60         28.08          108.30      858.1   
567    927241         M        20.60         29.33          140.10     1265.0   
568     92751         B         7.76         24.54           47.92      181.0   

     smoothness_mean  compa

In [105]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis()
lda.fit_transform(X, y)

array([[ 3.32573950e+00],
       [ 2.32980231e+00],
       [ 3.74168594e+00],
       [ 4.02099034e+00],
       [ 2.27542857e+00],
       [ 1.61359707e+00],
       [ 2.36099631e+00],
       [ 1.26548864e+00],
       [ 1.61020071e+00],
       [ 3.84779257e+00],
       [ 8.74434723e-01],
       [ 2.38966562e+00],
       [ 1.32604413e+00],
       [ 4.39305589e-01],
       [ 1.23446201e+00],
       [ 3.05707022e+00],
       [ 2.11843040e+00],
       [ 3.15449180e+00],
       [ 3.45996483e+00],
       [-2.16734075e-01],
       [-1.98246260e+00],
       [-2.45782808e+00],
       [ 1.79234919e+00],
       [ 3.03844379e+00],
       [ 3.88840091e+00],
       [ 4.00012163e+00],
       [ 2.74716629e+00],
       [ 2.58914794e+00],
       [ 3.08685045e+00],
       [ 7.99429368e-01],
       [ 3.42293932e+00],
       [ 2.68742343e+00],
       [ 3.62626449e+00],
       [ 3.09282743e+00],
       [ 2.69315757e+00],
       [ 2.58174452e+00],
       [ 1.08250229e+00],
       [-1.87622980e+00],
       [-4.7

In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [107]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9122807017543859

## LLE

In [108]:
from sklearn.datasets import load_digits
from sklearn.manifold import LocallyLinearEmbedding

X, _ = load_digits(return_X_y=True)
print(X.shape)

embedding = LocallyLinearEmbedding(n_components=2)
X_transformed = embedding.fit_transform(X[:100])
print(X_transformed.shape)

(1797, 64)
(100, 2)


In [109]:
from sklearn.datasets import load_digits
from sklearn.decomposition import FastICA

X, _ = load_digits(return_X_y=True)
print(X.shape)
transformer = FastICA(n_components=7, random_state=0, whiten="unit-variance")
X_transformed = transformer.fit_transform(X)
X_transformed.shape

(1797, 64)


(1797, 7)