# Dimensionality reduction using Databricks

In [3]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 316.9/316.9 MB 4.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=fe9f230990cd378bc7ca188159fcfe534589d98c07ca71026a45886c8ae2fe32
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [15]:
# Import necessary libraries
from pyspark.ml.feature import PCA
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
from sklearn import datasets


In [5]:
# Start (or reuse) the Spark session that the rest of the notebook works through
session_builder = SparkSession.builder.appName("dimensionalityReduction")
spark = session_builder.getOrCreate()

In [8]:
# Load the Iris dataset: scikit-learn's feature matrix is converted to plain
# Python lists before being handed to Spark, with one named column per feature.
# (`datasets` is imported with the other libraries in the import cell.)
IRIS_FEATURE_COLUMNS = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
iris_df = spark.createDataFrame(datasets.load_iris().data.tolist(), IRIS_FEATURE_COLUMNS)

In [9]:
# Pack every column of iris_df into a single vector column named "features",
# which is the input format the PCA stage reads from
feature_columns = iris_df.columns
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
iris_features = assembler.transform(iris_df)

In [10]:
# Configure PCA: read the "features" vectors, keep k=2 principal components,
# and write the projection to a new "pcaFeatures" column
pca = PCA(
    k=2,
    inputCol="features",
    outputCol="pcaFeatures",
)

In [11]:
# Learn the principal components from the assembled feature vectors
model = pca.fit(dataset=iris_features)

In [12]:
# Project each row onto the fitted components, then keep only the reduced vectors
projected = model.transform(iris_features)
result = projected.select("pcaFeatures")


In [13]:
# Print the first 20 projected rows in full (no truncation of long vector values)
result.show(n=20, truncate=False)


+-----------------------------------------+
|pcaFeatures                              |
+-----------------------------------------+
|[-2.8182395066394674,-5.6463498234127965]|
|[-2.788223445314678,-5.149951351762915]  |
|[-2.613374563549707,-5.182003150742138]  |
|[-2.757022276967594,-5.00865359757578]   |
|[-2.7736485960544734,-5.653707089762616] |
|[-3.221505499764511,-6.06828302589061]   |
|[-2.681827381868395,-5.237491192299126]  |
|[-2.8762201594623704,-5.490337536526024] |
|[-2.6159824008284502,-4.748640822640992] |
|[-2.8296093347880493,-5.213178330953578] |
|[-2.9954180419571474,-5.97202147547627]  |
|[-2.8896099017002808,-5.341682515989071] |
|[-2.716255866420986,-5.091840576625977]  |
|[-2.278561388743351,-4.815557989821313]  |
|[-2.8576147426669736,-6.505717213265271] |
|[-3.11632609907787,-6.665014907228348]   |
|[-2.8788372573845766,-6.137632091008953] |
|[-2.854068426354622,-5.63880172142105]   |
|[-3.3025448089914233,-6.1997916157899065]|
|[-2.914378732730235,-5.84051288

In [14]:
# Stop the Spark session now that the results have been displayed.
# Re-run the session-initialization cell before executing any Spark code again.
spark.stop()