<a href="https://colab.research.google.com/github/hbisgin/BigDatav1/blob/main/Lecture12_MLlibIntro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame and ML APIs used throughout this notebook.
# getOrCreate() hands back the already-running session when there is one.
spark = (
    SparkSession.builder
    .appName("StringIndexerExample")
    .getOrCreate()
)

# Data Types - Vectors

In [10]:
from pyspark.ml.linalg import Vectors

# Dense vector: every entry is stored explicitly, zeros included.
dense_vec = Vectors.dense([1.0, 0.0, 3.0])

# Sparse vector of size 5: only the non-zero entries are stored,
# given here as an {index: value} mapping.
sparse_vec = Vectors.sparse(5, {0: 1.0, 3: 2.5})

# Display both vectors as the cell output.
(dense_vec, sparse_vec)

(DenseVector([1.0, 0.0, 3.0]), SparseVector(5, {0: 1.0, 3: 2.5}))

# Data Types - Labeled Points

In [13]:
# NOTE: LabeledPoint belongs to the RDD-based pyspark.mllib API, so the
# matching pyspark.mllib Vectors is imported here. From this cell on, the
# name `Vectors` refers to the mllib class, not the pyspark.ml.linalg one
# imported earlier in the notebook.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# A labeled point pairs a numeric label with a feature vector.
point = LabeledPoint(
    label=1.0,
    features=Vectors.dense([2.0, 0.0, 1.0]),
)
print(point)

(1.0,[2.0,0.0,1.0])


# Data Types - Matrix

In [17]:
from pyspark.ml.linalg import Matrices

# Dense 3 x 2 matrix. Values are listed column by column (column-major),
# so the first column is (1, 3, 5) and the second is (2, 4, 6).
dense = Matrices.dense(
    numRows=3,
    numCols=2,
    values=[1.0, 3.0, 5.0, 2.0, 4.0, 6.0],
)

# Sparse 3 x 3 matrix in compressed-sparse-column form:
#   colPtrs[j]..colPtrs[j+1] delimits the entries of column j,
#   rowIndices gives the row of each stored value.
# Here column 0 holds 1.0 at row 0, column 1 holds 2.0 and 3.0 at rows 1 and 2,
# and column 2 is empty.
sparse = Matrices.sparse(
    numRows=3,
    numCols=3,
    colPtrs=[0, 1, 3, 3],
    rowIndices=[0, 1, 2],
    values=[1.0, 2.0, 3.0],
)

# The trailing False in each printed matrix is the isTransposed flag.
dense, sparse


(DenseMatrix(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0], False),
 SparseMatrix(3, 3, [0, 1, 3, 3], [0, 1, 2], [1.0, 2.0, 3.0], False))

In [None]:
from pyspark.ml.feature import StringIndexer

# Toy data: "a" occurs three times, "c" twice and "b" once.
df = spark.createDataFrame(
    data=[(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
    schema=["id", "category"],
)

# Map each category string to a numeric index. With the default ordering the
# most frequent label receives index 0.0 (here: a -> 0.0, c -> 1.0, b -> 2.0).
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")

# fit() learns the label-to-index mapping; transform() appends the new column.
indexed = indexer.fit(df).transform(df)
indexed.show()

+---+--------+-------------+
| id|category|categoryIndex|
+---+--------+-------------+
|  0|       a|          0.0|
|  1|       b|          2.0|
|  2|       c|          1.0|
|  3|       a|          0.0|
|  4|       a|          0.0|
|  5|       c|          1.0|
+---+--------+-------------+



# Sample DataFrame

In [4]:

# Column names, in the same order as the values of each row tuple below.
columns = ["id", "feature1", "feature2", "label"]

# Three example rows: an id, two numeric features and a 0/1 label.
data = [
    (1, 10.0, 20.0, 1.0),
    (2, 5.0, 10.0, 0.0),
    (3, 3.0, 15.0, 1.0),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+---+--------+--------+-----+
| id|feature1|feature2|label|
+---+--------+--------+-----+
|  1|    10.0|    20.0|  1.0|
|  2|     5.0|    10.0|  0.0|
|  3|     3.0|    15.0|  1.0|
+---+--------+--------+-----+



# Combine feature1 and feature2 into a single "features" vector


In [6]:
# Import VectorAssembler in this cell: elsewhere in the notebook it is only
# imported by a later cell, so running the notebook top to bottom on a fresh
# kernel would otherwise fail here with a NameError.
from pyspark.ml.feature import VectorAssembler

# Merge the numeric input columns into a single vector column "features".
assembler = VectorAssembler(
    inputCols=["feature1", "feature2"],
    outputCol="features"
)

# transform() returns a new DataFrame with the "features" column appended;
# VectorAssembler needs no fit() step.
output = assembler.transform(df)

output.select("id", "features", "label").show(truncate=False)


+---+-----------+-----+
|id |features   |label|
+---+-----------+-----+
|1  |[10.0,20.0]|1.0  |
|2  |[5.0,10.0] |0.0  |
|3  |[3.0,15.0] |1.0  |
+---+-----------+-----+



# VectorIndexer Example

In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, VectorIndexer

# Obtain the Spark session; getOrCreate() reuses a running session if present.
spark = (
    SparkSession.builder
    .appName("VectorIndexerExample")
    .getOrCreate()
)

# Sample data: feature1 takes three distinct values (0.0, 1.0, 2.0),
# while feature2 takes only two (0.1, 0.2).
columns = ["id", "feature1", "feature2", "label"]
data = [
    (0, 1.0, 0.1, -1.0),
    (1, 0.0, 0.2, -1.0),
    (2, 1.0, 0.1, 1.0),
    (3, 2.0, 0.2, 1.0),
    (4, 2.0, 0.2, -1.0),
]
df = spark.createDataFrame(data, schema=columns)

# Pack the two feature columns into one vector column, the input format
# VectorIndexer operates on.
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")
assembled = assembler.transform(df)

assembled.show(truncate=False)


+---+--------+--------+-----+---------+
|id |feature1|feature2|label|features |
+---+--------+--------+-----+---------+
|0  |1.0     |0.1     |-1.0 |[1.0,0.1]|
|1  |0.0     |0.2     |-1.0 |[0.0,0.2]|
|2  |1.0     |0.1     |1.0  |[1.0,0.1]|
|3  |2.0     |0.2     |1.0  |[2.0,0.2]|
|4  |2.0     |0.2     |-1.0 |[2.0,0.2]|
+---+--------+--------+-----+---------+





`VectorIndexer`:

- Automatically detects categorical features.
- Encodes them as category indices under the hood.
- Works seamlessly with downstream ML models.

    

In [8]:
# Configure the indexer: any feature with at most `maxCategories` distinct
# values is treated as categorical and re-encoded as indices 0.0, 1.0, ...
# With maxCategories=2, feature2 (two distinct values) is indexed, whereas
# feature1 (three distinct values) is left unchanged as a continuous feature.
indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=2)

# fit() scans the data to decide which features are categorical.
indexer_model = indexer.fit(assembled)

# transform() appends the "indexedFeatures" vector column.
indexed = indexer_model.transform(assembled)

# Show the original and indexed vectors side by side.
indexed.select("features", "indexedFeatures").show(truncate=False)


+---------+---------------+
|features |indexedFeatures|
+---------+---------------+
|[1.0,0.1]|[1.0,0.0]      |
|[0.0,0.2]|[0.0,1.0]      |
|[1.0,0.1]|[1.0,0.0]      |
|[2.0,0.2]|[2.0,1.0]      |
|[2.0,0.2]|[2.0,1.0]      |
+---------+---------------+

