<a href="https://colab.research.google.com/github/hbisgin/BigDatav1/blob/main/Lecture13_MLlibIntro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook; getOrCreate() hands back an
# already-running session if there is one instead of starting a second.
spark = (
    SparkSession.builder
    .appName("StringIndexerExample")
    .getOrCreate()
)

# One hot encoder example
First create a dummy dataframe

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

# Start (or reuse) the Spark session used by the one-hot encoding example
spark = (
    SparkSession.builder
    .appName("OneHotEncodingExample")
    .getOrCreate()
)

# Toy dataset: a row id, one categorical column ("color") and one numeric column ("value")
data = [
    (0, "red", 1.0),
    (1, "blue", 2.0),
    (2, "green", 3.0),
    (3, "blue", 4.0),
    (4, "red", 5.0),
]
df = spark.createDataFrame(data, schema=["id", "color", "value"])
df.show()


+---+-----+-----+
| id|color|value|
+---+-----+-----+
|  0|  red|  1.0|
|  1| blue|  2.0|
|  2|green|  3.0|
|  3| blue|  4.0|
|  4|  red|  5.0|
+---+-----+-----+



# Build a pipeline step by step to handle the categorical variable, `color`

In [12]:


# Step 1 - StringIndexer: turn the string column "color" into a numeric index column.
# handleInvalid="keep" puts labels that were not seen during fit into an extra
# index instead of raising an error.
indexer = StringIndexer(
    inputCol="color",
    outputCol="colorIndex",
    handleInvalid="keep",
)
model = indexer.fit(df)            # learn the label -> index mapping from df
df_indexed = model.transform(df)   # append the colorIndex column
df_indexed.show()

+---+-----+-----+----------+
| id|color|value|colorIndex|
+---+-----+-----+----------+
|  0|  red|  1.0|       1.0|
|  1| blue|  2.0|       0.0|
|  2|green|  3.0|       2.0|
|  3| blue|  4.0|       0.0|
|  4|  red|  5.0|       1.0|
+---+-----+-----+----------+



In [13]:
# Step 2 - OneHotEncoder: turn the numeric index column into a one-hot (sparse) vector.
#   dropLast=True          -> the default; the last category gets no slot of its own,
#                             which avoids full collinearity
#   handleInvalid="error"  -> raise on unseen/invalid indices (the alternative is "keep")
# NOTE(review): the recorded output shows size-3 vectors with a slot for each of the
# three colors; presumably the slot that gets dropped is the extra "unseen label"
# index reserved by the StringIndexer's handleInvalid="keep" -- confirm.
encoder = OneHotEncoder(
    inputCols=["colorIndex"],
    outputCols=["colorVec"],
    dropLast=True,
    handleInvalid="error",
)
model = encoder.fit(df_indexed)
df_encoded = model.transform(df_indexed)
df_encoded.show()

+---+-----+-----+----------+-------------+
| id|color|value|colorIndex|     colorVec|
+---+-----+-----+----------+-------------+
|  0|  red|  1.0|       1.0|(3,[1],[1.0])|
|  1| blue|  2.0|       0.0|(3,[0],[1.0])|
|  2|green|  3.0|       2.0|(3,[2],[1.0])|
|  3| blue|  4.0|       0.0|(3,[0],[1.0])|
|  4|  red|  5.0|       1.0|(3,[1],[1.0])|
+---+-----+-----+----------+-------------+



Instead of storing every value in the vector, we only store:

- the length of the vector,

- the indices where non-zero elements appear,

- and the values at those positions.

This is called a sparse vector.

🔹 Example 1: simple one-hot vector

For [1, 0, 0]
We can write it as:

```(3, [0], [1.0])```

In [14]:
# Step 3 (optional) - VectorAssembler: concatenate the one-hot vector and the
# numeric "value" column into a single "features" vector column.
assembler = VectorAssembler(inputCols=["colorVec", "value"], outputCol="features")
assembled = assembler.transform(df_encoded)
assembled.show()

+---+-----+-----+----------+-------------+-----------------+
| id|color|value|colorIndex|     colorVec|         features|
+---+-----+-----+----------+-------------+-----------------+
|  0|  red|  1.0|       1.0|(3,[1],[1.0])|[0.0,1.0,0.0,1.0]|
|  1| blue|  2.0|       0.0|(3,[0],[1.0])|[1.0,0.0,0.0,2.0]|
|  2|green|  3.0|       2.0|(3,[2],[1.0])|[0.0,0.0,1.0,3.0]|
|  3| blue|  4.0|       0.0|(3,[0],[1.0])|[1.0,0.0,0.0,4.0]|
|  4|  red|  5.0|       1.0|(3,[1],[1.0])|[0.0,1.0,0.0,5.0]|
+---+-----+-----+----------+-------------+-----------------+



Let's do it all at once with the help of a `Pipeline`.

In [15]:
# Chain the three stages defined in the previous cells (StringIndexer ->
# OneHotEncoder -> VectorAssembler) so they are fitted and applied in one go.
pipeline = Pipeline(stages=[indexer, encoder, assembler])

# Fit every stage on df, then run df through the fitted pipeline
model = pipeline.fit(df)
df_transformed = model.transform(df)

df_transformed.select("id", "color", "colorIndex", "colorVec", "features").show(truncate=False)

# NOTE: the Spark session is deliberately NOT stopped here. The cells that follow
# (VectorIndexer, RFormula, logistic regression) still use `spark` and the
# DataFrames created from it; call spark.stop() only in the notebook's last cell.

+---+-----+----------+-------------+-----------------+
|id |color|colorIndex|colorVec     |features         |
+---+-----+----------+-------------+-----------------+
|0  |red  |1.0       |(3,[1],[1.0])|[0.0,1.0,0.0,1.0]|
|1  |blue |0.0       |(3,[0],[1.0])|[1.0,0.0,0.0,2.0]|
|2  |green|2.0       |(3,[2],[1.0])|[0.0,0.0,1.0,3.0]|
|3  |blue |0.0       |(3,[0],[1.0])|[1.0,0.0,0.0,4.0]|
|4  |red  |1.0       |(3,[1],[1.0])|[0.0,1.0,0.0,5.0]|
+---+-----+----------+-------------+-----------------+



# VectorIndexer Example



`VectorIndexer`:

- Automatically detects categorical features.
- Encodes them as category indices under the hood.
- Works seamlessly with downstream ML models.

    

In [None]:
# VectorIndexer is not among the classes imported in the earlier import cell, so
# import it here; without this the cell fails with a NameError.
from pyspark.ml.feature import VectorIndexer

# Automatically identify the categorical features inside the "features" vector
# and re-encode them as category indices.
# NOTE: this rebinds `indexer` (previously the StringIndexer from step 1).
indexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=2  # features with <= 2 distinct values are treated as categorical
)

# Fit the indexer: scans the data to decide which vector slots are categorical
indexer_model = indexer.fit(assembled)

# Transform the data: adds the "indexedFeatures" column
indexed = indexer_model.transform(assembled)

indexed.select("features", "indexedFeatures").show(truncate=False)


# RFormula

In [29]:

# Small labelled dataset for the RFormula example:
# one categorical predictor, two numeric predictors and a binary label.
data = [
    ("red", 1.0, 10.0, 0.0),
    ("blue", 2.0, 20.0, 1.0),
    ("green", 3.0, 30.0, 0.0),
    ("red", 4.0, 40.0, 1.0),
]

drf = spark.createDataFrame(data, schema=["color", "value1", "value2", "label"])
drf.show()


+-----+------+------+-----+
|color|value1|value2|label|
+-----+------+------+-----+
|  red|   1.0|  10.0|  0.0|
| blue|   2.0|  20.0|  1.0|
|green|   3.0|  30.0|  0.0|
|  red|   4.0|  40.0|  1.0|
+-----+------+------+-----+



In [33]:
from pyspark.ml.feature import RFormula

# R-style model formula:
#   label ~ .        -> predict "label" from all the other columns
#   + color:value1   -> additionally include the color x value1 interaction
formula = RFormula(formula="label ~ . + color:value1")

model = formula.fit(drf)
output = model.transform(drf)   # appends the assembled "features" vector column
output.show()

+-----+------+------+-----+--------------------+
|color|value1|value2|label|            features|
+-----+------+------+-----+--------------------+
|  red|   1.0|  10.0|  0.0|[1.0,0.0,1.0,10.0...|
| blue|   2.0|  20.0|  1.0|[0.0,1.0,2.0,20.0...|
|green|   3.0|  30.0|  0.0|(7,[2,3,6],[3.0,3...|
|  red|   4.0|  40.0|  1.0|[1.0,0.0,4.0,40.0...|
+-----+------+------+-----+--------------------+



# Sample data

In [23]:
!wget https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/sample_libsvm_data.txt

--2025-10-07 16:29:55--  https://raw.githubusercontent.com/databricks/Spark-The-Definitive-Guide/refs/heads/master/data/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt’


2025-10-07 16:29:55 (9.18 MB/s) - ‘sample_libsvm_data.txt’ saved [104736/104736]



In [24]:
import os

# The download cell saved the file in the notebook's working directory. Resolve
# the path from there rather than hard-coding Colab's "/content", so the cell
# also works outside Colab (in Colab this still yields /content/sample_libsvm_data.txt).
libsvm_path = os.path.abspath("sample_libsvm_data.txt")
sdf = spark.read.format("libsvm").load(libsvm_path)

In [25]:
# Preview the loaded data: first 20 rows, long cells truncated (the show() defaults)
sdf.show(n=20, truncate=True)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



Import the `LogisticRegression` class, then define (instantiate) the logistic regression estimator.



In [36]:
from pyspark.ml.classification import LogisticRegression

# Configure the estimator; nothing is trained until .fit() is called.
lr = LogisticRegression(
    # column containing the true labels
    labelCol="label",
    # column containing the feature vectors
    featuresCol="features",
    # number of optimization iterations
    maxIter=10,
    # regularization strength
    regParam=0.3,
    # ElasticNet mixing parameter (0 = pure L2, 1 = pure L1)
    elasticNetParam=0.8,
)

In [37]:
# explainParams() returns one long string with embedded newlines; print it so
# each parameter appears on its own line instead of a raw repr full of "\n".
print(lr.explainParams())

"aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)\nelasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)\nfamily: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)\nfeaturesCol: features column name. (default: features, current: features)\nfitIntercept: whether to fit an intercept term. (default: True)\nlabelCol: label column name. (default: label, current: label)\nlowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)\nlowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constra

In [35]:

# Train: fit the logistic regression estimator on the LIBSVM sample data
lr_model = lr.fit(sdf)

# Inspect what was learned
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

# Score the same DataFrame and show the first five predictions with their probabilities
predictions = lr_model.transform(sdf)
(
    predictions
    .select("label", "prediction", "probability")
    .show(5, truncate=False)
)


Coefficients: (692,[272,300,323,350,351,378,379,405,406,407,428,433,434,435,455,456,461,462,483,484,489,490,496,511,512,517,539,540,568],[-7.520689871384125e-05,-8.115773146847006e-05,3.814692771846427e-05,0.0003776490540424338,0.0003405148366194403,0.0005514455157343107,0.0004085386116096912,0.0004197467332749452,0.0008119171358670031,0.000502770837266875,-2.3929260406600902e-05,0.0005745048020902297,0.0009037546426803677,7.818229700243899e-05,-2.1787551952911914e-05,-3.402165821789542e-05,0.0004966517360637633,0.0008190557828370372,-8.017982139522613e-05,-2.743169403783527e-05,0.0004810832226238988,0.0004840801762677878,-8.926472920009901e-06,-0.00034148812330427297,-8.950592574121382e-05,0.00048645469116892156,-8.478698005186097e-05,-0.00042347832158317705,-7.296535777631246e-05])
Intercept: -0.5991460286401438
+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+------------------------------------