<a href="https://colab.research.google.com/github/hbisgin/BigDatav1/blob/main/Lecture14_PreprocessingFeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CONTINUOUS FEATURES, NORMALIZATION, & MORE ON CATEGORICAL VARIABLES

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (
    Bucketizer, QuantileDiscretizer, StandardScaler,
    MinMaxScaler, MaxAbsScaler, ElementwiseProduct,
    Normalizer, StringIndexer, IndexToString
)
from pyspark.ml.linalg import Vectors

# Start (or reuse) the Spark session used by the continuous-feature examples
spark = (
    SparkSession.builder
    .appName("ContinuousFeaturesExample")
    .getOrCreate()
)

## 1. Continuous Features Example


In [2]:

# Toy dataset: a single double-typed "id" column holding 0.0 .. 19.0
contDF = spark.range(0, 20).selectExpr("cast(id as double)")

# --- Manual bucketing: the bucket edges are supplied by hand ---
# Five edges define four buckets; a value equal to an edge lands in the bucket
# that starts there (5.0 -> bucket 1, 10.0 -> bucket 2 in the output).
bucketBorders = [-1.0, 5.0, 10.0, 250.0, 600.0]
bucketer = Bucketizer(
    inputCol="id",
    outputCol="bucketed_id",
    splits=bucketBorders,
)
bucketedDF = bucketer.transform(contDF)
bucketedDF.show()

# --- Quantile bucketing: the edges are learned from the data during fit() ---
quantBucketer = QuantileDiscretizer(
    inputCol="id",
    outputCol="quantile_bucket",
    numBuckets=5,
)
fittedBucketer = quantBucketer.fit(contDF)
quantiledDF = fittedBucketer.transform(contDF)
quantiledDF.show()

+----+-----------+
|  id|bucketed_id|
+----+-----------+
| 0.0|        0.0|
| 1.0|        0.0|
| 2.0|        0.0|
| 3.0|        0.0|
| 4.0|        0.0|
| 5.0|        1.0|
| 6.0|        1.0|
| 7.0|        1.0|
| 8.0|        1.0|
| 9.0|        1.0|
|10.0|        2.0|
|11.0|        2.0|
|12.0|        2.0|
|13.0|        2.0|
|14.0|        2.0|
|15.0|        2.0|
|16.0|        2.0|
|17.0|        2.0|
|18.0|        2.0|
|19.0|        2.0|
+----+-----------+

+----+---------------+
|  id|quantile_bucket|
+----+---------------+
| 0.0|            0.0|
| 1.0|            0.0|
| 2.0|            0.0|
| 3.0|            1.0|
| 4.0|            1.0|
| 5.0|            1.0|
| 6.0|            1.0|
| 7.0|            2.0|
| 8.0|            2.0|
| 9.0|            2.0|
|10.0|            2.0|
|11.0|            3.0|
|12.0|            3.0|
|13.0|            3.0|
|14.0|            3.0|
|15.0|            4.0|
|16.0|            4.0|
|17.0|            4.0|
|18.0|            4.0|
|19.0|            4.0|
+----+--------

## 2. Scaling and Normalization Example

In [3]:
# Two 3-dimensional feature vectors shared by every scaler in this cell
data = [
    (0, Vectors.dense([1.0, 0.1, -1.0])),
    (1, Vectors.dense([3.0, 10.1, 3.0])),
]
scaleDF = spark.createDataFrame(data, ["id", "features"])

# StandardScaler: scale by the per-feature standard deviation (withStd=True)
# without centering on the mean (withMean=False).
# StandardScaler is already imported in the notebook's first cell, so it is not
# re-imported here.
sScaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_std",
    withMean=False,
    withStd=True,
)
scaledDF = sScaler.fit(scaleDF).transform(scaleDF)
scaledDF.show(truncate=False)

# MinMaxScaler: rescale each feature into the [min, max] range given here (0 to 1)
minMax = MinMaxScaler(
    inputCol="features",
    outputCol="scaled_minmax",
    min=0.0,
    max=1.0,
)
minmaxDF = minMax.fit(scaleDF).transform(scaleDF)
minmaxDF.show(truncate=False)

# MaxAbsScaler: fitted on the data, then applied to the same frame
maScaler = MaxAbsScaler(inputCol="features", outputCol="scaled_maxabs")
maxabsDF = maScaler.fit(scaleDF).transform(scaleDF)
maxabsDF.show(truncate=False)

# ElementwiseProduct: a plain transformer (no fit step) driven by a user-supplied
# scaling vector with one entry per feature
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct(
    scalingVec=scaleUpVec,
    inputCol="features",
    outputCol="scaled_custom",
)
scaledCustomDF = scalingUp.transform(scaleDF)
scaledCustomDF.show(truncate=False)

# Normalizer (Manhattan norm p=1): also a plain transformer, applied row by row
manhattan = Normalizer(inputCol="features", outputCol="norm_L1", p=1.0)
normalizedDF = manhattan.transform(scaleDF)
normalizedDF.show(truncate=False)


+---+--------------+-------------------------------------------------------------+
|id |features      |scaled_std                                                   |
+---+--------------+-------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.7071067811865475,0.01414213562373095,-0.35355339059327373]|
|1  |[3.0,10.1,3.0]|[2.1213203435596424,1.428355697996826,1.0606601717798212]    |
+---+--------------+-------------------------------------------------------------+

+---+--------------+-------------+
|id |features      |scaled_minmax|
+---+--------------+-------------+
|0  |[1.0,0.1,-1.0]|(3,[],[])    |
|1  |[3.0,10.1,3.0]|[1.0,1.0,1.0]|
+---+--------------+-------------+

+---+--------------+-------------------------------------------------------------+
|id |features      |scaled_maxabs                                                |
+---+--------------+-------------------------------------------------------------+
|0  |[1.0,0.1,-1.0]|[0.3333333333333333,0

## 3. StringIndexer + IndexToString Example


In [4]:
from pyspark.sql import Row

# Four rows with a single string column; "banana" appears twice
data = [Row(category="apple"), Row(category="banana"), Row(category="orange"), Row(category="banana")]
catDF = spark.createDataFrame(data)

# Convert text labels to indices.
# Fit once and keep the fitted model: it is needed both to transform the data
# and, further down, to recover the label list for the reverse mapping.
indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexerModel = indexer.fit(catDF)
indexedDF = indexerModel.transform(catDF)
indexedDF.show()

# Convert indices back to text labels.
# The labels come from the same fitted model that produced categoryIndex, so the
# index -> label mapping is guaranteed to match (and no second fit is triggered).
converter = IndexToString(
    inputCol="categoryIndex",
    outputCol="originalCategory",
    labels=indexerModel.labels,
)
convertedDF = converter.transform(indexedDF)
convertedDF.show()

# -----------------------------------------------
# Stop Spark
# -----------------------------------------------
# The next part of the notebook creates its own session with getOrCreate().
spark.stop()


+--------+-------------+
|category|categoryIndex|
+--------+-------------+
|   apple|          1.0|
|  banana|          0.0|
|  orange|          2.0|
|  banana|          0.0|
+--------+-------------+

+--------+-------------+----------------+
|category|categoryIndex|originalCategory|
+--------+-------------+----------------+
|   apple|          1.0|           apple|
|  banana|          0.0|          banana|
|  orange|          2.0|          orange|
|  banana|          0.0|          banana|
+--------+-------------+----------------+



# TEXT TRANSFORMATION + FEATURE SELECTION (Step-by-Step View)


In [5]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, NGram,
    CountVectorizer, ChiSqSelector
)

# Start the Spark session for the text-processing examples
# (the session used in the first part of the notebook was stopped with spark.stop())
spark = (
    SparkSession.builder
    .appName("TextFeatureSelectionSteps")
    .getOrCreate()
)

## 1. Sample dataset

In [6]:
# Labelled sample texts as (label, description) pairs
data = [
    (0, "Big data analytics with Spark and Hadoop"),
    (1, "Python for machine learning and data science"),
    (0, "Big data tools like Spark, Hive, and HDFS"),
    (1, "Deep learning using TensorFlow and PyTorch"),
]

# Column names are given positionally to match the tuple layout above
df = spark.createDataFrame(data, schema=["label", "Description"])

print("=== Original Data ===")
df.show(truncate=False)

=== Original Data ===
+-----+--------------------------------------------+
|label|Description                                 |
+-----+--------------------------------------------+
|0    |Big data analytics with Spark and Hadoop    |
|1    |Python for machine learning and data science|
|0    |Big data tools like Spark, Hive, and HDFS   |
|1    |Deep learning using TensorFlow and PyTorch  |
+-----+--------------------------------------------+



## 2. Tokenization

In [7]:
# Tokenizer is already imported in the text section's import cell.
# It lowercases the text and splits it into words; punctuation is not stripped,
# so tokens such as "spark," and "hive," keep their trailing comma (see output).
tokenizer = Tokenizer(inputCol="Description", outputCol="tokens")
tokenized = tokenizer.transform(df)
print("=== After Tokenization ===")
tokenized.select("Description", "tokens").show(truncate=False)

=== After Tokenization ===
+--------------------------------------------+----------------------------------------------------+
|Description                                 |tokens                                              |
+--------------------------------------------+----------------------------------------------------+
|Big data analytics with Spark and Hadoop    |[big, data, analytics, with, spark, and, hadoop]    |
|Python for machine learning and data science|[python, for, machine, learning, and, data, science]|
|Big data tools like Spark, Hive, and HDFS   |[big, data, tools, like, spark,, hive,, and, hdfs]  |
|Deep learning using TensorFlow and PyTorch  |[deep, learning, using, tensorflow, and, pytorch]   |
+--------------------------------------------+----------------------------------------------------+



## 3. StopWords Removal

In [8]:
# StopWordsRemover is already imported in the text section's import cell.
# Filter the token lists against Spark's built-in English stop-word list
# (words such as "with", "and" and "for" are dropped in the output).
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered",
    stopWords=StopWordsRemover.loadDefaultStopWords("english"),
)
filtered = remover.transform(tokenized)
print("=== After StopWords Removal ===")
filtered.select("tokens", "filtered").show(truncate=False)

=== After StopWords Removal ===
+----------------------------------------------------+---------------------------------------------+
|tokens                                              |filtered                                     |
+----------------------------------------------------+---------------------------------------------+
|[big, data, analytics, with, spark, and, hadoop]    |[big, data, analytics, spark, hadoop]        |
|[python, for, machine, learning, and, data, science]|[python, machine, learning, data, science]   |
|[big, data, tools, like, spark,, hive,, and, hdfs]  |[big, data, tools, like, spark,, hive,, hdfs]|
|[deep, learning, using, tensorflow, and, pytorch]   |[deep, learning, using, tensorflow, pytorch] |
+----------------------------------------------------+---------------------------------------------+



## 4. N-Gram Creation


In [9]:
# NGram is already imported in the text section's import cell.
# n=2 joins each pair of adjacent filtered tokens with a space to form bigrams.
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams")
ngramDF = ngram.transform(filtered)
print("=== After N-Gram Creation (Bigrams) ===")
ngramDF.select("filtered", "ngrams").show(truncate=False)

=== After N-Gram Creation (Bigrams) ===
+---------------------------------------------+-------------------------------------------------------------------------+
|filtered                                     |ngrams                                                                   |
+---------------------------------------------+-------------------------------------------------------------------------+
|[big, data, analytics, spark, hadoop]        |[big data, data analytics, analytics spark, spark hadoop]                |
|[python, machine, learning, data, science]   |[python machine, machine learning, learning data, data science]          |
|[big, data, tools, like, spark,, hive,, hdfs]|[big data, data tools, tools like, like spark,, spark, hive,, hive, hdfs]|
|[deep, learning, using, tensorflow, pytorch] |[deep learning, learning using, using tensorflow, tensorflow pytorch]    |
+---------------------------------------------+-----------------------------------------------------------

## 5. CountVectorizer


In [10]:
# CountVectorizer is already imported in the text section's import cell.
# Learn a vocabulary over the bigrams and encode each row as a sparse count vector.
cv = CountVectorizer(
    inputCol="ngrams",  # can also use "filtered"
    outputCol="countVec",
    vocabSize=1000,  # upper bound on vocabulary size; this corpus yields 17 terms
    minDF=1,
)
cvModel = cv.fit(ngramDF)
counted = cvModel.transform(ngramDF)

# Position i in the vocabulary list is the term behind index i of countVec;
# later cells use `vocab` to translate feature indices back into terms.
vocab = cvModel.vocabulary
print("=== Vocabulary ===")
for i, term in enumerate(vocab):
    print(f"{i}: {term}")

print("=== After CountVectorizer (Sparse Vectors) ===")
counted.select("ngrams", "countVec").show(truncate=False)

=== Vocabulary ===
0: big data
1: spark, hive,
2: like spark,
3: using tensorflow
4: spark hadoop
5: hive, hdfs
6: tensorflow pytorch
7: python machine
8: learning data
9: data tools
10: learning using
11: analytics spark
12: data science
13: machine learning
14: deep learning
15: data analytics
16: tools like
=== After CountVectorizer (Sparse Vectors) ===
+-------------------------------------------------------------------------+---------------------------------------------+
|ngrams                                                                   |countVec                                     |
+-------------------------------------------------------------------------+---------------------------------------------+
|[big data, data analytics, analytics spark, spark hadoop]                |(17,[0,4,11,15],[1.0,1.0,1.0,1.0])           |
|[python machine, machine learning, learning data, data science]          |(17,[7,8,12,13],[1.0,1.0,1.0,1.0])           |
|[big data, data tools, tools l

## 6. ChiSqSelector (Feature Selection)


In [11]:
# ChiSqSelector is already imported in the text section's import cell.
# NOTE(review): recent Spark releases mark ChiSqSelector as deprecated in favour
# of UnivariateFeatureSelector; confirm against the Spark version in use.
# Keep the 5 count-vector features the selector ranks highest against `label`.
chisq = ChiSqSelector(
    featuresCol="countVec",
    labelCol="label",
    outputCol="selectedFeatures",
    numTopFeatures=5,
)
chisq_model = chisq.fit(counted)
selected = chisq_model.transform(counted)

## 7. Viewing selected features and their indices

In [12]:
# Rows after feature selection: label plus the reduced feature vector
selected.select("label", "selectedFeatures").show(truncate=False)

# Indices of the features the fitted selector kept; the selector was fitted on
# countVec, so these index into the CountVectorizer vocabulary
selected_indices = chisq_model.selectedFeatures
print("=== Selected feature indices ===")
print(selected_indices)

# Translate each kept index back into its vocabulary term
selected_terms = [vocab[idx] for idx in selected_indices]
print("=== Selected feature names ===")
for idx, term in zip(selected_indices, selected_terms):
    print(f"{idx}: {term}")

+-----+-------------------------+
|label|selectedFeatures         |
+-----+-------------------------+
|0    |(5,[0,4],[1.0,1.0])      |
|1    |(5,[],[])                |
|0    |(5,[0,1,2],[1.0,1.0,1.0])|
|1    |(5,[3],[1.0])            |
+-----+-------------------------+

=== Selected feature indices ===
[0, 1, 2, 3, 4]
=== Selected feature names ===
0: big data
1: spark, hive,
2: like spark,
3: using tensorflow
4: spark hadoop
