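### One-Hot Encoding

Index the categorical `Gender` and `Treatment` columns of the cohort table, one-hot encode them, and save the result as Parquet.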

In [1]:
# !sudo apt update
# !sudo apt install openjdk-17-jre-headless -y
import pyspark
from pyspark import pandas as ps
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import OneHotEncoder, StringIndexer



In [2]:
# Spark session for this notebook: run locally on all available cores under
# the project's application name, with an explicit shuffle-partition count.
spark_settings = [
    ('spark.master', 'local[*]'),
    ('spark.app.name', 'Glucose_Analysis_Spark'),
]
conf = SparkConf().setAll(spark_settings).set('spark.sql.shuffle.partitions', '1500')

# getOrCreate() reuses an already-running session if the kernel has one.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/26 19:09:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the cohort table. The first line is the header and column types are
# inferred; the file's leading unnamed column is read in as `_c0`.
cohort_reader = (
    spark.read
    .option('header', 'True')
    .option('inferSchema', 'True')
    .option('delimiter', ',')
)
df = cohort_reader.csv('/cephfs/data/cohort.csv')

In [4]:
# Peek at the first two rows to check how the CSV was parsed.
df.show(n=2)

23/04/26 19:10:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , UserId, Gender, DOB, Age, DiabetesType, Treatment
 Schema: _c0, UserId, Gender, DOB, Age, DiabetesType, Treatment
Expected: _c0 but found: 
CSV file: file:///cephfs/data/cohort.csv
+---+--------------------+------+-------------------+---+------------+---------+
|_c0|              UserId|Gender|                DOB|Age|DiabetesType|Treatment|
+---+--------------------+------+-------------------+---+------------+---------+
|  0|5lZPrCk6qk8L6Jw+S...|Female|1931-01-01 00:00:00| 92|    type-two|       no|
|  1|9qY9mZ+GV5Kd/O/NB...|  Male|1937-01-01 00:00:00| 86|    type-two|       no|
+---+--------------------+------+-------------------+---+------------+---------+
only showing top 2 rows



In [5]:
# Categorical columns to index. 'DiabetesType' is skipped because every row
# is type-two, so it carries no information.
encodedCols = ['Gender', 'Treatment']
indexedCols = [name + '_Num' for name in encodedCols]

# Remove index columns left behind by an earlier run of this cell so it can be
# re-executed without an "output column already exists" error. Dropping a
# column that is not present is a no-op, so a first run is unaffected.
df = df.drop(*indexedCols)

# A single multi-column StringIndexer fits every column in one pass instead of
# launching a separate fit per column.
indexer = StringIndexer(inputCols=encodedCols, outputCols=indexedCols)
indexer_fitted = indexer.fit(df)

# Keep [column name, labels] pairs; a label's position in the list is the
# numeric index written to the matching *_Num column.
encodedLabels = [
    [name, list(labels)]
    for name, labels in zip(encodedCols, indexer_fitted.labelsArray)
]

df = indexer_fitted.transform(df)

In [6]:
# Display the label list recorded for each indexed column; a label's position
# in its list is the value stored in the corresponding *_Num column.
encodedLabels

[['Gender', ['Female', 'Male']],
 ['Treatment', ['yes-both', 'yes-long-acting', 'no', 'yes-fast-acting']]]

In [7]:
# Confirm the *_Num index columns were appended.
df.show(n=2)

23/04/26 19:10:02 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , UserId, Gender, DOB, Age, DiabetesType, Treatment
 Schema: _c0, UserId, Gender, DOB, Age, DiabetesType, Treatment
Expected: _c0 but found: 
CSV file: file:///cephfs/data/cohort.csv
+---+--------------------+------+-------------------+---+------------+---------+----------+-------------+
|_c0|              UserId|Gender|                DOB|Age|DiabetesType|Treatment|Gender_Num|Treatment_Num|
+---+--------------------+------+-------------------+---+------------+---------+----------+-------------+
|  0|5lZPrCk6qk8L6Jw+S...|Female|1931-01-01 00:00:00| 92|    type-two|       no|       0.0|          2.0|
|  1|9qY9mZ+GV5Kd/O/NB...|  Male|1937-01-01 00:00:00| 86|    type-two|       no|       1.0|          2.0|
+---+--------------------+------+-------------------+---+------------+---------+----------+-------------+
only showing top 2 rows



In [8]:
# Indexed input columns and the one-hot vector columns derived from them.
ohe_inputs = ['Gender_Num', 'Treatment_Num']
ohe_outputs = ['Gender_Encoded', 'Treatment_Encoded']

# Remove vectors left behind by an earlier run of this cell so it can be
# re-executed; dropping a column that is not present is a no-op.
df = df.drop(*ohe_outputs)

# One multi-column encoder replaces the two copy-pasted single-column stanzas.
# dropLast=True encodes the last category of each column as an all-zero vector,
# so a column with k categories yields a vector of length k - 1.
ohe = OneHotEncoder(inputCols=ohe_inputs, outputCols=ohe_outputs, dropLast=True)
df = ohe.fit(df).transform(df)

In [9]:
# Inspect the encoded frame (first 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

23/04/26 19:10:03 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , UserId, Gender, DOB, Age, DiabetesType, Treatment
 Schema: _c0, UserId, Gender, DOB, Age, DiabetesType, Treatment
Expected: _c0 but found: 
CSV file: file:///cephfs/data/cohort.csv
+---+--------------------+------+-------------------+---+------------+---------+----------+-------------+--------------+-----------------+
|_c0|              UserId|Gender|                DOB|Age|DiabetesType|Treatment|Gender_Num|Treatment_Num|Gender_Encoded|Treatment_Encoded|
+---+--------------------+------+-------------------+---+------------+---------+----------+-------------+--------------+-----------------+
|  0|5lZPrCk6qk8L6Jw+S...|Female|1931-01-01 00:00:00| 92|    type-two|       no|       0.0|          2.0| (1,[0],[1.0])|    (3,[2],[1.0])|
|  1|9qY9mZ+GV5Kd/O/NB...|  Male|1937-01-01 00:00:00| 86|    type-two|       no|       1.0|          2.0|     (1,[],[])|    (3,[2],[1.0])|
|  2|uhsyLhr4Zl6NfGbNB...|Fem

In [12]:
# Persist the encoded cohort. The default save mode raises if the target path
# already exists, which breaks any re-run of the notebook; this file is fully
# derived from the cells above, so replacing it is safe.
df.write.mode('overwrite').parquet('/cephfs/data/cohort_encoded.parquet')

23/04/26 19:18:20 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , UserId, Gender, DOB, Age, DiabetesType, Treatment
 Schema: _c0, UserId, Gender, DOB, Age, DiabetesType, Treatment
Expected: _c0 but found: 
CSV file: file:///cephfs/data/cohort.csv


                                                                                