In [72]:
# Display the object bound to `sc` as this cell's output
# (presumably the SparkContext pre-created by the PySpark kernel — TODO confirm).
sc

In [73]:
# Obtain a Spark session (creating one if none exists) under the app name 'deep_learning'.
spark = (
    SparkSession.builder
    .appName('deep_learning')
    .getOrCreate()
)

In [74]:
# Prerequisite: fer2013.csv must already be uploaded to the 'user1' folder on Hadoop.
#
# The schema is declared explicitly instead of using inferSchema=True, so Spark
# does not make an extra pass over the file just to work out the column types.
# The declared types match what inference produced for this file:
# emotion -> integer, pixels -> string, Usage -> string.
FER2013_SCHEMA = "emotion INT, pixels STRING, Usage STRING"

df = spark.read.csv('/user1/fer2013.csv', header=True, schema=FER2013_SCHEMA)

In [75]:
# Print each column's name, data type and nullability for the loaded DataFrame
df.printSchema()

root
 |-- emotion: integer (nullable = true)
 |-- pixels: string (nullable = true)
 |-- Usage: string (nullable = true)



In [76]:
# Preview the first five rows of the raw data
df.show(n=5)

+-------+--------------------+--------+
|emotion|              pixels|   Usage|
+-------+--------------------+--------+
|      0|70 80 82 72 58 58...|Training|
|      0|151 150 147 155 1...|Training|
|      2|231 212 156 164 1...|Training|
|      4|24 32 36 30 32 23...|Training|
|      6|4 0 0 0 0 0 0 0 0...|Training|
+-------+--------------------+--------+
only showing top 5 rows



In [77]:
# Report the dataset size as (number of rows, number of columns)
dataset_shape = (df.count(), len(df.columns))
print('Shape of dataset:', dataset_shape)

Shape of dataset: (35887, 3)


In [78]:
# Summary statistics for every column. Only the numeric `emotion` column gets a
# meaningful mean/stddev; the string columns report count and lexicographic min/max.
df_summary = df.describe()
df_summary.show()



+-------+------------------+--------------------+-----------+
|summary|           emotion|              pixels|      Usage|
+-------+------------------+--------------------+-----------+
|  count|             35887|               35887|      35887|
|   mean|3.3232646919497313|                null|       null|
| stddev|1.8738187592999593|                null|       null|
|    min|                 0|0 0 0 0 0 0 0 0 0...|PrivateTest|
|    max|                 6|99 99 99 99 101 1...|   Training|
+-------+------------------+--------------------+-----------+



                                                                                

In [79]:
# List the column names of df
df.columns

['emotion', 'pixels', 'Usage']

In [80]:
# Collect the distinct values of the Usage column and display them.
# NOTE(review): the variable name is misspelled ("uniqe"); it is kept as-is here
# because other cells may reference it.
uniqe_usages = (
    df
    .select("Usage")
    .distinct()
)
uniqe_usages.show()

+-----------+
|      Usage|
+-----------+
|   Training|
| PublicTest|
|PrivateTest|
+-----------+



In [81]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Lookup table: integer emotion code -> human-readable label
emotion_mapping = {
    0: "Angry",
    1: "Disgust",
    2: "Fear",
    3: "Happy",
    4: "Sad",
    5: "Surprise",
    6: "Neutral",
}


@udf(StringType())
def emotion_to_label_udf(emotion):
    """Translate an emotion code into its label; codes missing from the table map to "Unknown"."""
    return emotion_mapping.get(emotion, "Unknown")


# Derive the 'label' column from the 'emotion' column
df = df.withColumn("label", emotion_to_label_udf(df["emotion"]))

In [82]:
# Turn the space-separated pixel string into an array of integers.
# withColumn already names the result "pixels", so no extra alias is needed.
pixel_tokens = F.split(df["pixels"], " ")
df = df.withColumn("pixels", pixel_tokens.cast("array<int>"))

In [83]:
df.printSchema()

root
 |-- emotion: integer (nullable = true)
 |-- pixels: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- Usage: string (nullable = true)
 |-- label: string (nullable = true)



In [84]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, FloatType

# Divisor used to scale pixel intensities (presumably 8-bit values, 0-255 — TODO confirm)
MAX_PIXEL_VALUE = 255.0


@udf(ArrayType(FloatType()))
def normalize_pixels(pixels):
    """Divide every pixel intensity in the array by 255.0.

    Args:
        pixels: the row's `pixels` array of integers, or None for a null cell.

    Returns:
        A list of floats (pixel / 255.0) with null elements preserved as None,
        or None when the whole array is null.
    """
    # The schema reports the column as nullable with containsNull = true, so
    # guard both levels; an unguarded None would raise TypeError inside the
    # Python worker and fail the whole Spark job.
    if pixels is None:
        return None
    return [
        None if pixel is None else float(pixel) / MAX_PIXEL_VALUE
        for pixel in pixels
    ]


df = df.withColumn("pixels", normalize_pixels(df["pixels"]))

In [85]:
# Preview df after the `label` column was added and `pixels` was normalized
df.show()

+-------+--------------------+--------+--------+
|emotion|              pixels|   Usage|   label|
+-------+--------------------+--------+--------+
|      0|[0.27450982, 0.31...|Training|   Angry|
|      0|[0.5921569, 0.588...|Training|   Angry|
|      2|[0.90588236, 0.83...|Training|    Fear|
|      4|[0.09411765, 0.12...|Training|     Sad|
|      6|[0.015686275, 0.0...|Training| Neutral|
|      2|[0.21568628, 0.21...|Training|    Fear|
|      4|[0.078431375, 0.0...|Training|     Sad|
|      3|[0.3019608, 0.305...|Training|   Happy|
|      3|[0.33333334, 0.32...|Training|   Happy|
|      2|[1.0, 0.99607843,...|Training|    Fear|
|      0|[0.11764706, 0.09...|Training|   Angry|
|      6|[0.15294118, 0.29...|Training| Neutral|
|      6|[0.85882354, 0.83...|Training| Neutral|
|      6|[0.5803922, 0.564...|Training| Neutral|
|      3|[0.015686275, 0.0...|Training|   Happy|
|      5|[0.41960785, 0.41...|Training|Surprise|
|      3|[0.05490196, 0.05...|Training|   Happy|
|      2|[1.0, 1.0, 

In [88]:
# Pull the pixel array of the first row and report how many values it holds
# (2304 would correspond to a 48x48 image — TODO confirm the image dimensions).
sample_pixels = df.select("pixels").head()["pixels"]
pixel_count = len(sample_pixels)
print(pixel_count)

2304


In [89]:
# Preview the first five rows of the processed DataFrame
df.show(n=5)

+-------+--------------------+--------+-------+
|emotion|              pixels|   Usage|  label|
+-------+--------------------+--------+-------+
|      0|[0.27450982, 0.31...|Training|  Angry|
|      0|[0.5921569, 0.588...|Training|  Angry|
|      2|[0.90588236, 0.83...|Training|   Fear|
|      4|[0.09411765, 0.12...|Training|    Sad|
|      6|[0.015686275, 0.0...|Training|Neutral|
+-------+--------------------+--------+-------+
only showing top 5 rows



In [86]:
# Partition the rows by their Usage value:
#   Training -> train_df, PrivateTest -> test_df, PublicTest -> val_df
usage_col = df["Usage"]
train_df = df.where(usage_col == "Training")
test_df = df.where(usage_col == "PrivateTest")
val_df = df.where(usage_col == "PublicTest")

In [87]:
# Report (row count, column count) for each split, in the same order as before
split_reports = (
    ('Shape of train_df:', train_df),
    ('Shape of test_df:', test_df),
    ('Shape of val_df:', val_df),
)
for caption, split in split_reports:
    print(caption, (split.count(), len(split.columns)))

Shape of train_df: (28709, 4)
Shape of test_df: (3589, 4)
Shape of val_df: (3589, 4)
