# Install Pyspark

In [None]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark= SparkSession \
       .builder \
       .appName("Our First Spark Example") \
       .getOrCreate()

spark

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.83)] [Connecting to security.[0m                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 https://cli.github.com/packages stable InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,832 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,532 kB]
Get:10 http://archive.ubunt

# Create Single Column Spark Dataframe using List

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("SingleColumnDataFrame")
    .getOrCreate()
)

# The integers 1 through 10.
data = list(range(1, 11))

# A bare IntegerType as the schema gives a one-column DataFrame;
# the column comes out named "value".
demodf = spark.createDataFrame(data, schema=IntegerType())

# Print the rows.
demodf.show()


+-----+
|value|
+-----+
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+



In [None]:
# Same one-column frame, this time with the schema given as the type string 'int'.
int_schema = 'int'
df = spark.createDataFrame(data, schema=int_schema)
df.show()

+-----+
|value|
+-----+
|    1|
|    2|
|    3|
|    4|
|    5|
|    6|
|    7|
|    8|
|    9|
|   10|
+-----+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("SingleColumnDataFrame")
    .getOrCreate()
)

# A short list of strings to load.
data = ["apple", "banana", "carrot", "cow"]

# Build the one-column frame first, then rename its column to "fruits".
single_col = spark.createDataFrame(data, schema=StringType())
df = single_col.toDF("fruits")

# Print the rows.
df.show()


+------+
|fruits|
+------+
| apple|
|banana|
|carrot|
|   cow|
+------+



In [None]:
# Same strings, with the schema given as the type string 'string';
# without toDF() the column keeps the name "value".
string_schema = 'string'
df = spark.createDataFrame(data, schema=string_schema)
df.show()

+------+
| value|
+------+
| apple|
|banana|
|carrot|
|   cow|
+------+



In [None]:
from pyspark.sql import SparkSession

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("SingleColumnDataFrame")
    .getOrCreate()
)

# One single-element tuple per row.
data = [(value,) for value in (10, 20, 30, 40, 50)]

# No schema is passed, so the single column gets the positional name "_1".
df = spark.createDataFrame(data)

# Print the rows.
df.show()


+---+
| _1|
+---+
| 10|
| 20|
| 30|
| 40|
| 50|
+---+



In [None]:
from pyspark.sql import SparkSession

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("SingleColumnDataFrame")
    .getOrCreate()
)

# Values of several Python types, including a missing one.
data = [1, "two", 3.0, True, None]

# zip() over a single list yields one 1-tuple per element, i.e. one row each.
rows = list(zip(data))

# Build the frame, then name its only column "MixedValues".
df = spark.createDataFrame(rows).toDF("MixedValues")

# Print the rows.
df.show()


+-----------+
|MixedValues|
+-----------+
|          1|
|        two|
|        3.0|
|       true|
|       NULL|
+-----------+



In PySpark, `IntegerType` and `StringType` are data types used to define the schema of DataFrames, which represent structured data. Understanding these types is crucial for working effectively with PySpark DataFrames, as they help ensure that data is interpreted correctly during processing.

### 1. **`IntegerType`**

#### Explanation:
- **`IntegerType`** is used to represent integer values in a DataFrame.
- It maps to the integer data type, which means it can store whole numbers (both positive and negative) within a specific range (typically -2^31 to 2^31-1).

#### Significance:
- **Precision**: Ensures that the data is stored and processed as integers, which is important for operations that require numeric precision, like mathematical calculations or aggregations.
- **Optimization**: DataFrames that use `IntegerType` can benefit from optimized performance, as integers are simpler to store and manipulate compared to floating-point numbers or strings.

#### Uses:
- **Numeric Columns**: Use `IntegerType` for columns that represent whole numbers, such as age, count, or any other numeric fields without decimals.
- **Aggregations**: Integer columns aggregate exactly, with no floating-point rounding, for operations like `sum`, `min`, and `max`. Note that `avg` on an `IntegerType` column still returns a double.

#### Example:
```python
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName("IntegerTypeExample").getOrCreate()

data = [(1, "Amit", 29), (2, "Priya", 31)]

# A list passed as `schema` must contain column names, not DataType objects.
# To pin each column's type, pair every name with its type in a StructType.
schema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
])

df = spark.createDataFrame(data, schema=schema)
df.show()
```

### 2. **`StringType`**

#### Explanation:
- **`StringType`** is used to represent string values (text) in a DataFrame.
- It maps to the string data type, which means it can store any sequence of characters.

#### Significance:
- **Flexibility**: `StringType` allows storing any text data, making it versatile for handling different kinds of information such as names, descriptions, or categorical data.
- **Human-Readable**: Text data is often used for fields that need to be human-readable, like names, addresses, or any descriptive fields.

#### Uses:
- **Text Columns**: Use `StringType` for columns that store text data, such as names, product descriptions, or any categorical fields.
- **Joining DataFrames**: When performing joins based on string columns (like matching customer names), using `StringType` ensures that the data is compared correctly.
- **Filtering**: `StringType` allows you to easily filter DataFrames based on text criteria (e.g., find all rows where the name is "Priya").

#### Example:
```python
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

spark = SparkSession.builder.appName("StringTypeExample").getOrCreate()

data = [(1, "Amit", 29), (2, "Priya", 31)]

# A list passed as `schema` must contain column names, not DataType objects.
# Here "Name" is the StringType column; the other two are integers.
schema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
])

df = spark.createDataFrame(data, schema=schema)
df.show()
```

### Summary of Significance and Uses:
- **Data Integrity**: Using `IntegerType` and `StringType` ensures that your DataFrame's schema matches the expected data types, which helps prevent errors during data processing.
- **Performance**: Properly typed data leads to better performance, especially in large-scale data processing, as Spark can optimize operations based on data types.
- **Ease of Use**: By defining data types explicitly, you make your code more readable and maintainable, as it’s clear what kind of data each column contains.

Understanding and utilizing `IntegerType` and `StringType` correctly is essential for building robust and efficient PySpark applications that handle structured data.

# Create Multi Column Spark Dataframe using Python list

In [None]:
from pyspark.sql import SparkSession

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("MultiColumnDataFrame")
    .getOrCreate()
)

# Column names, in the same order as the tuple fields below.
columns = ["ID", "Name", "Age"]

# One tuple per row.
data = [
    (1, "Amit", 29),
    (2, "Priya", 31),
    (3, "Vikram", 25),
    (4, "Neha", 40),
]

# A list of names as the schema labels the columns; types are inferred.
df = spark.createDataFrame(data, schema=columns)

# Print the rows.
df.show()


+---+------+---+
| ID|  Name|Age|
+---+------+---+
|  1|  Amit| 29|
|  2| Priya| 31|
|  3|Vikram| 25|
|  4|  Neha| 40|
+---+------+---+



# Overview of row on Dataframes

### PySpark Row:

#### Introduction to PySpark Row
- **`Row` in PySpark**: The `Row` class in PySpark is a part of the `pyspark.sql` module and represents a single record or row in a DataFrame. It is analogous to a tuple in Python but provides more flexibility and better readability when dealing with structured data.

- **Structured Data Representation**: In a DataFrame, each row is a `Row` object. This structure allows you to access and manipulate each field within a row by either position (like a tuple) or by name (like a dictionary).

- **Importance**: Understanding how to work with `Row` objects is essential when you need to interact with individual records in a DataFrame, especially when converting RDDs to DataFrames or dealing with complex data transformations.

#### Creating and Using PySpark Row

1. **Basic Creation**:
   - A `Row` can be created manually by importing the `Row` class and passing the field values.
   - Example:
     ```python
     from pyspark.sql import Row

     # Creating a Row object
     row1 = Row(ID=1, Name="Amit", Age=29)

     # Accessing the fields
     print(row1.ID)  # Output: 1
     print(row1.Name)  # Output: Amit
     print(row1.Age)  # Output: 29
     ```

2. **Accessing Row Data**:
   - **By Name**: You can access the data in a `Row` by the field name, which makes the code more readable and easier to understand.
   - **By Position**: You can also access the fields by their position, similar to how you would access elements in a tuple.

   - Example:
     ```python
     print(row1[0])  # Output: 1
     print(row1['Name'])  # Output: Amit
     ```

3. **Using Row in DataFrame**:
   - Rows are often used in DataFrames, where each record in the DataFrame is a `Row` object. This is particularly useful when converting RDDs to DataFrames.
   
   - Example:
     ```python
     from pyspark.sql import SparkSession
     from pyspark.sql import Row

     # Initialize SparkSession
     spark = SparkSession.builder.appName("RowExample").getOrCreate()

     # Sample data in the form of Rows
     data = [Row(ID=1, Name="Amit", Age=29),
             Row(ID=2, Name="Priya", Age=31),
             Row(ID=3, Name="Vikram", Age=25)]

     # Creating a DataFrame from Rows
     df = spark.createDataFrame(data)

     # Show the DataFrame
     df.show()
     ```

   **Output**:
   ```
   +---+------+---+
   | ID|  Name|Age|
   +---+------+---+
   |  1|  Amit| 29|
   |  2| Priya| 31|
   |  3|Vikram| 25|
   +---+------+---+
   ```

4. **Dynamic Fields**:
   - The `Row` class allows you to create Rows with dynamic fields, making it flexible for use cases where the structure of the data isn't known upfront.
   - Example:
     ```python
     person = Row("Name", "Age")
     row2 = person("Neha", 40)

     print(row2.Name)  # Output: Neha
     print(row2.Age)  # Output: 40
     ```

5. **Converting RDD to DataFrame using Row**:
   - When working with RDDs (Resilient Distributed Datasets), you can convert them to DataFrames by mapping each RDD element to a `Row`.
   - Example:
     ```python
     # Sample RDD data
     rdd = spark.sparkContext.parallelize([
         (1, "Amit", 29),
         (2, "Priya", 31),
         (3, "Vikram", 25)
     ])

     # Convert RDD to DataFrame using Row
     df = rdd.map(lambda x: Row(ID=x[0], Name=x[1], Age=x[2])).toDF()

     # Show the DataFrame
     df.show()
     ```

   **Output**:
   ```
   +---+------+---+
   | ID|  Name|Age|
   +---+------+---+
   |  1|  Amit| 29|
   |  2| Priya| 31|
   |  3|Vikram| 25|
   +---+------+---+
   ```

6. **Working with Nested Structures**:
   - `Row` can also handle nested structures, making it possible to represent complex data hierarchies.
   - Example:
     ```python
     address = Row(City="Mumbai", Zip="400001")
     person = Row(ID=1, Name="Amit", Age=29, Address=address)

     print(person.Address.City)  # Output: Mumbai
     print(person.Address.Zip)  # Output: 400001
     ```

#### Summary of Key Points:
- **Flexibility**: PySpark’s `Row` is highly flexible, allowing both positional and named access to data. This is particularly useful when converting RDDs to DataFrames or working with complex data structures.
  
- **Integration with DataFrames**: `Row` is integral to the functioning of DataFrames in PySpark, where each record in a DataFrame is represented as a `Row`.

- **Dynamic and Nested Data**: Rows can be dynamically created with variable fields and can also handle nested data structures, making them suitable for complex data representations.

- **Conversion of RDDs to DataFrames**: The `Row` object plays a crucial role in converting RDDs to DataFrames, providing a bridge between the unstructured world of RDDs and the structured world of DataFrames.

#### Practical Examples Summary:
1. **Creating and Accessing Rows**:
   - Created a `Row` and accessed its fields by both name and position.
   
2. **Using Rows in DataFrames**:
   - Demonstrated how to create a DataFrame from a list of `Row` objects and access the data.

3. **Dynamic Row Creation**:
   - Showed how to create `Row` objects with dynamic fields.

4. **Converting RDDs to DataFrames**:
   - Converted an RDD to a DataFrame using `Row`, illustrating the importance of `Row` in PySpark workflows.

5. **Nested Rows**:
   - Demonstrated handling nested data structures within a `Row`, emphasizing the flexibility of the `Row` class in managing complex data.

Understanding and effectively using `Row` in PySpark is crucial for anyone working with PySpark, as it enables you to handle structured data more effectively and leverage the full power of DataFrames.

In [None]:
from pyspark.sql import Row, SparkSession

# A Row built from keyword arguments.
row1 = Row(Id=1, Name="Amit", Age=29)

# Read the fields by attribute, then by key, then by position.
for field_value in (row1.Id, row1.Name, row1.Age, row1['Name'], row1[0]):
    print(field_value)


# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("RowExample")
    .getOrCreate()
)

# One Row per record.
data = [
    Row(ID=person_id, Name=name, Age=age)
    for person_id, name, age in [(1, "Amit", 29), (2, "Priya", 31), (3, "Vikram", 25)]
]

# Column names are taken from the Row field names.
df = spark.createDataFrame(data)

# Print the rows.
df.show()

1
Amit
29
Amit
1
+---+------+---+
| ID|  Name|Age|
+---+------+---+
|  1|  Amit| 29|
|  2| Priya| 31|
|  3|Vikram| 25|
+---+------+---+



In [None]:
# convert RDD to Dataframe

# Distribute three (ID, Name, Age) tuples as an RDD.
rdd = spark.sparkContext.parallelize([
    (1, "Amit", 29),
    (2, "Priya", 31),
    (3, "Vikram", 25),
])

# No collect() before the conversion: a bare expression in the middle of a cell
# is never displayed, so it would only trigger an extra Spark job.

# Give every tuple named fields by mapping it to a Row.
rdd_rows = rdd.map(lambda x: Row(ID=x[0], Name=x[1], Age=x[2]))

# Column names come from the Row field names.
df = spark.createDataFrame(rdd_rows)
df.show()


+---+------+---+
| ID|  Name|Age|
+---+------+---+
|  1|  Amit| 29|
|  2| Priya| 31|
|  3|Vikram| 25|
+---+------+---+



# Convert List of Lists into Spark Dataframe using Row

In [None]:
from pyspark.sql import Row, SparkSession

# Each inner list is one row: [user_id, user_first_name].
users_list = [[1, 'Scott'], [2, 'Donald'], [3, 'Mickey'], [4, 'Elvis']]

# Schema string of "name type" pairs, in the same order as the inner lists.
users_schema = 'user_id int, user_first_name string'

df = spark.createDataFrame(users_list, schema=users_schema)
df.show()

+-------+---------------+
|user_id|user_first_name|
+-------+---------------+
|      1|          Scott|
|      2|         Donald|
|      3|         Mickey|
|      4|          Elvis|
+-------+---------------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Initialize SparkSession
spark = SparkSession.builder.appName("ListToDataFrame").getOrCreate()

# Sample List of Lists
data = [
    [1, "Amit", 29],
    [2, "Priya", 31],
    [3, "Vikram", 25],
    [4, "Neha", 40]
]

# Specify column names (single source of truth for the Row field names)
columns = ["ID", "Name", "Age"]

# Row(*names) returns a factory; calling it with values yields a Row
# whose fields carry those names.
PersonRow = Row(*columns)

# Convert List of Lists to a List of Row objects
row_data = [PersonRow(*row) for row in data]

# Create a DataFrame from the List of Row objects
df = spark.createDataFrame(row_data)

# Show the DataFrame
df.show()


+---+------+---+
| ID|  Name|Age|
+---+------+---+
|  1|  Amit| 29|
|  2| Priya| 31|
|  3|Vikram| 25|
|  4|  Neha| 40|
+---+------+---+



# Convert List of tuples into Spark Dataframe using Row

In [None]:
from pyspark.sql import Row, SparkSession

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("TupleToDataFrame")
    .getOrCreate()
)

# Field names, in the same order as the tuple elements below.
columns = ["ID", "Name", "age"]

# One tuple per record.
data = [
    (1, "Amit", 29),
    (2, "Priya", 31),
    (3, "Vikram", 25),
    (4, "Neha", 40),
]

# Pair every name with its value and hand the pairs to Row as keyword arguments.
row_data = [Row(**dict(zip(columns, record))) for record in data]

# Column names come from the Row field names.
df = spark.createDataFrame(row_data)

# Print the rows.
df.show()


+---+------+---+
| ID|  Name|age|
+---+------+---+
|  1|  Amit| 29|
|  2| Priya| 31|
|  3|Vikram| 25|
|  4|  Neha| 40|
+---+------+---+



# Convert List of dict into Spark Dataframe using Row

In [None]:
from pyspark.sql import Row, SparkSession

# Reuse the running session if there is one, otherwise start it.
spark = (
    SparkSession.builder
    .appName("DictToDataFrame")
    .getOrCreate()
)

# One dictionary per record; the keys become the field names.
data = [
    {"ID": 1, "Name": "Amit", "Age": 29},
    {"ID": 2, "Name": "Priya", "Age": 31},
    {"ID": 3, "Name": "Vikram", "Age": 25},
    {"ID": 4, "Name": "Neha", "Age": 40},
]

# Unpack each dictionary into Row keyword arguments.
row_data = list(map(lambda record: Row(**record), data))

# Column names come from the Row field names.
df = spark.createDataFrame(row_data)

# Print the rows.
df.show()


+---+------+---+
| ID|  Name|Age|
+---+------+---+
|  1|  Amit| 29|
|  2| Priya| 31|
|  3|Vikram| 25|
|  4|  Neha| 40|
+---+------+---+



Row Mapping:

The list comprehension `[Row(**row) for row in data]` uses `**row` to unpack each dictionary into keyword arguments for `Row`, creating one `Row` object per dictionary.

# Basic Data Types in Spark

### **Basic Data Types in Spark: Comprehensive Notes**

Apache Spark is a powerful distributed computing framework designed for processing large-scale data. Understanding the basic data types in Spark is crucial for effectively managing and manipulating data in Spark DataFrames and RDDs (Resilient Distributed Datasets). These data types form the foundation for working with structured data in Spark SQL and DataFrames.

#### **1. Overview of Spark Data Types**
- **Categories**: Spark data types can be broadly categorized into primitive types (e.g., integers, floats, strings), complex types (e.g., arrays, maps), and structured types (e.g., structs).
- **Usage**: These data types are used to define the schema of DataFrames, ensuring that data is stored, processed, and manipulated correctly.
- **Interoperability**: Spark’s data types closely resemble SQL data types, making it easier to integrate with relational databases and work with structured data.

#### **2. Primitive Data Types**
Primitive data types are the basic building blocks in Spark and represent single values like integers, floating-point numbers, and strings.

##### **a. IntegerType**
- **Description**: Represents 32-bit signed integers.
- **Range**: From -2,147,483,648 to 2,147,483,647.
- **Usage**: Used for columns that require whole numbers without decimal points.
- **Example**: Age, number of items, etc.
- **Code Example**:
  ```python
  from pyspark.sql.types import IntegerType

  # Defining a column with IntegerType
  age_column = IntegerType()
  ```

##### **b. LongType**
- **Description**: Represents 64-bit signed integers.
- **Range**: From -9,223,372,036,854,775,808 to 9,223,372,036,854,775,807.
- **Usage**: Used for columns that require large whole numbers.
- **Example**: Big data counts, large monetary values.
- **Code Example**:
  ```python
  from pyspark.sql.types import LongType

  # Defining a column with LongType
  large_number_column = LongType()
  ```

##### **c. FloatType**
- **Description**: Represents 32-bit floating-point numbers.
- **Precision**: Approximately 7 decimal digits of precision.
- **Usage**: Used for columns that require decimal numbers but with limited precision.
- **Example**: Measurements, ratings, etc.
- **Code Example**:
  ```python
  from pyspark.sql.types import FloatType

  # Defining a column with FloatType
  rating_column = FloatType()
  ```

##### **d. DoubleType**
- **Description**: Represents 64-bit floating-point numbers.
- **Precision**: Approximately 15 decimal digits of precision.
- **Usage**: Used for columns that require high precision with decimal numbers.
- **Example**: Scientific calculations, precise financial data.
- **Code Example**:
  ```python
  from pyspark.sql.types import DoubleType

  # Defining a column with DoubleType
  precise_measurement_column = DoubleType()
  ```

##### **e. StringType**
- **Description**: Represents string values (text).
- **Usage**: Used for columns that require text data.
- **Example**: Names, descriptions, categories, etc.
- **Code Example**:
  ```python
  from pyspark.sql.types import StringType

  # Defining a column with StringType
  name_column = StringType()
  ```

##### **f. BooleanType**
- **Description**: Represents boolean values (`True` or `False`).
- **Usage**: Used for columns that require binary decisions or flags.
- **Example**: IsActive status, boolean checks.
- **Code Example**:
  ```python
  from pyspark.sql.types import BooleanType

  # Defining a column with BooleanType
  active_status_column = BooleanType()
  ```

##### **g. ByteType**
- **Description**: Represents 8-bit signed integers.
- **Range**: From -128 to 127.
- **Usage**: Used for very small integer values.
- **Example**: Small counters, binary data in integer form.
- **Code Example**:
  ```python
  from pyspark.sql.types import ByteType

  # Defining a column with ByteType
  small_counter_column = ByteType()
  ```

##### **h. ShortType**
- **Description**: Represents 16-bit signed integers.
- **Range**: From -32,768 to 32,767.
- **Usage**: Used for columns that require small integer values but more than `ByteType`.
- **Example**: Smaller numerical ranges, minor levels.
- **Code Example**:
  ```python
  from pyspark.sql.types import ShortType

  # Defining a column with ShortType
  minor_level_column = ShortType()
  ```

##### **i. BinaryType**
- **Description**: Represents binary data.
- **Usage**: Used for columns that store binary data, such as images or encrypted data.
- **Example**: Image files, encrypted strings, serialized data.
- **Code Example**:
  ```python
  from pyspark.sql.types import BinaryType

  # Defining a column with BinaryType
  image_data_column = BinaryType()
  ```

##### **j. DecimalType**
- **Description**: Represents arbitrary-precision signed decimal numbers.
- **Usage**: Used for columns that require precise decimal numbers with a specific scale and precision.
- **Example**: Financial data, precise measurements.
- **Code Example**:
  ```python
  from pyspark.sql.types import DecimalType

  # Defining a column with DecimalType (precision = 10, scale = 2)
  financial_data_column = DecimalType(10, 2)
  ```

#### **3. Complex Data Types**
Complex data types in Spark are used to represent collections or structured data within a single column.

##### **a. ArrayType**
- **Description**: Represents a column of arrays (lists).
- **Usage**: Used for columns that need to store multiple values in a single field.
- **Example**: Tags, list of features, etc.
- **Code Example**:
  ```python
  from pyspark.sql.types import (
      ArrayType, IntegerType, StringType, StructField, StructType
  )

  # Defining a column with ArrayType containing strings
  tags_column = ArrayType(StringType())

  # Each row carries a list of hobbies in its third field
  data = [
      (1, "Amit", ["Reading", "Traveling", "Music"]),
      (2, "Priya", ["Cooking", "Yoga"]),
      (3, "Vikram", ["Gaming", "Hiking", "Photography"]),
      (4, "Neha", ["Dancing", "Painting"])
  ]

  # The Hobbies column uses the ArrayType defined above
  schema = StructType([
      StructField("ID", IntegerType(), True),
      StructField("Name", StringType(), True),
      StructField("Hobbies", tags_column, True)
  ])
  ```

##### **b. MapType**
- **Description**: Represents a column of key-value pairs (maps).
- **Usage**: Used for columns that store key-value pairs within a single field.
- **Example**: Metadata, configuration settings.
- **Code Example**:

  ```python
  from pyspark.sql.types import (
      IntegerType, MapType, StringType, StructField, StructType
  )

  # Defining a column with MapType with String keys and Integer values
  metadata_column = MapType(StringType(), IntegerType())

  # Each row carries a subject -> score dictionary in its third field
  data = [
      (1, "Amit", {"Math": 85, "Science": 90}),
      (2, "Priya", {"Math": 95, "English": 88}),
      (3, "Vikram", {"Science": 75, "History": 80}),
      (4, "Neha", {"English": 92, "Math": 89})
  ]

  # The Scores column uses the MapType defined above
  schema = StructType([
      StructField("ID", IntegerType(), True),
      StructField("Name", StringType(), True),
      StructField("Scores", metadata_column, True)
  ])
  ```



##### **c. StructType**
- **Description**: Represents a column of nested fields (like a record or struct).
- **Usage**: Used for columns that store a complex structure with multiple fields.
- **Example**: Address (with fields like street, city, zip).
- **Code Example**:

  ```python
  from pyspark.sql.types import StructType, StructField, StringType, IntegerType

  # Defining a column with StructType (nested fields).
  # The zip code is a string so that its type matches the sample data below.
  address_column = StructType([
      StructField("Street", StringType(), True),
      StructField("City", StringType(), True),
      StructField("ZipCode", StringType(), True)
  ])

  # Each row carries a (street, city, zip code) tuple in its third field
  data = [
      (1, "Amit", ("123 Main St", "Mumbai", "400001")),
      (2, "Priya", ("456 Park Ave", "Delhi", "110001")),
      (3, "Vikram", ("789 Hill Rd", "Bangalore", "560001")),
      (4, "Neha", ("101 Lake View", "Chennai", "600001"))
  ]

  # The Address column uses the nested StructType defined above
  schema = StructType([
      StructField("ID", IntegerType(), True),
      StructField("Name", StringType(), True),
      StructField("Address", address_column, True)
  ])
  ```



#### **4. Special Data Types**
Special data types in Spark are used for specific scenarios such as dates and timestamps.



##### **a. DateType**
- **Description**: Represents a date without a time component.
- **Usage**: Used for columns that store dates.
- **Example**: Birthdates, event dates.
- **Code Example**:
  ```python
  from datetime import date

  from pyspark.sql.types import (
      DateType, IntegerType, StringType, StructField, StructType
  )

  # Defining a column with DateType
  birthdate_column = DateType()

  # DateType columns take datetime.date values; plain strings such as
  # "2023-01-01" are rejected when the schema is verified.
  data = [
      (1, "Amit", date(2023, 1, 1)),
      (2, "Priya", date(2023, 2, 15)),
      (3, "Vikram", date(2023, 3, 20)),
      (4, "Neha", date(2023, 4, 25))
  ]

  schema = StructType([
      StructField("ID", IntegerType(), True),
      StructField("Name", StringType(), True),
      StructField("Birthdate", birthdate_column, True)
  ])

  ```


##### **b. TimestampType**
- **Description**: Represents a timestamp (date and time).
- **Usage**: Used for columns that store date and time.
- **Example**: Event logs, transaction times.
- **Code Example**:
  ```python
  from datetime import datetime

  from pyspark.sql.types import (
      IntegerType, StringType, StructField, StructType, TimestampType
  )

  # Defining a column with TimestampType
  event_timestamp_column = TimestampType()

  # TimestampType columns take datetime.datetime values; plain strings such as
  # "2023-01-01 10:00:00" are rejected when the schema is verified.
  data = [
      (1, "Amit", datetime(2023, 1, 1, 10, 0, 0)),
      (2, "Priya", datetime(2023, 2, 15, 14, 30, 0)),
      (3, "Vikram", datetime(2023, 3, 20, 9, 15, 0)),
      (4, "Neha", datetime(2023, 4, 25, 18, 45, 0))
  ]

  schema = StructType([
      StructField("ID", IntegerType(), True),
      StructField("Name", StringType(), True),
      StructField("EventTime", event_timestamp_column, True)
  ])
 ```


#### **5. NullType**
- **Description**: Represents a null value or unknown type.
- **Usage**: Generally used for internal purposes or as a placeholder in some cases.
- **Example**: Undefined fields, default values during schema evolution.
- **Code Example**:
  ```python
  from pyspark.sql.types import NullType

  # Defining a column with NullType (rarely used directly)
  undefined_column = NullType()
  ```

#### **6. Type Conversion in Spark**
- **Automatic Type Conversion**: Spark can automatically infer types when reading data from sources like CSV, Parquet, or JSON, and can cast data types during operations where necessary.
- **Explicit Type Casting**: You can cast columns from one type to another using the `cast()` function.
  - **Example**:
    ```python
    df = df.withColumn("Age", df["Age"].cast(IntegerType()))
    ```

#### **7. Summary of Spark Data Types**
- **Primitive Types**: Handle single values (e.g., IntegerType, StringType).
- **Complex Types**: Handle collections or nested data (e.g., ArrayType, MapType, StructType).
- **Special Types**: Handle dates, timestamps, and nulls (e.g., DateType, TimestampType, NullType).
- **Usage**: Understanding these data types helps in defining schemas, performing data transformations, and optimizing data processing in Spark.

#### **8. Importance of Choosing the Right Data Type**
- **Performance**: Proper data types ensure efficient memory usage and faster computations.
- **Accuracy**: Choosing the right type prevents data loss or inaccuracies (e.g., using `DecimalType` for financial calculations).
- **Interoperability**: Proper types ensure seamless integration with databases and external systems.

By understanding and effectively using Spark’s data types, you can better manage your data, write more efficient queries, and develop more robust data processing pipelines. Whether you are defining schemas for DataFrames or working with RDDs, knowing these data types is fundamental to leveraging the full power of Spark.

In [None]:
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, IntegerType
from pyspark.sql.types import MapType

# --- ArrayType: a list of strings per row ---

# Defining a column with ArrayType containing strings
tags_column = ArrayType(StringType())
data = [
  (1, "Amit", ["Reading", "Traveling", "Music"]),
  (2, "Priya", ["Cooking", "Yoga"]),
  (3, "Vikram", ["Gaming", "Hiking", "Photography"]),
  (4, "Neha", ["Dancing", "Painting"])
]
schema = StructType([
  StructField("ID", IntegerType(), True),
  StructField("Name", StringType(), True),
  StructField("Hobbies", tags_column, True)
])

df = spark.createDataFrame(data, schema)
# truncate=False: the default cuts cells at 20 characters, which hides most of
# the array contents this example is meant to show.
df.show(truncate=False)

# --- MapType: a string -> int dictionary per row ---

# Defining a column with MapType with String keys and Integer values
metadata_column = MapType(StringType(), IntegerType())
data = [
  (1, "Amit", {"Math": 85, "Science": 90}),
  (2, "Priya", {"Math": 95, "English": 88}),
  (3, "Vikram", {"Science": 75, "History": 80}),
  (4, "Neha", {"English": 92, "Math": 89})
]
schema = StructType([
  StructField("ID", IntegerType(), True),
  StructField("Name", StringType(), True),
  StructField("Scores", metadata_column, True)
])

# `df` now refers to the Scores frame; the Hobbies frame above is no longer reachable.
df = spark.createDataFrame(data, schema)
df.show(truncate=False)
display(df)

+---+------+--------------------+
| ID|  Name|             Hobbies|
+---+------+--------------------+
|  1|  Amit|[Reading, Traveli...|
|  2| Priya|     [Cooking, Yoga]|
|  3|Vikram|[Gaming, Hiking, ...|
|  4|  Neha| [Dancing, Painting]|
+---+------+--------------------+

+---+------+--------------------+
| ID|  Name|              Scores|
+---+------+--------------------+
|  1|  Amit|{Science -> 90, M...|
|  2| Priya|{Math -> 95, Engl...|
|  3|Vikram|{Science -> 75, H...|
|  4|  Neha|{Math -> 89, Engl...|
+---+------+--------------------+



DataFrame[ID: int, Name: string, Scores: map<string,int>]

In [None]:
# Display whatever `df` currently refers to (presumably the Scores frame built in
# the previous cell - this relies on that cell having run). In this environment
# display() rendered only the schema summary, not the rows.
display(df)

DataFrame[ID: int, Name: string, Scores: map<string,int>]

In [None]:
# A Row can hold another Row as one of its fields.
address = Row(City="Mumbai", Zip="400001")
person = Row(ID=1, Name="Amit", Age=29, Address=address)

# Step into the nested Row once, then read its fields by attribute.
nested_address = person.Address
print(nested_address.City)  # Output: Mumbai
print(nested_address.Zip)  # Output: 400001

Mumbai
400001
